diff --git a/.bzrignore b/.bzrignore index b0b87a34e6..f14ff947f6 100644 --- a/.bzrignore +++ b/.bzrignore @@ -39,3 +39,45 @@ recipes/.git recipes/.gitignore recipes/README recipes/katalog_egazeciarz.recipe +recipes/tv_axnscifi.recipe +recipes/tv_comedycentral.recipe +recipes/tv_discoveryscience.recipe +recipes/tv_foxlife.recipe +recipes/tv_fox.recipe +recipes/tv_hbo.recipe +recipes/tv_kinopolska.recipe +recipes/tv_nationalgeographic.recipe +recipes/tv_polsat2.recipe +recipes/tv_polsat.recipe +recipes/tv_tv4.recipe +recipes/tv_tvn7.recipe +recipes/tv_tvn.recipe +recipes/tv_tvp1.recipe +recipes/tv_tvp2.recipe +recipes/tv_tvphd.recipe +recipes/tv_tvphistoria.recipe +recipes/tv_tvpkultura.recipe +recipes/tv_tvppolonia.recipe +recipes/tv_tvpuls.recipe +recipes/tv_viasathistory.recipe +recipes/icons/tv_axnscifi.png +recipes/icons/tv_comedycentral.png +recipes/icons/tv_discoveryscience.png +recipes/icons/tv_foxlife.png +recipes/icons/tv_fox.png +recipes/icons/tv_hbo.png +recipes/icons/tv_kinopolska.png +recipes/icons/tv_nationalgeographic.png +recipes/icons/tv_polsat2.png +recipes/icons/tv_polsat.png +recipes/icons/tv_tv4.png +recipes/icons/tv_tvn7.png +recipes/icons/tv_tvn.png +recipes/icons/tv_tvp1.png +recipes/icons/tv_tvp2.png +recipes/icons/tv_tvphd.png +recipes/icons/tv_tvphistoria.png +recipes/icons/tv_tvpkultura.png +recipes/icons/tv_tvppolonia.png +recipes/icons/tv_tvpuls.png +recipes/icons/tv_viasathistory.png diff --git a/Changelog.yaml b/Changelog.yaml index f4c5e25cb4..129af0afd5 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,144 @@ # new recipes: # - title: +- version: 0.9.7 + date: 2012-11-23 + + new features: + - title: "Edit metadata dialog: Show the size of the current book cover in the edit metadata dialog." + tickets: [1079781] + + - title: "Get Books: Allow easy searching by title and author in addition to any keyword, to prevent large numbers of spurious matches." 
+ + - title: "An option to automatically convert any added book to the current output format, found under Preferences->Adding books" + + - title: "E-book viewer: Allow viewing tables in a separate popup window by right clicking on the table and selecting 'View table'. Useful for reference books that have lots of large tables." + tickets: [1080710] + + - title: "Catalogs: Add the current library name as an available field when generating catalogs in csv/xml format." + tickets: [1078422] + + - title: "Enable colored text in the output from the command line tools on windows" + + - title: "E-book viewer: Add an option to hide the help message when entering full screen mode" + + - title: "E-book viewer: Add an option to always start the viewer in full screen mode" + + - title: "E-book viewer: Add many more controls to the context menu, particularly useful in full screen mode" + + - title: "E-book viewer: Allow easy searching of the selected word or phrase in google via the context menu" + + - title: "Add a new type of FileType plugin, postimport, that runs after a book has been added to the database." + + - title: "Get Books: Remove Gandalf store, add Publio store. Update the Legimi store plugin for website changes" + + bug fixes: + - title: "Conversion: Correctly handle values of left and right for the deprecated align attribute of images, mapping them to the CSS float property instead of to text-align." + tickets: [1081094] + + - title: "MOBI Output: When generating joint MOBI6/KF8 files do not set incorrect display CSS values for tables in the KF8 part" + + - title: "Connect to iTunes: Ignore AAC audio files." + tickets: [1081096] + + - title: "E-book viewer: Fix restoring from fullscreen not respecting maximized window state" + + - title: "Fix rows in the device books view sometimes being too high" + + - title: "Catalogs: Fixed a problem occurring when merging comments with a custom field whose type is a list." 
+ + - title: "Linux binary: Use exec in the wrapper shell scripts that are used to set env vars and launch calibre utilities." + tickets: [1077884] + + - title: "E-book viewer: Fix blank pages after every page when viewing some comic files in paged mode" + + - title: "E-book viewer: When printing, respect the specified page range." + tickets: [1074220] + + - title: "Font subsetting: Parse the GSUB table for glyph substitution rules and do not remove any glyphs that could act as substitutes. Keep zero length glyphs like the glyphs for non printable characters when subsetting TrueType outlines." + + - title: "Smarten punctuation: Fix self closing script tags causing smarten punctuation to fail" + + + improved recipes: + - Arguments and facts + - Business Standard + - The New Yorker + + new recipes: + - title: Various Czech and Hungarian news sources + author: bubak + + - title: Various Polish recipes + author: Artur Stachecki + + - title: Buchreport + author: a.peter + + - title: Red Voltaire + author: atordo + + - title: Autosport + author: Mr Stefan + + - title: House News + author: Eddie Lau + +- version: 0.9.6 + date: 2012-11-10 + + new features: + - title: "Experimental support for subsetting fonts" + description: "Subsetting a font means reducing the font to contain only the glyphs for the text actually present in the book. This can easily halve the size of the font. calibre can now do this for all embedded fonts during a conversion. Turn it on via the 'Subset all embedded fonts' option under the Look & Feel section of the conversion dialog. calibre can subset both TrueType and OpenType fonts. Note that this code is very new and likely has bugs, so please check the output if you turn on subsetting. The conversion log will have info about the subsetting operations." + type: major + + - title: "EPUB Input: Try to workaround EPUBs that have missing or damaged ZIP central directories. 
calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption." + + - title: "Allow using identifiers in save to disk templates." + tickets: [1074623] + + - title: "calibredb: Add an option to not notify the GUI" + + - title: "Catalogs: Fix long tags causing catalog generation to fail on windows. Add the ability to cross-reference authors, i.e. to relist the authors for a book with multiple authors separately." + tickets: [1074931] + + - title: "Edit metadata dialog: Add a clear tags button to remove all tags with a single click" + + - title: "Add search to the font family chooser dialog" + + bug fixes: + - title: "Windows: Fix a long standing bug in the device eject code that for some reason only manifested in 0.9.5." + tickets: [1075782] + + - title: "Get Books: Fix Amazon stores, Google Books store and libri.de" + + - title: "Kobo driver: More fixes for on device book matching, and list books as being on device even if the Kobo has not yet indexed them. Also some performance improvements." + tickets: [1069617] + + - title: "EPUB Output: Remove duplicate id and name attributes to eliminate pointless noise from the various epub check utilities" + + - title: "Ask for confirmation before removing plugins" + + - title: "Fix bulk convert queueing dialog becoming very long if any of the books have a very long title." 
+ tickets: [1076191] + + - title: "Fix deleting custom column tags like data from the Tag browser not updating the last modified timestamp for affected books" + tickets: [1075476] + + - title: "When updating a previously broken plugin, do not show an error message because the previous version of the plugin cannot be loaded" + + - title: "Fix regression that broke the Template Editor" + + improved recipes: + - Various updated Polish recipes + - London Review of Books + - Yemen Times + + new recipes: + - title: "Various Polish news sources" + author: Artur Stachecki + + - version: 0.9.5 date: 2012-11-02 diff --git a/manual/faq.rst b/manual/faq.rst index b434927c04..109aff440d 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -649,20 +649,24 @@ If it still wont launch, start a command prompt (press the windows key and R; th Post any output you see in a help message on the `Forum `_. -|app| freezes when I click on anything? +|app| freezes/crashes occasionally? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are three possible things I know of, that can cause this: - * You recently connected an external monitor or TV to your computer. In this case, whenever |app| opens a new window like the edit metadata window or the conversion dialog, it appears on the second monitor where you dont notice it and so you think |app| has frozen. Disconnect your second monitor and restart calibre. + * You recently connected an external monitor or TV to your computer. In + this case, whenever |app| opens a new window like the edit metadata + window or the conversion dialog, it appears on the second monitor where + you don't notice it and so you think |app| has frozen. Disconnect your + second monitor and restart calibre. - * You are using a Wacom branded mouse. There is an incompatibility between Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom mouse. + * You are using a Wacom branded mouse. 
There is an incompatibility between + Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom + mouse. * If you use RoboForm, it is known to cause |app| to crash. Add |app| to - the blacklist of programs inside RoboForm to fix this. - - * Sometimes if some software has installed lots of new files in your fonts folder, |app| can crash until it finishes indexing them. Just start |app|, then leave it alone for about 20 minutes, without clicking on anything. After that you should be able to use |app| as normal. - + the blacklist of programs inside RoboForm to fix this. Or uninstall + RoboForm. |app| is not starting on OS X? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -721,8 +725,8 @@ You can switch |app| to using a backed up library folder by simply clicking the If you want to backup the |app| configuration/plugins, you have to backup the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore. -How do I use purchased EPUB books with |app|? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use purchased EPUB books with |app| (or what do I do with .acsm files)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Most purchased EPUB books have `DRM `_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your ebook reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" ebook. 
The ebook file will be stored in the folder "My Digital Editions", from where you can add it to |app|. I am getting a "Permission Denied" error? diff --git a/recipes/aif_ru.recipe b/recipes/aif_ru.recipe index b5d6015d0c..4e018203da 100644 --- a/recipes/aif_ru.recipe +++ b/recipes/aif_ru.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = '2010 - 2012, Darko Miletic ' ''' www.aif.ru ''' @@ -19,12 +19,19 @@ class AIF_ru(BasicNewsRecipe): encoding = 'cp1251' language = 'ru' publication_type = 'magazine' - extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif} ' - keep_only_tags = [dict(name='div',attrs={'id':'inner'})] + masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png' + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif} + img{display: block} + """ + keep_only_tags = [ + dict(name='div',attrs={'class':['content-header', 'zoom']}) + ,dict(name='div',attrs={'id':'article-text'}) + ] remove_tags = [ - dict(name=['iframe','object','link','base','input','img']) - ,dict(name='div',attrs={'class':'photo'}) - ,dict(name='p',attrs={'class':'resizefont'}) + dict(name=['iframe','object','link','base','input','meta']) + ,dict(name='div',attrs={'class':'in-topic'}) ] feeds = [(u'News', u'http://www.aif.ru/rss/all.php')] diff --git a/recipes/aktualne.cz.recipe b/recipes/aktualne.cz.recipe new file mode 100644 index 0000000000..cd2dcc5f09 --- /dev/null +++ b/recipes/aktualne.cz.recipe @@ -0,0 +1,69 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class aktualneRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'aktualne.cz' + publisher = u'Centrum holdings' + description = 'aktuálně.cz' 
+ oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'), + (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'), + (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'), + (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'), + (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'), + (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php') + ] + + + language = 'cs' + cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']}) + filter_regexps = [r'img.aktualne.centrum.cz'] + remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}), + dict(name='div', attrs={'class':['box1', 'svazane-tagy']}), + dict(name='div', attrs={'class':'itemcomment id0'}), + dict(name='div', attrs={'class':'hlavicka'}), + dict(name='div', attrs={'class':'hlavni-menu'}), + dict(name='div', attrs={'class':'top-standard-brand-obal'}), + dict(name='div', attrs={'class':'breadcrumb'}), + dict(name='div', attrs={'id':'start-standard'}), + dict(name='div', attrs={'id':'forum'}), + dict(name='span', attrs={'class':'akce'}), + dict(name='span', attrs={'class':'odrazka vetsi'}), + dict(name='div', attrs={'class':'boxP'}), + dict(name='div', attrs={'class':'box2'})] + preprocess_regexps = [ + (re.compile(r'
'), + (re.compile(r'
')] + + keep_only_tags = [] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + def encoding(self, source): + if source.newurl.find('blog.aktualne') >= 0: + enc = 'utf-8' + else: + enc = 'iso-8859-2' + self.log.debug('Called encoding ' + enc + " " + str(source.newurl)) + return source.decode(enc, 'replace') + diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe new file mode 100644 index 0000000000..c2576191dd --- /dev/null +++ b/recipes/antyweb.recipe @@ -0,0 +1,48 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class AntywebRecipe(BasicNewsRecipe): + encoding = 'utf-8' + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + title = u'Antyweb' + category = u'News' + description = u'Blog o internecie i nowych technologiach' + cover_url='' + remove_empty_feeds= True + auto_cleanup = False + no_stylesheets=True + use_embedded_content = False + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'})) + + + remove_tags =[] + remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'})) + remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'})) + remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'})) + + + 
def preprocess_html(self, soup):
    """Flatten text-only hyperlinks in *soup* and return the same soup.

    Every ``<a>`` element whose ``.string`` is set is replaced by that
    string, so the link text is kept while the anchor itself disappears.
    Anchors whose ``.string`` is ``None`` are left untouched.
    """
    for anchor in soup.findAll('a'):
        label = anchor.string
        if label is None:
            continue
        anchor.replaceWith(label)
    return soup
+ masthead_url='http://cdn.images.autosport.com/asdotcom.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'news_headline'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_author'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_date'})) + keep_only_tags.append(dict(name = 'p')) + + feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')] diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe new file mode 100644 index 0000000000..8a68d844b3 --- /dev/null +++ b/recipes/bankier_pl.recipe @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +bankier.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class bankier(BasicNewsRecipe): + title = u'Bankier.pl' + __author__ = 'teepel ' + language = 'pl' + description ='Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.' 
def print_version(self, url):
    """Return the printer-friendly bankier.pl URL for the article at *url*.

    The article id is whatever follows the last ``-`` in the final path
    segment once its file extension is removed, e.g.
    ``.../wiadomosc/Jakis-tytul-2671234.html`` gives ``2671234``.

    The previous implementation took the third ``.``-separated chunk of
    the whole URL, which only works when the host has exactly two dots
    before the path and the slug itself contains no dot. Parsing the last
    path segment gives the same answer for those URLs and also handles
    hosts without ``www.`` and slugs such as ``...-o-1.5-proc-2671234.html``.
    """
    # A query string or fragment must not take part in the id lookup.
    path = url.split('#', 1)[0].split('?', 1)[0]
    filename = path.rstrip('/').rpartition('/')[2]
    # rpartition yields '' as the head when there is no extension at all.
    stem = filename.rpartition('.')[0] or filename
    article_id = stem.rpartition('-')[2]
    return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + article_id
u'http://www.blesk.cz/rss/1'), + (u'Sex a tabu', u'http://www.blesk.cz/rss/2'), + (u'Celebrity', u'http://www.blesk.cz/rss/5'), + (u'Cestování', u'http://www.blesk.cz/rss/12') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['boxContent']}) + remove_tags_after = dict(name='div', attrs={'class':['artAuthors']}) + remove_tags = [dict(name='div', attrs={'class':['link_clanek']}), + dict(name='div', attrs={'id':['partHeader']}), + dict(name='div', attrs={'id':['top_bottom_box', 'lista_top']})] + preprocess_regexps = [(re.compile(r'
class blognexto(BasicNewsRecipe):
    """Recipe for the BLOG.NEXTO.pl feed served through FeedBurner."""

    # --- metadata ---
    title = 'BLOG.NEXTO.pl'
    # NOTE(review): the trailing space suggests an e-mail address was
    # stripped after the name -- confirm against the upstream file.
    __author__ = 'MrStefan '
    language = 'pl'
    description = 'o e-publikacjach prawie wszystko'
    masthead_url = 'http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg'

    # --- feed selection ---
    feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')]
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # --- page clean-up ---
    remove_javascript = True
    no_stylesheets = True

    # Only the main content container is kept ...
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'content'}},
    ]

    # ... and these elements are removed from it.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'comment-cloud'}},
        {'name': 'p', 'attrs': {'class': 'post-date1'}},
        {'name': 'div', 'attrs': {'class': 'fb-like'}},
        {'name': 'div', 'attrs': {'class': 'tags'}},
        {'name': 'div', 'attrs': {'class': 'postnavi'}},
        {'name': 'div', 'attrs': {'class': 'commments-box'}},
        {'name': 'div', 'attrs': {'id': 'respond'}},
    ]
class brewiarz(BasicNewsRecipe):
    """Download the Liturgy of the Hours texts published on brewiarz.pl.

    ``parse_index`` builds one feed per day for ``next_days`` days starting
    today. Each day page is scraped by ``parse_pages``; when that finds
    nothing, the sub-pages returned by ``get_sectors`` are scraped instead
    and each one that yields articles becomes its own feed.
    """

    title = u'Brewiarz'
    # NOTE(review): the trailing space suggests an e-mail address was
    # stripped after the name -- confirm against the upstream file.
    __author__ = 'Artur Stachecki '
    language = 'pl'
    description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.'
    masthead_url = 'http://brewiarz.pl/images/logo2.gif'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    publication_type = 'newspaper'
    # Number of consecutive days, starting today, to download.
    next_days = 1

    # Lower-case roman numerals used in the site's month directories,
    # indexed by ``month - 1``.
    _ROMAN_MONTHS = ('i', 'ii', 'iii', 'iv', 'v', 'vi',
                     'vii', 'viii', 'ix', 'x', 'xi', 'xii')
    # Polish weekday names indexed by ``date.weekday()`` (Monday == 0).
    # Unicode literals, because they are joined with unicode link text.
    _WEEKDAYS_PL = (u'Poniedziałek', u'Wtorek', u'Środa', u'Czwartek',
                    u'Piątek', u'Sobota', u'Niedziela')
    # Tags kept by ``strip_tags``; every other tag is unwrapped.
    _VALID_TAGS = ('p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body')
    # Replacement for 'php3' that turns a page link into its print view.
    _PRINT_QUERY = 'php3?kr=_druk&wr=lg&'

    def parse_index(self):
        """Return ``[(feed_title, articles), ...]`` for the configured days.

        The weekday name is looked up through ``date.weekday()`` rather
        than ``strftime('%A')``: the latter follows the process locale, so
        a lookup keyed on English day names fails under any other locale.
        """
        today = datetime.datetime.now()
        feeds = []
        for offset in range(self.next_days):
            day = today + datetime.timedelta(days=offset)
            dd = '%02d' % day.day
            mm = '%02d' % day.month
            yy = '%02d' % (day.year % 100)
            # Day directory, e.g. http://brewiarz.pl/xi_12/2311/ for 2012-11-23.
            base = 'http://brewiarz.pl/%s_%s/%s%s/' % (
                self._ROMAN_MONTHS[day.month - 1], yy, dd, mm)
            day_title = (self._WEEKDAYS_PL[day.weekday()] + ' ' +
                         dd + '.' + mm + '.' + yy)

            index_url = base + 'index.php3'
            articles = self.parse_pages(index_url)
            if articles:
                feeds.append((day_title, articles))
                continue

            # No article list on the day page itself: try each sub-page.
            for subpage in self.get_sectors(index_url):
                sector_url = base + subpage['href']
                self.log.debug('Fetching sector page: ' + sector_url)
                articles = self.parse_pages(sector_url)
                if articles:
                    # ``.string`` is None when the link wraps other tags.
                    label = subpage.string or self.tag_to_string(subpage)
                    feeds.append((day_title + ' - ' + label, articles))
        return feeds

    def get_sectors(self, url):
        """Return the ``<a>`` tags found in the 490-wide table at *url*.

        An empty list is returned when the table is missing.

        The earlier version also tried to copy the enclosing ``<div>`` text
        onto each link through ``link_parent_text.text``. That expression
        asks a text value for its ``.text`` and so raises AttributeError
        whenever the parent text is non-empty, and ``parse_index`` never
        read the copied value (it uses ``.string`` and ``['href']``), so
        the step has been dropped.
        """
        soup = self.index_to_soup(url)
        sectors_table = soup.find(name='table', attrs={'width': '490'})
        if sectors_table is None:
            return []
        return list(sectors_table.findAll(name='a'))

    def parse_pages(self, url):
        """Collect print-view article links from the page at *url*.

        Returns a list of article dicts (``title``, ``url``,
        ``description``, ``date``), or ``None`` when the page lacks the
        ``class="www"`` element, the 'Teksty LG' marker or the list that
        follows it.
        """
        soup = self.index_to_soup(url)
        www = soup.find(attrs={'class': 'www'})
        if www is None:
            return None
        box_title = www.find(text='Teksty LG')
        if box_title is None:
            return None
        box_parent = box_title.findParent('ul')
        if box_parent is None:
            return None
        listing = box_parent.findNextSibling('ul')
        if listing is None:
            return None

        # Links are relative to the directory of *url*. Slicing off a fixed
        # 10 characters was only right for pages named 'index.php3'.
        base = url.rpartition('/')[0] + '/'

        current_articles = []
        for li in listing.findAll('li'):
            link = li.find(name='a')
            if link is None:
                continue
            ol = link.findNextSibling(name='ol')
            if ol is not None:
                # Entry with a nested list: one article per nested link.
                for sublink in ol.findAll(name='a'):
                    link_title = (self.tag_to_string(link) + " - " +
                                  self.tag_to_string(sublink))
                    current_articles.append(
                        self._print_article(link_title, base, sublink['href']))
            elif link.findParent(name='ol') is None:
                # Plain entry. Links inside an <ol> were already emitted
                # through their parent entry above.
                current_articles.append(
                    self._print_article(self.tag_to_string(link), base,
                                        link['href']))
        return current_articles

    def _print_article(self, title, base, href):
        """Build one article dict pointing at the print view of *href*."""
        return {'title': title,
                'url': base + re.sub('php3', self._PRINT_QUERY, href),
                'description': '', 'date': ''}

    def _drop_enclosing_div(self, node):
        """Extract the nearest ``<div>`` ancestor of *node*, if both exist."""
        if node is None:
            return
        wrapper = node.findParent('div')
        if wrapper is not None:
            wrapper.extract()

    def preprocess_html(self, soup):
        """Strip site chrome from an article page and return the soup.

        Removes, in this order: the ``<div>`` around the link to
        http://brewiarz.pl, the ``<div>`` around the text
        'http://brewiarz.pl', the ``<div>`` around 'Kolor szat:' and the
        first ``<b>``. It then unwraps every tag not in ``_VALID_TAGS`` and
        extracts the second to fourth remaining ``<div>``. Elements that
        are absent are skipped instead of raising.
        """
        self._drop_enclosing_div(
            soup.find(name='a', attrs={'href': 'http://brewiarz.pl'}))
        self._drop_enclosing_div(soup.find(text='http://brewiarz.pl'))
        self._drop_enclosing_div(soup.find(text='Kolor szat:'))

        color = soup.find('b')
        if color is not None:
            color.extract()

        cleaned = self.strip_tags(soup)

        # The list is taken once, so the indices refer to the layout
        # before any of these three extractions.
        for div in cleaned.findAll(name='div')[1:4]:
            div.extract()

        return cleaned

    def strip_tags(self, soup_dirty):
        """Unwrap every tag not in ``_VALID_TAGS``, keeping its children.

        The children of a removed tag are re-inserted into its parent at
        the position the tag occupied. Returns *soup_dirty* itself.
        """
        for tag in soup_dirty.findAll(True):
            if tag.name in self._VALID_TAGS:
                continue
            parent = tag.parent
            if parent is None:
                continue
            for position, sibling in enumerate(parent.contents):
                # Identity, not equality: we want this very node.
                if sibling is tag:
                    break
            else:
                self.log.warn("Can't find", tag, "in", parent)
                continue
            # Inserting in reverse at a fixed index keeps document order.
            # Iterate a copy because insert() detaches each child from tag.
            for child in reversed(list(tag.contents)):
                parent.insert(position, child)
            tag.extract()

        return soup_dirty
def print_version(self, url):
    """Map an article URL to its Business Standard print-page URL.

    Only the last two ``/``-separated components of *url* are used: the
    final one becomes the ``tp`` query parameter and the one before it
    becomes ``autono``.
    """
    remainder, _, page_type = url.rpartition('/')
    article_no = remainder.rpartition('/')[2]
    return ''.join((
        'http://www.business-standard.com/india/printpage.php?autono=',
        article_no,
        '&tp=',
        page_type,
    ))
self.domain + nextbutton.a['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class':'main-body'}) + for it in texttag.findAll('div', attrs={'class':'region region-content-ad'}): + it.extract() + for it in texttag.findAll('cite'): + it.extract() + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) + pager.extract() + diff --git a/recipes/ceske_noviny.recipe b/recipes/ceske_noviny.recipe new file mode 100644 index 0000000000..10dd16689d --- /dev/null +++ b/recipes/ceske_noviny.recipe @@ -0,0 +1,30 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskenovinyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'České Noviny' + description = 'ceskenoviny.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://www.ceskenoviny.cz/sluzby/rss/domov.php') + #,(u'Hlavní události', u'http://www.ceskenoviny.cz/sluzby/rss/index.php') + #,(u'Přehled zpráv', u'http://www.ceskenoviny.cz/sluzby/rss/zpravy.php') + #,(u'Ze světa', u'http://www.ceskenoviny.cz/sluzby/rss/svet.php') + #,(u'Kultura', u'http://www.ceskenoviny.cz/sluzby/rss/kultura.php') + #,(u'IT', u'http://www.ceskenoviny.cz/sluzby/rss/pocitace.php') + ] + + + language = 'cs' + cover_url = 'http://i4.cn.cz/grafika/cn_logo-print.gif' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + filter_regexps = [r'img.aktualne.centrum.cz'] + + keep_only_tags = [dict(name='div', attrs={'id':'clnk'})] diff --git a/recipes/cesky_rozhlas_6.recipe b/recipes/cesky_rozhlas_6.recipe new file mode 100644 index 0000000000..eca32af02c --- /dev/null +++ b/recipes/cesky_rozhlas_6.recipe @@ -0,0 +1,26 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + 
+class cro6Recipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Český rozhlas 6' + description = 'Český rozhlas 6' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Český rozhlas 6', u'http://www.rozhlas.cz/export/cro6/') + ] + + + language = 'cs' + cover_url = 'http://www.rozhlas.cz/img/e5/logo/cro6.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags = [dict(name='div', attrs={'class':['audio-play-all', 'poradHeaders', 'actions']}), + dict(name='p', attrs={'class':['para-last']})] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] diff --git a/recipes/demagog.cz.recipe b/recipes/demagog.cz.recipe new file mode 100644 index 0000000000..7d89af41bd --- /dev/null +++ b/recipes/demagog.cz.recipe @@ -0,0 +1,39 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class demagogRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Demagog.cz' + publisher = u'' + description = 'demagog.cz' + oldest_article = 6 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://demagog.cz/rss') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://demagog.cz/content/images/demagog.cz.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + .vyrok_suhrn{margin-top:50px; } + .vyrok{margin-bottom:30px; } + """ + + remove_tags = [dict(name='a', attrs={'class':'vyrok_odovodnenie_tgl'}), + dict(name='img', attrs={'class':'vyrok_fotografia'})] + remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='div', attrs={'class':'vyrok_text_after'}) + preprocess_regexps = [(re.compile(r'(
)', re.DOTALL|re.IGNORECASE), lambda match: '\1
')] + + + + diff --git a/recipes/denik.cz.recipe b/recipes/denik.cz.recipe new file mode 100644 index 0000000000..2ccf8caa40 --- /dev/null +++ b/recipes/denik.cz.recipe @@ -0,0 +1,36 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskyDenikRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'denik.cz' + publisher = u'' + description = u'Český deník' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Z domova', u'http://www.denik.cz/rss/z_domova.html') + ,(u'Pražský deník - Moje Praha', u'http://prazsky.denik.cz/rss/zpravy_region.html') + #,(u'Zahraničí', u'http://www.denik.cz/rss/ze_sveta.html') + #,(u'Kultura', u'http://www.denik.cz/rss/kultura.html') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.denik.cz/images/loga/denik.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_tags = [] + keep_only_tags = [dict(name='div', attrs={'class':'content'})] + #remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='p', attrs={'class':'clanek-autor'}) + + diff --git a/recipes/denik_referendum.recipe b/recipes/denik_referendum.recipe new file mode 100644 index 0000000000..e04871d067 --- /dev/null +++ b/recipes/denik_referendum.recipe @@ -0,0 +1,28 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class denikReferendumRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Den\u00edk Referendum' + publisher = u'' + description = '' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Deník Referendum', u'http://feeds.feedburner.com/DenikReferendum') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + remove_javascript = True + no_stylesheets = True + use_embedded_content = 
False + remove_attributes = [] + remove_tags_after = dict(name='div', attrs={'class':['text']}) + remove_tags = [dict(name='div', attrs={'class':['box boxLine', 'box noprint', 'box']}), + dict(name='h3', attrs={'class':'head alt'})] + + keep_only_tags = [dict(name='div', attrs={'id':['content']})] diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 5254694d24..a4e24ac61b 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -6,7 +6,6 @@ class Dobreprogramy_pl(BasicNewsRecipe): __author__ = 'fenuks' __licence__ ='GPL v3' category = 'IT' - language = 'pl' masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' @@ -29,4 +28,4 @@ class Dobreprogramy_pl(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/editoriali.recipe b/recipes/editoriali.recipe index 1b0c558df4..c5596bd743 100644 --- a/recipes/editoriali.recipe +++ b/recipes/editoriali.recipe @@ -7,6 +7,7 @@ class AdvancedUserRecipe1332847053(BasicNewsRecipe): title = u'Editoriali' __author__ = 'faber1971' description = 'Leading articles on Italy by the best Italian editorials' + language = 'it' oldest_article = 1 max_articles_per_feed = 100 diff --git a/recipes/f1_ultra.recipe b/recipes/f1_ultra.recipe new file mode 100644 index 0000000000..ada82542fc --- /dev/null +++ b/recipes/f1_ultra.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class f1ultra(BasicNewsRecipe): + title = u'Formuła 1 - F1 ultra' + __license__ = 'GPL v3' + __author__ = 'MrStefan , Artur Stachecki ' + language = 'pl' + description =u'Formuła 1, Robert Kubica, F3, GP2 oraz inne serie wyścigowe.' 
+ masthead_url='http://www.f1ultra.pl/templates/f1ultra/images/logo.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[(dict(name = 'div', attrs = {'id' : 'main'}))] + remove_tags_after =[dict(attrs = {'style' : 'margin-top:5px;margin-bottom:5px;display: inline;'})] + remove_tags =[(dict(attrs = {'class' : ['buttonheading', 'avPlayerContainer', 'createdate']}))] + remove_tags.append(dict(attrs = {'title' : ['PDF', 'Drukuj', 'Email']})) + remove_tags.append(dict(name = 'form', attrs = {'method' : 'post'})) + remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'})) + + preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''), + (re.compile(r'align="right"'), lambda match: ''), + (re.compile(r'width=\"*\"'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } + img { display: block; clear: both;} + ''' + remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align'] + + feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] diff --git a/recipes/foreign_policy.recipe b/recipes/foreign_policy.recipe index 893d055a05..4ddecf842f 100644 --- a/recipes/foreign_policy.recipe +++ b/recipes/foreign_policy.recipe @@ -8,6 +8,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1349086293(BasicNewsRecipe): title = u'Foreign Policy' + language = 'en' __author__ = 'Darko Miletic' description = 'International News' publisher = 'Washingtonpost.Newsweek Interactive, LLC' diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index 452dca9068..6755770329 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -1,39 +1,88 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = u'2010, Tomasz Dlugosz ' +__copyright__ = u'2010-2012, Tomasz Dlugosz ' ''' 
fronda.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re +from datetime import timedelta, date class Fronda(BasicNewsRecipe): title = u'Fronda.pl' publisher = u'Fronda.pl' - description = u'Portal po\u015bwi\u0119cony - Infformacje' + description = u'Portal po\u015bwi\u0119cony - Informacje' language = 'pl' __author__ = u'Tomasz D\u0142ugosz' oldest_article = 7 max_articles_per_feed = 100 use_embedded_content = False + no_stylesheets = True - feeds = [(u'Infformacje', u'http://fronda.pl/news/feed')] + extra_css = ''' + h1 {font-size:150%} + .body {text-align:left;} + div.headline {font-weight:bold} + ''' - keep_only_tags = [dict(name='h2', attrs={'class':'news_title'}), - dict(name='div', attrs={'class':'naglowek_tresc'}), - dict(name='div', attrs={'id':'czytaj'}) ] + earliest_date = date.today() - timedelta(days=oldest_article) - remove_tags = [dict(name='a', attrs={'class':'print'})] + def date_cut(self,datestr): + # eg. 5.11.2012, 12:07 + timestamp = datestr.split(',')[0] + parts = timestamp.split('.') + art_date = date(int(parts[2]),int(parts[1]),int(parts[0])) + return True if art_date < self.earliest_date else False - preprocess_regexps = [ - (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in - [ (r'

.*

', lambda match: ''), - (r'

.*

', lambda match: ''), - (r'

W.* lektury.*

', lambda match: ''), - (r'

Zobacz t.*?', lambda match: ''), - (r']*> 

', lambda match: ''), - (r'


', lambda match: ''), - (r'')] + needs_subscription = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('http://www.denikinsider.cz/') + br.select_form(nr=0) + br['login-name'] = self.username + br['login-password'] = self.password + res = br.submit() + raw = res.read() + if u'Odhlásit se' not in raw: + raise ValueError('Failed to login to insider.cz' + 'Check your username and password.') + return br + + def parse_index(self): + articles = [] + + soup = self.index_to_soup('http://www.denikinsider.cz') + titles = soup.findAll('span', attrs={'class':'homepageArticleTitle'}) + if titles is None: + raise ValueError('Could not find category content') + + articles = [] + seen_titles = set([]) + for title in titles: + if title.string in seen_titles: + continue + article = title.parent + seen_titles.add(title.string) + url = article['href'] + if url.startswith('/'): + url = 'http://www.denikinsider.cz/'+url + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title.string, 'url':url, 'description':'', + 'date':''}) + return [(self.title, articles)] + + diff --git a/recipes/kerrang.recipe b/recipes/kerrang.recipe new file mode 100644 index 0000000000..bbd944eb62 --- /dev/null +++ b/recipes/kerrang.recipe @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class kerrang(BasicNewsRecipe): + title = u'Kerrang!' 
+ __author__ = 'Artur Stachecki ' + language = 'en_GB' + description = u'UK-based magazine devoted to rock music published by Bauer Media Group' + oldest_article = 7 + masthead_url = 'http://images.kerrang.com/design/kerrang/kerrangsite/logo.gif' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs = {'class' : ['headz', 'blktxt']})) + + extra_css = ''' img { display: block; margin-right: auto;} + h1 {text-align: left; font-size: 22px;}''' + + feeds = [(u'News', u'http://www.kerrang.com/blog/rss.xml')] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/kp.recipe b/recipes/kp.recipe new file mode 100644 index 0000000000..f52fcef60b --- /dev/null +++ b/recipes/kp.recipe @@ -0,0 +1,52 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class KrytykaPolitycznaRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = u'intromatyk ' + language = 'pl' + version = 1 + + title = u'Krytyka Polityczna' + category = u'News' + description = u' Lewicowe pismo zaangażowane w bieg spraw publicznych w Polsce.' 
+ cover_url='' + remove_empty_feeds= True + no_stylesheets=True + oldest_article = 7 + max_articles_per_feed = 100000 + recursions = 0 + + no_stylesheets = True + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'print-title'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'print-content'})) + + remove_tags =[] + remove_tags.append(dict(attrs = {'class' : ['field field-type-text field-field-story-switch', 'field field-type-filefield field-field-story-temp' , 'field field-type-text field-field-story-author', 'field field-type-text field-field-story-lead-switch']})) + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + td.contentheading{font-size: large; font-weight: bold;} + ''' + + feeds = [ + ('Wszystkie', 'http://www.krytykapolityczna.pl/rss.xml') + ] + + def print_version(self, url): + soup = self.index_to_soup(url) + print_ico = soup.find(attrs = {'class' : 'print-page'}) + print_uri = print_ico['href'] + self.log('PRINT', print_uri) + return 'http://www.krytykapolityczna.pl/' + print_uri + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/kudy_z_nudy.recipe b/recipes/kudy_z_nudy.recipe new file mode 100644 index 0000000000..d7c0d9ecf9 --- /dev/null +++ b/recipes/kudy_z_nudy.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class kudyznudyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Kudy z nudy' + publisher = u'' + description = 'kudyznudy.cz' + oldest_article = 3 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Praha nejnovější', u'http://www.kudyznudy.cz/RSS/Charts.aspx?Type=Newest&Lang=cs-CZ&RegionId=1') + ] + + + 
#encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.kudyznudy.cz/App_Themes/KzN/Images/Containers/Header/HeaderLogoKZN.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'class':['C_WholeContentPadding']}) + remove_tags_after = dict(name='div', attrs={'class':['SurroundingsContainer']}) + remove_tags = [dict(name='div', attrs={'class':['Details', 'buttons', 'SurroundingsContainer', 'breadcrumb']})] + + keep_only_tags = [] diff --git a/recipes/lequipe.recipe b/recipes/lequipe.recipe new file mode 100644 index 0000000000..c6e9a26880 --- /dev/null +++ b/recipes/lequipe.recipe @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class leequipe(BasicNewsRecipe): + title = u'l\'equipe' + __author__ = 'Artur Stachecki ' + language = 'fr' + description = u'Retrouvez tout le sport en direct sur le site de L\'EQUIPE et suivez l\'actualité du football, rugby, basket, cyclisme, f1, volley, hand, tous les résultats sportifs' + oldest_article = 1 + masthead_url = 'http://static.lequipe.fr/v6/img/logo-lequipe.png' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs={'id': ['article']})) + + remove_tags = [] + remove_tags.append(dict(attrs={'id': ['partage', 'ensavoirplus', 'bloc_bas_breve', 'commentaires', 'tools']})) + remove_tags.append(dict(attrs={'class': ['partage_bis', 'date']})) + + feeds = [(u'Football', u'http://www.lequipe.fr/rss/actu_rss_Football.xml'), + (u'Auto-Moto', u'http://www.lequipe.fr/rss/actu_rss_Auto-Moto.xml'), + (u'Tennis', u'http://www.lequipe.fr/rss/actu_rss_Tennis.xml'), + (u'Golf', u'http://www.lequipe.fr/rss/actu_rss_Golf.xml'), + (u'Rugby', u'http://www.lequipe.fr/rss/actu_rss_Rugby.xml'), + 
(u'Basket', u'http://www.lequipe.fr/rss/actu_rss_Basket.xml'), + (u'Hand', u'http://www.lequipe.fr/rss/actu_rss_Hand.xml'), + (u'Cyclisme', u'http://www.lequipe.fr/rss/actu_rss_Cyclisme.xml'), + (u'Autres Sports', u'http://pipes.yahoo.com/pipes/pipe.run?_id=2039f7f4f350c70c5e4e8633aa1b37cd&_render=rss') + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lidovky.recipe b/recipes/lidovky.recipe new file mode 100644 index 0000000000..8e4754829b --- /dev/null +++ b/recipes/lidovky.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class lnRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'lidovky' + publisher = u'' + description = 'lidovky.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Události', u'http://www.lidovky.cz/export/rss.asp?r=ln_domov'), + (u'Svět', u'http://www.lidovky.cz/export/rss.asp?r=ln_zahranici'), + (u'Byznys', u'http://www.lidovky.cz/export/rss.asp?c=ln_byznys'), + (u'Věda', u'http://www.lidovky.cz/export/rss.asp?r=ln_veda'), + (u'Názory', u'http://www.lidovky.cz/export/rss.asp?r=ln_nazory'), + (u'Relax', u'http://www.lidovky.cz/export/rss.asp?c=ln_relax') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.lidovky.cz/o/lidovky_ln3b/lidovky-logo.png' + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['content']}) + remove_tags_after = dict(name='div', attrs={'class':['authors']}) + preprocess_regexps = [(re.compile(r'
')] + + keep_only_tags = [] + + + + + diff --git a/recipes/mateusz_czytania.recipe b/recipes/mateusz_czytania.recipe new file mode 100644 index 0000000000..ba7c598787 --- /dev/null +++ b/recipes/mateusz_czytania.recipe @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://www.mateusz.pl/czytania +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class czytania_mateusz(BasicNewsRecipe): + title = u'Czytania na ka\u017cdy dzie\u0144' + __author__ = 'teepel ' + description = u'Codzienne czytania z jednego z najstarszych polskich serwisów katolickich.' + language = 'pl' + INDEX='http://www.mateusz.pl/czytania' + oldest_article = 1 + remove_empty_feeds= True + no_stylesheets=True + auto_cleanup = True + remove_javascript = True + simultaneous_downloads = 2 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Czytania', u'http://mateusz.pl/rss/czytania/')] + + remove_tags =[] + remove_tags.append(dict(name = 'p', attrs = {'class' : 'top'})) + + #thanks t3d + def get_article_url(self, article): + link = article.get('link') + if 'kmt.pl' not in link: + return link diff --git a/recipes/metropol_tv.recipe b/recipes/metropol_tv.recipe new file mode 100644 index 0000000000..56f393c96a --- /dev/null +++ b/recipes/metropol_tv.recipe @@ -0,0 +1,29 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class metropolRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Metropol TV' + publisher = u'' + description = 'metropol.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Metropolcv.cz', u'http://www.metropol.cz/rss/') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.metropol.cz/public/css/../images/logo/metropoltv.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = 
[] + + keep_only_tags = [dict(name='div', attrs={'id':['art-full']})] diff --git a/recipes/myapple_pl.recipe b/recipes/myapple_pl.recipe new file mode 100644 index 0000000000..df5708a325 --- /dev/null +++ b/recipes/myapple_pl.recipe @@ -0,0 +1,49 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class MyAppleRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + + title = u'MyApple.pl' + category = u'News' + description = u' Największy w Polsce serwis zajmujący się tematyką związaną z Apple i wszelkimi produktami tej firmy.' + cover_url='' + remove_empty_feeds= True + no_stylesheets=True + oldest_article = 7 + max_articles_per_feed = 100000 + recursions = 0 + + no_stylesheets = True + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article_content'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'article_author_date_comment_container'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fullwidth'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'cmslinks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'googleads-468'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'comments'})) + + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + td.contentheading{font-size: large; font-weight: bold;} + ''' + + feeds = [ + ('News', 'feed://myapple.pl/external.php?do=rss&type=newcontent§ionid=1&days=120&count=10'), + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/nadacni_fond_proti_korupci.recipe b/recipes/nadacni_fond_proti_korupci.recipe new file mode 100644 index 0000000000..2a8a69283c --- /dev/null +++ b/recipes/nadacni_fond_proti_korupci.recipe @@ -0,0 +1,30 
@@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class nfpkRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Nadační fond proti korupci' + publisher = u'' + description = 'nfpk.cz' + oldest_article = 7 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://feeds.feedburner.com/nfpk') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.nfpk.cz/_templates/nfpk/_images/logo.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + keep_only_tags = [dict(name='div', attrs={'id':'content'})] + diff --git a/recipes/naszdziennik.recipe b/recipes/naszdziennik.recipe new file mode 100644 index 0000000000..4c7b78c199 --- /dev/null +++ b/recipes/naszdziennik.recipe @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class naszdziennik(BasicNewsRecipe): + title = u'Nasz Dziennik' + __author__ = 'Artur Stachecki ' + language = 'pl' + description =u'Nasz Dziennik - Ogólnopolska gazeta codzienna. Podejmuje tematykę dotyczącą życia społecznego, kulturalnego, politycznego i religijnego. Propaguje wartości chrześcijańskie oraz tradycję i kulturę polską.' 
+ masthead_url='http://www.naszdziennik.pl/images/logo-male.png' + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets = True + + keep_only_tags =[dict(attrs = {'id' : 'article'})] + + #definiujemy nową funkcje; musi zwracać listę feedów wraz z artykułami + def parse_index(self): + #adres do parsowania artykułów + soup = self.index_to_soup('http://www.naszdziennik.pl/news') + #deklaracja pustej listy feedów + feeds = [] + #deklaracja pustego słownika artykułów + articles = {} + #deklaracja pustej listy sekcji + sections = [] + #deklaracja pierwszej sekcji jako pusty string + section = '' + + #pętla for, która analizuje po kolei każdy tag "news-article" + for item in soup.findAll(attrs = {'class' : 'news-article'}) : + #w tagu "news-article szukamy pierwszego taga h4" + section = item.find('h4') + #zmiennej sekcja przypisujemy zawartość tekstową taga + section = self.tag_to_string(section) + #sprawdzamy czy w słowniku artykułów istnieje klucz dotyczący sekcji + #jeśli nie istnieje to : + if not articles.has_key(section) : + #do listy sekcji dodajemy nową sekcje + sections.append(section) + #deklarujemy nową sekcje w słowniku artykułów przypisując jej klucz odpowiadający nowej sekcji, którego wartością jest pusta lista + articles[section] = [] + #przeszukujemy kolejny tag "title-datetime" + article_title_datetime = item.find(attrs = {'class' : 'title-datetime'}) + #w tagu title-datetime znajdujemy pierwszy link + article_a = article_title_datetime.find('a') + #i tworzymy z niego link absolutny do właściwego artykułu + article_url = 'http://naszdziennik.pl' + article_a['href'] + #jako tytuł użyty będzie tekst pomiędzy tagami + article_title = self.tag_to_string(article_a) + #a data będzie tekstem z pierwszego taga h4 znalezionego w tagu title-datetime + article_date = self.tag_to_string(article_title_datetime.find('h4')) + #zebrane elementy dodajemy do listy zadeklarowanej w linijce 44 + articles[section].append( { 'title' : article_title, 'url' : 
article_url, 'date' : article_date }) + #po dodaniu wszystkich artykułów dodajemy sekcje do listy feedów, korzystając z list sekcji znajdujących się w słowniku + for section in sections: + feeds.append((section, articles[section])) + #zwracamy listę feedów, której parsowaniem zajmie się calibre + return feeds \ No newline at end of file diff --git a/recipes/nepszabadsag.recipe b/recipes/nepszabadsag.recipe new file mode 100644 index 0000000000..8ae5447dd6 --- /dev/null +++ b/recipes/nepszabadsag.recipe @@ -0,0 +1,56 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +''' +Fetch Népszabadság +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class nepszabadsag(BasicNewsRecipe): + title = u'N\u00e9pszabads\u00e1g' + description = '' + __author__ = 'bubak' + use_embedded_content = False + timefmt = ' [%d %b %Y]' + oldest_article = 2 + max_articles_per_feed = 20 + no_stylesheets = True + language = 'hu' + #delay = 1 + #timeout = 10 + simultaneous_downloads = 5 + + #encoding = 'utf-8' + remove_javascript = True + cover_url = 'http://nol.hu/_design/image/logo_nol_live.jpg' + + feeds = [ + (u'Belföld', u'http://nol.hu/feed/belfold.rss') + #,(u'Külföld', u'http://nol.hu/feed/kulfold.rss') + #,(u'Gazdaság', u'http://nol.hu/feed/gazdasag.rss') + #,(u'Kultúra', u'http://nol.hu/feed/kult.rss') + ] + + extra_css = ''' + ''' + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'class':['d-source']}) + remove_tags_after = dict(name='div', attrs={'class':['tags']}) + remove_tags = [dict(name='div', attrs={'class':['h']}), + dict(name='tfoot')] + + + keep_only_tags = [dict(name='table', attrs={'class':'article-box'})] + + # NS sends an ad page sometimes but not frequently enough, TBD + def AAskip_ad_pages(self, soup): + if ('advertisement' in soup.find('title').string.lower()): + href = soup.find('a').get('href') + self.log.debug('Skipping to: ' + href) + new = self.browser.open(href).read().decode('utf-8', 
'ignore') + #ipython(locals()) + self.log.debug('Finished: ' + href) + return new + else: + return None + diff --git a/recipes/neviditelny_pes.recipe b/recipes/neviditelny_pes.recipe new file mode 100644 index 0000000000..65cfb2b7ec --- /dev/null +++ b/recipes/neviditelny_pes.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class pesRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Neviditelný pes' + publisher = u'' + description = u'Neviditelný pes' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Neviditelný pes', u'http://neviditelnypes.lidovky.cz/export/rss.asp?c=pes_neviditelny') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.zpravy.cz/o/pes/logo_pes.jpg' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_tags = [] + remove_tags_before = dict(name='div', attrs={'id':'art-full'}) + remove_tags_after = dict(name='div', attrs={'id':'authors'}) + + diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 9eeb8b31ee..c5f1b0aff2 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -22,9 +22,9 @@ class NewYorker(BasicNewsRecipe): masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif' extra_css = """ body {font-family: "Times New Roman",Times,serif} - .articleauthor{color: #9F9F9F; + .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; - font-size: small; + font-size: small; text-transform: uppercase} .rubric,.dd,h6#credit{color: #CD0021; font-family: Arial, sans-serif; @@ -63,11 +63,11 @@ class NewYorker(BasicNewsRecipe): return url.strip() def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('http://www.newyorker.com/magazine/toc/') - cover_item = soup.find('img',attrs={'id':'inThisIssuePhoto'}) + cover_url = 
"http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg" + soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine') + cover_item = soup.find('div',attrs={'id':'media-count-1'}) if cover_item: - cover_url = 'http://www.newyorker.com' + cover_item['src'].strip() + cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip() return cover_url def preprocess_html(self, soup): diff --git a/recipes/newsweek_polska.recipe b/recipes/newsweek_polska.recipe index b1d6359d11..ec50e0f438 100644 --- a/recipes/newsweek_polska.recipe +++ b/recipes/newsweek_polska.recipe @@ -13,7 +13,7 @@ import datetime class Newsweek(BasicNewsRecipe): # how many issues to go back, 0 means get the most current one - BACK_ISSUES = 1 + BACK_ISSUES = 2 EDITION = '0' DATE = None diff --git a/recipes/novinky.cz.recipe b/recipes/novinky.cz.recipe new file mode 100644 index 0000000000..19fd52a371 --- /dev/null +++ b/recipes/novinky.cz.recipe @@ -0,0 +1,50 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class novinkyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'novinky.cz' + publisher = u'seznam.cz' + description = 'novinky.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://www.novinky.cz/rss2/domaci/'), + (u'Praha', u'http://www.novinky.cz/rss2/vase-zpravy/praha/'), + (u'Ekonomika', u'http://www.novinky.cz/rss2/ekonomika/'), + (u'Finance', u'http://www.novinky.cz/rss2/finance/'), + ] + + + #encoding = 'utf-8' + language = 'cs' + cover_url = 'http://www.novinky.cz/static/images/logo.gif' + remove_javascript = True + no_stylesheets = True + + remove_tags = [dict(name='div', attrs={'id':['pictureInnerBox']}), + dict(name='div', attrs={'id':['discussionEntry']}), + dict(name='span', attrs={'id':['mynews-hits', 'mynews-author']}), + dict(name='div', attrs={'class':['related']}), + dict(name='div', 
attrs={'id':['multimediaInfo']})] + remove_tags_before = dict(name='div',attrs={'class':['articleHeader']}) + remove_tags_after = dict(name='div',attrs={'class':'related'}) + + keep_only_tags = [] + + # This source has identical articles under different links + # which are redirected to the common url. I've found + # just this API method that has the real URL + visited_urls = {} + def encoding(self, source): + url = source.newurl + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return source.decode('utf-8', 'replace') + diff --git a/recipes/parlamentni_listy.recipe b/recipes/parlamentni_listy.recipe new file mode 100644 index 0000000000..71d904866a --- /dev/null +++ b/recipes/parlamentni_listy.recipe @@ -0,0 +1,38 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class plRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Parlamentn\u00ed Listy' + publisher = u'' + description = '' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.parlamentnilisty.cz/design/listy-logo2.png' + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_attributes = [] + remove_tags = [dict(name='div', attrs={'class':['articledetailboxin','crumbs', 'relatedarticles articledetailbox']}), + dict(name='div', attrs={'class':['socialshare-1 noprint', 'socialshare-2 noprint']}), + dict(name='div', attrs={'id':'widget'}), + dict(name='div', attrs={'class':'article-discussion-box noprint'})] + preprocess_regexps = [(re.compile(r'<(span|strong)[^>]*>\s*Ptejte se politik.*', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = 
[dict(name='div', attrs={'class':['article-detail']})] + + + + + + diff --git a/recipes/piratska_strana.recipe b/recipes/piratska_strana.recipe new file mode 100644 index 0000000000..c125eb8aad --- /dev/null +++ b/recipes/piratska_strana.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class cpsRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Piratská strana' + publisher = u'' + description = '' + oldest_article = 3 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Články', u'http://www.pirati.cz/rss.xml') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.pirati.cz/sites/all/themes/addari-cps/images/headbg.jpg' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + keep_only_tags = [dict(name='div', attrs={'id':'postarea'})] + remove_tags = [dict(name='div', attrs={'class':['breadcrumb', 'submitted', 'links-readmore']}), + dict(name='div', attrs={'id':['comments']})] + remove_tags_before = dict(name='font', attrs={'size':'+3'}) + remove_tags_after = [dict(name='iframe')] + + conversion_options = {'linearize_tables' : True} + + + + + diff --git a/recipes/piratske_noviny.recipe b/recipes/piratske_noviny.recipe new file mode 100644 index 0000000000..a2d30374ed --- /dev/null +++ b/recipes/piratske_noviny.recipe @@ -0,0 +1,34 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class nfpkRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Piratské noviny' + publisher = u'' + description = 'nfpk.cz' + oldest_article = 2 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://www.piratskenoviny.cz/run/rss.php') + ] + + + 
#encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.piratskenoviny.cz/imgs/piratske-noviny.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='font', attrs={'size':'+3'}) + remove_tags_after = [dict(name='iframe')] + conversion_options = {'linearize_tables' : True} + + + diff --git a/recipes/portfolio_hu.recipe b/recipes/portfolio_hu.recipe index 5eaf0e886f..feadd124ed 100644 --- a/recipes/portfolio_hu.recipe +++ b/recipes/portfolio_hu.recipe @@ -4,7 +4,7 @@ class AdvancedUserRecipe1348063712(BasicNewsRecipe): title = u'Portfolio.hu - English Edition' __author__ = 'laca' oldest_article = 7 - language = 'en_HUN' + language = 'en_HU' masthead_url = 'http://www.portfolio.hu/img/sit/angolfejlec2010.jpg' use_embedded_content = False auto_cleanup = True diff --git a/recipes/pravo.recipe b/recipes/pravo.recipe new file mode 100644 index 0000000000..02d2c13439 --- /dev/null +++ b/recipes/pravo.recipe @@ -0,0 +1,64 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals + +from calibre.web.feeds.news import BasicNewsRecipe + +class pravo(BasicNewsRecipe): + __author__ = 'bubak' + title = 'Právo' + language = 'cs' + + remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'}) + remove_tags_after = dict(name='td', attrs={'class':'rubrika'}) + remove_tags = [dict(name='td', attrs={'width':'273'}) + ,dict(name='td', attrs={'class':'rubrika'}) + ,dict(name='div', attrs={'class':'rubrika-ostat'}) + ] + extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}' + cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif' + cover_margins = (0, 100, '#ffffff') + conversion_options = {'linearize_tables' : True} + + no_stylesheets = True + + # our variables + seen_titles = set([]) + # only yesterday's articles are online + parent_url = 'http://pravo.novinky.cz/minule/' + feeds = [ + ('Hlavní 
stránka', 'http://pravo.novinky.cz/minule/index.php'), + ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'), + ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'), + ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php') + ] + + + def parse_index(self): + articles = [] + + for feed in self.feeds: + articles.append(self.parse_page(feed)) + return articles + + def parse_page(self, (feed_title, url)): + articles = [] + + soup = self.index_to_soup(url) + titles = soup.findAll('a', attrs={'class':'nadpis'}) + if titles is None: + raise ValueError('Could not find any articles on page ' + url) + + articles = [] + for article in titles: + title = article.string + if title in self.seen_titles: + continue + self.seen_titles.add(title) + url = article['href'] + if not url.startswith('http'): + url = self.parent_url + url + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title.string, 'url':url, 'description':'', + 'date':''}) + return (feed_title, articles) + diff --git a/recipes/prawica_net.recipe b/recipes/prawica_net.recipe new file mode 100644 index 0000000000..edd12695c5 --- /dev/null +++ b/recipes/prawica_net.recipe @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://prawica.net +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class prawica_recipe(BasicNewsRecipe): + title = u'prawica.net' + __author__ = 'teepel ' + language = 'pl' + description ='Wiadomości ze strony prawica.net' + INDEX='http://prawica.net/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + feeds = [(u'all', u'http://prawica.net/all/feed')] + + + keep_only_tags =[] + #this line should show title of the article, but it doesnt work + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'print-title'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 
'content'})) + + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'field field-type-viewfield field-field-autor2'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'field field-type-viewfield field-field-publikacje-autora'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rate-widget-2 rate-widget clear-block rate-average rate-widget-fivestar rate-daa7512627f21dcf15e0af47e5279f0e rate-processed'})) + remove_tags_after =[(dict(name = 'div', attrs = {'class' : 'field-label-inline-first'}))] + + def print_version(self, url): + return url.replace('http://prawica.net/', 'http://prawica.net/print/') diff --git a/recipes/red_voltaire.recipe b/recipes/red_voltaire.recipe new file mode 100644 index 0000000000..1763125a8e --- /dev/null +++ b/recipes/red_voltaire.recipe @@ -0,0 +1,32 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class RedVoltaireRecipe(BasicNewsRecipe): + title = u'Red Voltaire' + __author__ = 'atordo' + description = u'Red de prensa no alineada, especializada en el an\u00e1lisis de las relaciones internacionales' + oldest_article = 7 + max_articles_per_feed = 30 + auto_cleanup = False + no_stylesheets = True + language = 'es' + use_embedded_content = False + remove_javascript = True + cover_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + masthead_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + + preprocess_regexps = [ + (re.compile(r'(?P<titulo>.+).+

'+match.group('titulo')+'

. (?P.+).+', re.IGNORECASE|re.DOTALL) + ,lambda match:''+match.group('fecha')+'') + ,(re.compile(r'