diff --git a/Changelog.yaml b/Changelog.yaml index ebc2e5cad1..129af0afd5 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,89 @@ # new recipes: # - title: +- version: 0.9.7 + date: 2012-11-23 + + new features: + - title: "Edit metadata dialog: Show the size of the current book cover in the edit metadata dialog." + tickets: [1079781] + + - title: "Get Books: Allow easy searching by title and author in addition to any keyword, to prevent large numbers of spurious matches." + + - title: "An option to automatically convert any added book to the current output format, found under Preferences->Adding books" + + - title: "E-book viewer: Allow viewing tables in a separate popup window by right clicking on the table and selecting 'View table'. Useful for reference books that have lots of large tables." + tickets: [1080710] + + - title: "Catalogs: Add the current library name as an available field when generating catalogs in csv/xml format." + tickets: [1078422] + + - title: "Enable colored text in the output from the command line tools on Windows" + + - title: "E-book viewer: Add an option to hide the help message when entering full screen mode" + + - title: "E-book viewer: Add an option to always start the viewer in full screen mode" + + - title: "E-book viewer: Add many more controls to the context menu, particularly useful in full screen mode" + + - title: "E-book viewer: Allow easy searching of the selected word or phrase in Google via the context menu" + + - title: "Add a new type of FileType plugin, postimport, that runs after a book has been added to the database." + + - title: "Get Books: Remove Gandalf store, add Publio store. Update the Legimi store plugin for website changes" + + bug fixes: + - title: "Conversion: Correctly handle values of left and right for the deprecated align attribute of images, mapping them to the CSS float property instead of to text-align." 
+ tickets: [1081094] + + - title: "MOBI Output: When generating joint MOBI6/KF8 files do not set incorrect display CSS values for tables in the KF8 part" + + - title: "Connect to iTunes: Ignore AAC audio files." + tickets: [1081096] + + - title: "E-book viewer: Fix restoring from fullscreen not respecting maximized window state" + + - title: "Fix rows in the device books view sometimes being too high" + + - title: "Catalogs: Fixed a problem occurring when merging comments with a custom field whose type is a list." + + - title: "Linux binary: Use exec in the wrapper shell scripts that are used to set env vars and launch calibre utilities." + tickets: [1077884] + + - title: "E-book viewer: Fix blank pages after every page when viewing some comic files in paged mode" + + - title: "E-book viewer: When printing, respect the specified page range." + tickets: [1074220] + + - title: "Font subsetting: Parse the GSUB table for glyph substitution rules and do not remove any glyphs that could act as substitutes. Keep zero length glyphs like the glyphs for non printable characters when subsetting TrueType outlines." + + - title: "Smarten punctuation: Fix self closing script tags causing smarten punctuation to fail" + + + improved recipes: + - Arguments and facts + - Business Standard + - The New Yorker + + new recipes: + - title: Various Czech and Hungarian news sources + author: bubak + + - title: Various Polish recipes + author: Artur Stachecki + + - title: Buchreport + author: a.peter + + - title: Red Voltaire + author: atordo + + - title: Autosport + author: Mr Stefan + + - title: House News + author: Eddie Lau + - version: 0.9.6 date: 2012-11-10 diff --git a/manual/faq.rst b/manual/faq.rst index 739971c95c..109aff440d 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -649,20 +649,24 @@ If it still wont launch, start a command prompt (press the windows key and R; th Post any output you see in a help message on the `Forum `_. -|app| freezes when I click on anything? 
+|app| freezes/crashes occasionally? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are three possible things I know of, that can cause this: - * You recently connected an external monitor or TV to your computer. In this case, whenever |app| opens a new window like the edit metadata window or the conversion dialog, it appears on the second monitor where you dont notice it and so you think |app| has frozen. Disconnect your second monitor and restart calibre. + * You recently connected an external monitor or TV to your computer. In + this case, whenever |app| opens a new window like the edit metadata + window or the conversion dialog, it appears on the second monitor where + you don't notice it and so you think |app| has frozen. Disconnect your + second monitor and restart calibre. - * You are using a Wacom branded mouse. There is an incompatibility between Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom mouse. + * You are using a Wacom branded mouse. There is an incompatibility between + Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom + mouse. * If you use RoboForm, it is known to cause |app| to crash. Add |app| to - the blacklist of programs inside RoboForm to fix this. - - * Sometimes if some software has installed lots of new files in your fonts folder, |app| can crash until it finishes indexing them. Just start |app|, then leave it alone for about 20 minutes, without clicking on anything. After that you should be able to use |app| as normal. - + the blacklist of programs inside RoboForm to fix this, or uninstall + RoboForm. |app| is not starting on OS X? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/recipes/aif_ru.recipe b/recipes/aif_ru.recipe index b5d6015d0c..4e018203da 100644 --- a/recipes/aif_ru.recipe +++ b/recipes/aif_ru.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010, Darko Miletic ' +__copyright__ = '2010 - 2012, Darko Miletic ' ''' www.aif.ru ''' @@ -19,12 +19,19 @@ class AIF_ru(BasicNewsRecipe): encoding = 'cp1251' language = 'ru' publication_type = 'magazine' - extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif} ' - keep_only_tags = [dict(name='div',attrs={'id':'inner'})] + masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png' + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif} + img{display: block} + """ + keep_only_tags = [ + dict(name='div',attrs={'class':['content-header', 'zoom']}) + ,dict(name='div',attrs={'id':'article-text'}) + ] remove_tags = [ - dict(name=['iframe','object','link','base','input','img']) - ,dict(name='div',attrs={'class':'photo'}) - ,dict(name='p',attrs={'class':'resizefont'}) + dict(name=['iframe','object','link','base','input','meta']) + ,dict(name='div',attrs={'class':'in-topic'}) ] feeds = [(u'News', u'http://www.aif.ru/rss/all.php')] diff --git a/recipes/aktualne.cz.recipe b/recipes/aktualne.cz.recipe new file mode 100644 index 0000000000..cd2dcc5f09 --- /dev/null +++ b/recipes/aktualne.cz.recipe @@ -0,0 +1,69 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class aktualneRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'aktualne.cz' + publisher = u'Centrum holdings' + description = 'aktuálně.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', 
u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'), + (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'), + (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'), + (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'), + (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'), + (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php') + ] + + + language = 'cs' + cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']}) + filter_regexps = [r'img.aktualne.centrum.cz'] + remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}), + dict(name='div', attrs={'class':['box1', 'svazane-tagy']}), + dict(name='div', attrs={'class':'itemcomment id0'}), + dict(name='div', attrs={'class':'hlavicka'}), + dict(name='div', attrs={'class':'hlavni-menu'}), + dict(name='div', attrs={'class':'top-standard-brand-obal'}), + dict(name='div', attrs={'class':'breadcrumb'}), + dict(name='div', attrs={'id':'start-standard'}), + dict(name='div', attrs={'id':'forum'}), + dict(name='span', attrs={'class':'akce'}), + dict(name='span', attrs={'class':'odrazka vetsi'}), + dict(name='div', attrs={'class':'boxP'}), + dict(name='div', attrs={'class':'box2'})] + preprocess_regexps = [ + (re.compile(r'
'), + (re.compile(r'
')] + + keep_only_tags = [] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + def encoding(self, source): + if source.newurl.find('blog.aktualne') >= 0: + enc = 'utf-8' + else: + enc = 'iso-8859-2' + self.log.debug('Called encoding ' + enc + " " + str(source.newurl)) + return source.decode(enc, 'replace') + diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe index 0b8c5af9f4..c2576191dd 100644 --- a/recipes/antyweb.recipe +++ b/recipes/antyweb.recipe @@ -1,4 +1,3 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe index d65b0c17ed..8a68d844b3 100644 --- a/recipes/bankier_pl.recipe +++ b/recipes/bankier_pl.recipe @@ -8,7 +8,6 @@ bankier.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class bankier(BasicNewsRecipe): title = u'Bankier.pl' @@ -33,19 +32,19 @@ class bankier(BasicNewsRecipe): remove_tags.append(dict(name = 'img', attrs = {'src' : '/gfx/hd-mid-02.gif'})) #remove_tags.append(dict(name = 'a', attrs = {'target' : '_blank'})) #remove_tags.append(dict(name = 'br', attrs = {'clear' : 'all'})) - + feeds = [ - (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), - (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), - (u'Firma', u'http://feeds.feedburner.com/bankier-firma'), - (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), - (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'), - (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), - ] + (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), + (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), + 
(u'Firma', u'http://feeds.feedburner.com/bankier-firma'), + (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), + (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'), + (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), + ] def print_version(self, url): segment = url.split('.') urlPart = segment[2] segments = urlPart.split('-') urlPart2 = segments[-1] return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + urlPart2 - \ No newline at end of file + diff --git a/recipes/blesk.recipe b/recipes/blesk.recipe new file mode 100644 index 0000000000..7eff4c42d0 --- /dev/null +++ b/recipes/blesk.recipe @@ -0,0 +1,55 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class bleskRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Blesk' + publisher = u'' + description = 'blesk.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Zprávy', u'http://www.blesk.cz/rss/7'), + (u'Blesk', u'http://www.blesk.cz/rss/1'), + (u'Sex a tabu', u'http://www.blesk.cz/rss/2'), + (u'Celebrity', u'http://www.blesk.cz/rss/5'), + (u'Cestování', u'http://www.blesk.cz/rss/12') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['boxContent']}) + remove_tags_after = dict(name='div', attrs={'class':['artAuthors']}) + remove_tags = [dict(name='div', attrs={'class':['link_clanek']}), + dict(name='div', attrs={'id':['partHeader']}), + dict(name='div', attrs={'id':['top_bottom_box', 'lista_top']})] + preprocess_regexps = [(re.compile(r'
')] + + keep_only_tags = [dict(name='div', attrs={'class':'articleContent'})] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + + + diff --git a/recipes/buchreport.recipe b/recipes/buchreport.recipe new file mode 100644 index 0000000000..5ed34d1ee8 --- /dev/null +++ b/recipes/buchreport.recipe @@ -0,0 +1,45 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the Buchreport to an ebook.''' + +class Buchreport(BasicNewsRecipe) : + __author__ = 'a.peter' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + description = 'Buchreport' + version = 4 + title = u'Buchreport' + timefmt = ' [%d.%m.%Y]' + encoding = 'cp1252' + language = 'de' + + + extra_css = 'body { margin-left: 0.00em; margin-right: 0.00em; } \ + article, articledate, articledescription { text-align: left; } \ + h1 { text-align: left; font-size: 140%; font-weight: bold; } \ + h2 { text-align: left; font-size: 100%; font-weight: bold; font-style: italic; } \ + h3 { text-align: left; font-size: 100%; font-weight: regular; font-style: italic; } \ + h4, h5, h6 { text-align: left; font-size: 100%; font-weight: bold; }' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_tags_before = dict(name='h2') + remove_tags_after = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}) + ] + remove_tags = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}), + dict(name='iframe'), + dict(name='img') + ] + + feeds = [ + (u'Buchreport', u'http://www.buchreport.de/index.php?id=5&type=100') + ] + + def get_masthead_url(self): + return 
'http://www.buchreport.de/fileadmin/template/img/buchreport_logo.jpg' diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index badca48733..a61c32aa42 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' +__copyright__ = '2009-2012, Darko Miletic ' ''' www.business-standard.com ''' @@ -14,10 +14,12 @@ class BusinessStandard(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + auto_cleanup = False encoding = 'cp1252' publisher = 'Business Standard Limited' category = 'news, business, money, india, world' language = 'en_IN' + masthead_url = 'http://feeds.business-standard.com/images/logo_08.jpg' conversion_options = { 'comments' : description @@ -26,7 +28,7 @@ class BusinessStandard(BasicNewsRecipe): ,'publisher' : publisher ,'linearize_tables': True } - keep_only_tags=[dict(attrs={'class':'TableClas'})] + #keep_only_tags=[dict(name='td', attrs={'class':'TableClas'})] remove_tags = [ dict(name=['object','link','script','iframe','base','meta']) ,dict(attrs={'class':'rightDiv2'}) @@ -45,3 +47,8 @@ class BusinessStandard(BasicNewsRecipe): ,(u'Management & Mktg' , u'http://feeds.business-standard.com/rss/7_0.xml' ) ,(u'Opinion' , u'http://feeds.business-standard.com/rss/5_0.xml' ) ] + + def print_version(self, url): + l, s, tp = url.rpartition('/') + t, k, autono = l.rpartition('/') + return 'http://www.business-standard.com/india/printpage.php?autono=' + autono + '&tp=' + tp diff --git a/recipes/ceska_pozice.recipe b/recipes/ceska_pozice.recipe new file mode 100644 index 0000000000..478f6823b9 --- /dev/null +++ b/recipes/ceska_pozice.recipe @@ -0,0 +1,68 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskaPoziceRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = 
u'Česká pozice' + description = 'Česká pozice' + oldest_article = 2 + max_articles_per_feed = 20 + + feeds = [ + (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'), + (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'), + (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'), + (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed') + ] + + + language = 'cs' + cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png' + remove_javascript = True + no_stylesheets = True + domain = u'http://www.ceskapozice.cz' + use_embedded_content = False + + + remove_tags = [dict(name='div', attrs={'class':['block-ad', 'region region-content-ad']}), + dict(name='ul', attrs={'class':'links'}), + dict(name='div', attrs={'id':['comments', 'back-to-top']}), + dict(name='div', attrs={'class':['next-page', 'region region-content-ad']}), + dict(name='cite')] + + keep_only_tags = [dict(name='div', attrs={'id':'content'})] + + visited_urls = {} + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return url + + def preprocess_html(self, soup): + self.append_page(soup, soup.body, 3) + return soup + + def append_page(self, soup, appendtag, position): + pager = soup.find('div', attrs={'class':'paging-bottom'}) + if pager: + nextbutton = pager.find('li', attrs={'class':'pager-next'}) + if nextbutton: + nexturl = self.domain + nextbutton.a['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class':'main-body'}) + for it in texttag.findAll('div', attrs={'class':'region region-content-ad'}): + it.extract() + for it in texttag.findAll('cite'): + it.extract() + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) + pager.extract() + diff 
--git a/recipes/ceske_noviny.recipe b/recipes/ceske_noviny.recipe new file mode 100644 index 0000000000..10dd16689d --- /dev/null +++ b/recipes/ceske_noviny.recipe @@ -0,0 +1,30 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskenovinyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'České Noviny' + description = 'ceskenoviny.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://www.ceskenoviny.cz/sluzby/rss/domov.php') + #,(u'Hlavní události', u'http://www.ceskenoviny.cz/sluzby/rss/index.php') + #,(u'Přehled zpráv', u'http://www.ceskenoviny.cz/sluzby/rss/zpravy.php') + #,(u'Ze světa', u'http://www.ceskenoviny.cz/sluzby/rss/svet.php') + #,(u'Kultura', u'http://www.ceskenoviny.cz/sluzby/rss/kultura.php') + #,(u'IT', u'http://www.ceskenoviny.cz/sluzby/rss/pocitace.php') + ] + + + language = 'cs' + cover_url = 'http://i4.cn.cz/grafika/cn_logo-print.gif' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + filter_regexps = [r'img.aktualne.centrum.cz'] + + keep_only_tags = [dict(name='div', attrs={'id':'clnk'})] diff --git a/recipes/cesky_rozhlas_6.recipe b/recipes/cesky_rozhlas_6.recipe new file mode 100644 index 0000000000..eca32af02c --- /dev/null +++ b/recipes/cesky_rozhlas_6.recipe @@ -0,0 +1,26 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class cro6Recipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Český rozhlas 6' + description = 'Český rozhlas 6' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Český rozhlas 6', u'http://www.rozhlas.cz/export/cro6/') + ] + + + language = 'cs' + cover_url = 'http://www.rozhlas.cz/img/e5/logo/cro6.png' + remove_javascript = True + no_stylesheets = True + + remove_attributes = [] + remove_tags = 
[dict(name='div', attrs={'class':['audio-play-all', 'poradHeaders', 'actions']}), + dict(name='p', attrs={'class':['para-last']})] + + keep_only_tags = [dict(name='div', attrs={'id':'article'})] diff --git a/recipes/demagog.cz.recipe b/recipes/demagog.cz.recipe new file mode 100644 index 0000000000..7d89af41bd --- /dev/null +++ b/recipes/demagog.cz.recipe @@ -0,0 +1,39 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class demagogRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Demagog.cz' + publisher = u'' + description = 'demagog.cz' + oldest_article = 6 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://demagog.cz/rss') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://demagog.cz/content/images/demagog.cz.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + .vyrok_suhrn{margin-top:50px; } + .vyrok{margin-bottom:30px; } + """ + + remove_tags = [dict(name='a', attrs={'class':'vyrok_odovodnenie_tgl'}), + dict(name='img', attrs={'class':'vyrok_fotografia'})] + remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='div', attrs={'class':'vyrok_text_after'}) + preprocess_regexps = [(re.compile(r'(
)', re.DOTALL|re.IGNORECASE), lambda match: '\1
')] + + + + diff --git a/recipes/denik.cz.recipe b/recipes/denik.cz.recipe new file mode 100644 index 0000000000..2ccf8caa40 --- /dev/null +++ b/recipes/denik.cz.recipe @@ -0,0 +1,36 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ceskyDenikRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'denik.cz' + publisher = u'' + description = u'Český deník' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Z domova', u'http://www.denik.cz/rss/z_domova.html') + ,(u'Pražský deník - Moje Praha', u'http://prazsky.denik.cz/rss/zpravy_region.html') + #,(u'Zahraničí', u'http://www.denik.cz/rss/ze_sveta.html') + #,(u'Kultura', u'http://www.denik.cz/rss/kultura.html') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.denik.cz/images/loga/denik.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_tags = [] + keep_only_tags = [dict(name='div', attrs={'class':'content'})] + #remove_tags_before = dict(name='h1') + remove_tags_after = dict(name='p', attrs={'class':'clanek-autor'}) + + diff --git a/recipes/denik_referendum.recipe b/recipes/denik_referendum.recipe new file mode 100644 index 0000000000..e04871d067 --- /dev/null +++ b/recipes/denik_referendum.recipe @@ -0,0 +1,28 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class denikReferendumRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Den\u00edk Referendum' + publisher = u'' + description = '' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Deník Referendum', u'http://feeds.feedburner.com/DenikReferendum') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + remove_javascript = True + no_stylesheets = True + use_embedded_content = 
False + remove_attributes = [] + remove_tags_after = dict(name='div', attrs={'class':['text']}) + remove_tags = [dict(name='div', attrs={'class':['box boxLine', 'box noprint', 'box']}), + dict(name='h3', attrs={'class':'head alt'})] + + keep_only_tags = [dict(name='div', attrs={'id':['content']})] diff --git a/recipes/editoriali.recipe b/recipes/editoriali.recipe index 1b0c558df4..c5596bd743 100644 --- a/recipes/editoriali.recipe +++ b/recipes/editoriali.recipe @@ -7,6 +7,7 @@ class AdvancedUserRecipe1332847053(BasicNewsRecipe): title = u'Editoriali' __author__ = 'faber1971' description = 'Leading articles on Italy by the best Italian editorials' + language = 'it' oldest_article = 1 max_articles_per_feed = 100 diff --git a/recipes/foreign_policy.recipe b/recipes/foreign_policy.recipe index 893d055a05..4ddecf842f 100644 --- a/recipes/foreign_policy.recipe +++ b/recipes/foreign_policy.recipe @@ -8,6 +8,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1349086293(BasicNewsRecipe): title = u'Foreign Policy' + language = 'en' __author__ = 'Darko Miletic' description = 'International News' publisher = 'Washingtonpost.Newsweek Interactive, LLC' diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe index 0f35e536f6..59188a5d6a 100644 --- a/recipes/gazeta_pl_krakow.recipe +++ b/recipes/gazeta_pl_krakow.recipe @@ -8,7 +8,6 @@ krakow.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class gw_krakow(BasicNewsRecipe): title = u'Gazeta.pl Kraków' @@ -46,7 +45,7 @@ class gw_krakow(BasicNewsRecipe): remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] - + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe index 7a43931db4..2d95bcc06f 100644 --- 
a/recipes/gazeta_pl_warszawa.recipe +++ b/recipes/gazeta_pl_warszawa.recipe @@ -8,7 +8,6 @@ warszawa.gazeta.pl ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class gw_wawa(BasicNewsRecipe): title = u'Gazeta.pl Warszawa' @@ -43,7 +42,7 @@ class gw_wawa(BasicNewsRecipe): remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) - + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] def skip_ad_pages(self, soup): diff --git a/recipes/house_news.recipe b/recipes/house_news.recipe new file mode 100644 index 0000000000..7d8c3275d2 --- /dev/null +++ b/recipes/house_news.recipe @@ -0,0 +1,30 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Eddie Lau' +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipeHouseNews(BasicNewsRecipe): + title = u'House News \u4e3b\u5834\u65b0\u805e' + __author__ = 'Eddie Lau' + publisher = 'House News' + oldest_article = 1 + max_articles_per_feed = 100 + auto_cleanup = False + language = 'zh' + encoding = 'utf-8' + description = 'http://thehousenews.com' + category = 'Chinese, Blogs, Opinion, News, Hong Kong' + masthead_url = 'http://thehousenews.com/static/images/housebeta.jpg' + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} p[class=date] {font-size:50%;} div[class=author] {font-size:75%;} p[class=caption] {font-size:50%;}' + feeds = [(u'Latest', u'http://thehousenews.com/rss/')] + keep_only_tags = [dict(name='h1'), + dict(name='div', attrs={'class':['photo']}), + dict(name='p', attrs={'class':'caption'}), + dict(name='div', attrs={'class':'articleTextWrap'}), + dict(name='div', attrs={'class':['author']}), + dict(name='p', attrs={'class':'date'})] + + def populate_article_metadata(self, article, soup, 
first): + if first and hasattr(self, 'add_toc_thumbnail'): + picdiv = soup.find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,picdiv['src']) diff --git a/recipes/icons/business_standard.png b/recipes/icons/business_standard.png index 1edff420c0..f4c04e566a 100644 Binary files a/recipes/icons/business_standard.png and b/recipes/icons/business_standard.png differ diff --git a/recipes/ihned.cz.recipe b/recipes/ihned.cz.recipe new file mode 100644 index 0000000000..a35be06dd1 --- /dev/null +++ b/recipes/ihned.cz.recipe @@ -0,0 +1,36 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class ihnedRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'iHNed.cz' + publisher = u'' + description = 'ihned.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Zprávy', u'http://zpravy.ihned.cz/?m=rss'), + (u'Hospodářské noviny', u'http://hn.ihned.cz/?p=500000_rss'), + (u'Byznys', u'http://byznys.ihned.cz/?m=rss'), + (u'Life', u'http://life.ihned.cz/?m=rss'), + (u'Dialog', u'http://dialog.ihned.cz/?m=rss') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://rss.ihned.cz/img/0/0_hp09/ihned.cz.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'id':['heading']}) + remove_tags_after = dict(name='div', attrs={'id':['next-authors']}) + remove_tags = [dict(name='ul', attrs={'id':['comm']}), + dict(name='div', attrs={'id':['r-big']}), + dict(name='div', attrs={'class':['tools tools-top']})] diff --git a/recipes/insider.recipe b/recipes/insider.recipe new file mode 100644 index 0000000000..faaf00a14a --- /dev/null +++ b/recipes/insider.recipe @@ -0,0 +1,59 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +import re + +from 
calibre.web.feeds.news import BasicNewsRecipe + +class insider(BasicNewsRecipe): + __author__ = 'bubak' + title = 'Insider' + language = 'cs' + + remove_tags = [dict(name='div', attrs={'class':'article-related-content'}) + ,dict(name='div', attrs={'class':'calendar'}) + ,dict(name='span', attrs={'id':'labelHolder'}) + ] + + no_stylesheets = True + keep_only_tags = [dict(name='div', attrs={'class':['doubleBlock textContentFormat']})] + + preprocess_regexps = [(re.compile(r'T.mata:.*', re.DOTALL|re.IGNORECASE), lambda m: '')] + needs_subscription = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.open('http://www.denikinsider.cz/') + br.select_form(nr=0) + br['login-name'] = self.username + br['login-password'] = self.password + res = br.submit() + raw = res.read() + if u'Odhlásit se' not in raw: + raise ValueError('Failed to login to insider.cz' + 'Check your username and password.') + return br + + def parse_index(self): + articles = [] + + soup = self.index_to_soup('http://www.denikinsider.cz') + titles = soup.findAll('span', attrs={'class':'homepageArticleTitle'}) + if titles is None: + raise ValueError('Could not find category content') + + articles = [] + seen_titles = set([]) + for title in titles: + if title.string in seen_titles: + continue + article = title.parent + seen_titles.add(title.string) + url = article['href'] + if url.startswith('/'): + url = 'http://www.denikinsider.cz/'+url + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title.string, 'url':url, 'description':'', + 'date':''}) + return [(self.title, articles)] + + diff --git a/recipes/kudy_z_nudy.recipe b/recipes/kudy_z_nudy.recipe new file mode 100644 index 0000000000..d7c0d9ecf9 --- /dev/null +++ b/recipes/kudy_z_nudy.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class kudyznudyRecipe(BasicNewsRecipe): + __author__ 
= 'bubak' + title = u'Kudy z nudy' + publisher = u'' + description = 'kudyznudy.cz' + oldest_article = 3 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Praha nejnovější', u'http://www.kudyznudy.cz/RSS/Charts.aspx?Type=Newest&Lang=cs-CZ&RegionId=1') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.kudyznudy.cz/App_Themes/KzN/Images/Containers/Header/HeaderLogoKZN.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'class':['C_WholeContentPadding']}) + remove_tags_after = dict(name='div', attrs={'class':['SurroundingsContainer']}) + remove_tags = [dict(name='div', attrs={'class':['Details', 'buttons', 'SurroundingsContainer', 'breadcrumb']})] + + keep_only_tags = [] diff --git a/recipes/lidovky.recipe b/recipes/lidovky.recipe new file mode 100644 index 0000000000..8e4754829b --- /dev/null +++ b/recipes/lidovky.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class lnRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'lidovky' + publisher = u'' + description = 'lidovky.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Události', u'http://www.lidovky.cz/export/rss.asp?r=ln_domov'), + (u'Svět', u'http://www.lidovky.cz/export/rss.asp?r=ln_zahranici'), + (u'Byznys', u'http://www.lidovky.cz/export/rss.asp?c=ln_byznys'), + (u'Věda', u'http://www.lidovky.cz/export/rss.asp?r=ln_veda'), + (u'Názory', u'http://www.lidovky.cz/export/rss.asp?r=ln_nazory'), + (u'Relax', u'http://www.lidovky.cz/export/rss.asp?c=ln_relax') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.lidovky.cz/o/lidovky_ln3b/lidovky-logo.png' + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_attributes = [] + 
remove_tags_before = dict(name='div', attrs={'id':['content']}) + remove_tags_after = dict(name='div', attrs={'class':['authors']}) + preprocess_regexps = [(re.compile(r'
')] + + keep_only_tags = [] + + + + + diff --git a/recipes/metropol_tv.recipe b/recipes/metropol_tv.recipe new file mode 100644 index 0000000000..56f393c96a --- /dev/null +++ b/recipes/metropol_tv.recipe @@ -0,0 +1,29 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class metropolRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Metropol TV' + publisher = u'' + description = 'metropol.cz' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + + feeds = [ + (u'Metropolcv.cz', u'http://www.metropol.cz/rss/') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.metropol.cz/public/css/../images/logo/metropoltv.png' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + + keep_only_tags = [dict(name='div', attrs={'id':['art-full']})] diff --git a/recipes/myapple_pl.recipe b/recipes/myapple_pl.recipe index eee333012c..df5708a325 100644 --- a/recipes/myapple_pl.recipe +++ b/recipes/myapple_pl.recipe @@ -1,4 +1,3 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -36,15 +35,15 @@ class MyAppleRecipe(BasicNewsRecipe): extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} td.contentheading{font-size: large; font-weight: bold;} - ''' + ''' feeds = [ ('News', 'feed://myapple.pl/external.php?do=rss&type=newcontent§ionid=1&days=120&count=10'), ] - + def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: tstr = alink.string alink.replaceWith(tstr) - return soup \ No newline at end of file + return soup diff --git a/recipes/nadacni_fond_proti_korupci.recipe b/recipes/nadacni_fond_proti_korupci.recipe new file mode 100644 index 0000000000..2a8a69283c --- /dev/null +++ b/recipes/nadacni_fond_proti_korupci.recipe @@ -0,0 +1,30 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai 
+from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class nfpkRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Nadační fond proti korupci' + publisher = u'' + description = 'nfpk.cz' + oldest_article = 7 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://feeds.feedburner.com/nfpk') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.nfpk.cz/_templates/nfpk/_images/logo.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + keep_only_tags = [dict(name='div', attrs={'id':'content'})] + diff --git a/recipes/nepszabadsag.recipe b/recipes/nepszabadsag.recipe new file mode 100644 index 0000000000..8ae5447dd6 --- /dev/null +++ b/recipes/nepszabadsag.recipe @@ -0,0 +1,56 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +''' +Fetch Népszabadság +''' +from calibre.web.feeds.news import BasicNewsRecipe + +class nepszabadsag(BasicNewsRecipe): + title = u'N\u00e9pszabads\u00e1g' + description = '' + __author__ = 'bubak' + use_embedded_content = False + timefmt = ' [%d %b %Y]' + oldest_article = 2 + max_articles_per_feed = 20 + no_stylesheets = True + language = 'hu' + #delay = 1 + #timeout = 10 + simultaneous_downloads = 5 + + #encoding = 'utf-8' + remove_javascript = True + cover_url = 'http://nol.hu/_design/image/logo_nol_live.jpg' + + feeds = [ + (u'Belföld', u'http://nol.hu/feed/belfold.rss') + #,(u'Külföld', u'http://nol.hu/feed/kulfold.rss') + #,(u'Gazdaság', u'http://nol.hu/feed/gazdasag.rss') + #,(u'Kultúra', u'http://nol.hu/feed/kult.rss') + ] + + extra_css = ''' + ''' + + remove_attributes = [] + remove_tags_before = dict(name='div', attrs={'class':['d-source']}) + remove_tags_after = dict(name='div', attrs={'class':['tags']}) + remove_tags = [dict(name='div', attrs={'class':['h']}), + dict(name='tfoot')] + + 
+ keep_only_tags = [dict(name='table', attrs={'class':'article-box'})] + + # NS sends an ad page sometimes but not frequently enough, TBD + def AAskip_ad_pages(self, soup): + if ('advertisement' in soup.find('title').string.lower()): + href = soup.find('a').get('href') + self.log.debug('Skipping to: ' + href) + new = self.browser.open(href).read().decode('utf-8', 'ignore') + #ipython(locals()) + self.log.debug('Finished: ' + href) + return new + else: + return None + diff --git a/recipes/neviditelny_pes.recipe b/recipes/neviditelny_pes.recipe new file mode 100644 index 0000000000..65cfb2b7ec --- /dev/null +++ b/recipes/neviditelny_pes.recipe @@ -0,0 +1,32 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class pesRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Neviditelný pes' + publisher = u'' + description = u'Neviditelný pes' + oldest_article = 1 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Neviditelný pes', u'http://neviditelnypes.lidovky.cz/export/rss.asp?c=pes_neviditelny') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://g.zpravy.cz/o/pes/logo_pes.jpg' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_tags = [] + remove_tags_before = dict(name='div', attrs={'id':'art-full'}) + remove_tags_after = dict(name='div', attrs={'id':'authors'}) + + diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 9eeb8b31ee..c5f1b0aff2 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -22,9 +22,9 @@ class NewYorker(BasicNewsRecipe): masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif' extra_css = """ body {font-family: "Times New Roman",Times,serif} - .articleauthor{color: #9F9F9F; + .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; - font-size: small; + font-size: small; 
text-transform: uppercase} .rubric,.dd,h6#credit{color: #CD0021; font-family: Arial, sans-serif; @@ -63,11 +63,11 @@ class NewYorker(BasicNewsRecipe): return url.strip() def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('http://www.newyorker.com/magazine/toc/') - cover_item = soup.find('img',attrs={'id':'inThisIssuePhoto'}) + cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg" + soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine') + cover_item = soup.find('div',attrs={'id':'media-count-1'}) if cover_item: - cover_url = 'http://www.newyorker.com' + cover_item['src'].strip() + cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip() return cover_url def preprocess_html(self, soup): diff --git a/recipes/novinky.cz.recipe b/recipes/novinky.cz.recipe new file mode 100644 index 0000000000..19fd52a371 --- /dev/null +++ b/recipes/novinky.cz.recipe @@ -0,0 +1,50 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class novinkyRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'novinky.cz' + publisher = u'seznam.cz' + description = 'novinky.cz' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Domácí', u'http://www.novinky.cz/rss2/domaci/'), + (u'Praha', u'http://www.novinky.cz/rss2/vase-zpravy/praha/'), + (u'Ekonomika', u'http://www.novinky.cz/rss2/ekonomika/'), + (u'Finance', u'http://www.novinky.cz/rss2/finance/'), + ] + + + #encoding = 'utf-8' + language = 'cs' + cover_url = 'http://www.novinky.cz/static/images/logo.gif' + remove_javascript = True + no_stylesheets = True + + remove_tags = [dict(name='div', attrs={'id':['pictureInnerBox']}), + dict(name='div', attrs={'id':['discussionEntry']}), + dict(name='span', attrs={'id':['mynews-hits', 'mynews-author']}), + dict(name='div', attrs={'class':['related']}), + dict(name='div', 
attrs={'id':['multimediaInfo']})] + remove_tags_before = dict(name='div',attrs={'class':['articleHeader']}) + remove_tags_after = dict(name='div',attrs={'class':'related'}) + + keep_only_tags = [] + + # This source has identical articles under different links + # which are redirected to the common url. I've found + # just this API method that has the real URL + visited_urls = {} + def encoding(self, source): + url = source.newurl + if url in self.visited_urls: + self.log.debug('Ignoring duplicate: ' + url) + return None + else: + self.visited_urls[url] = True + self.log.debug('Accepting: ' + url) + return source.decode('utf-8', 'replace') + diff --git a/recipes/parlamentni_listy.recipe b/recipes/parlamentni_listy.recipe new file mode 100644 index 0000000000..71d904866a --- /dev/null +++ b/recipes/parlamentni_listy.recipe @@ -0,0 +1,38 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class plRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Parlamentn\u00ed Listy' + publisher = u'' + description = '' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.parlamentnilisty.cz/design/listy-logo2.png' + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_attributes = [] + remove_tags = [dict(name='div', attrs={'class':['articledetailboxin','crumbs', 'relatedarticles articledetailbox']}), + dict(name='div', attrs={'class':['socialshare-1 noprint', 'socialshare-2 noprint']}), + dict(name='div', attrs={'id':'widget'}), + dict(name='div', attrs={'class':'article-discussion-box noprint'})] + preprocess_regexps = [(re.compile(r'<(span|strong)[^>]*>\s*Ptejte se politik.*', re.DOTALL|re.IGNORECASE), lambda match: '')] + + keep_only_tags = 
[dict(name='div', attrs={'class':['article-detail']})] + + + + + + diff --git a/recipes/piratska_strana.recipe b/recipes/piratska_strana.recipe new file mode 100644 index 0000000000..c125eb8aad --- /dev/null +++ b/recipes/piratska_strana.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class cpsRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Piratská strana' + publisher = u'' + description = '' + oldest_article = 3 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Články', u'http://www.pirati.cz/rss.xml') + ] + + + #encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.pirati.cz/sites/all/themes/addari-cps/images/headbg.jpg' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + keep_only_tags = [dict(name='div', attrs={'id':'postarea'})] + remove_tags = [dict(name='div', attrs={'class':['breadcrumb', 'submitted', 'links-readmore']}), + dict(name='div', attrs={'id':['comments']})] + remove_tags_before = dict(name='font', attrs={'size':'+3'}) + remove_tags_after = [dict(name='iframe')] + + conversion_options = {'linearize_tables' : True} + + + + + diff --git a/recipes/piratske_noviny.recipe b/recipes/piratske_noviny.recipe new file mode 100644 index 0000000000..a2d30374ed --- /dev/null +++ b/recipes/piratske_noviny.recipe @@ -0,0 +1,34 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals +from calibre.web.feeds.recipes import BasicNewsRecipe + +class nfpkRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Piratské noviny' + publisher = u'' + description = 'nfpk.cz' + oldest_article = 2 + max_articles_per_feed = 20 + use_embedded_content = False + remove_empty_feeds = True + + feeds = [ + (u'Aktuality', u'http://www.piratskenoviny.cz/run/rss.php') + ] + + + 
#encoding = 'iso-8859-2' + language = 'cs' + cover_url = 'http://www.piratskenoviny.cz/imgs/piratske-noviny.gif' + remove_javascript = True + no_stylesheets = True + extra_css = """ + """ + + remove_attributes = [] + remove_tags_before = dict(name='font', attrs={'size':'+3'}) + remove_tags_after = [dict(name='iframe')] + conversion_options = {'linearize_tables' : True} + + + diff --git a/recipes/portfolio_hu.recipe b/recipes/portfolio_hu.recipe index 5eaf0e886f..feadd124ed 100644 --- a/recipes/portfolio_hu.recipe +++ b/recipes/portfolio_hu.recipe @@ -4,7 +4,7 @@ class AdvancedUserRecipe1348063712(BasicNewsRecipe): title = u'Portfolio.hu - English Edition' __author__ = 'laca' oldest_article = 7 - language = 'en_HUN' + language = 'en_HU' masthead_url = 'http://www.portfolio.hu/img/sit/angolfejlec2010.jpg' use_embedded_content = False auto_cleanup = True diff --git a/recipes/pravo.recipe b/recipes/pravo.recipe new file mode 100644 index 0000000000..02d2c13439 --- /dev/null +++ b/recipes/pravo.recipe @@ -0,0 +1,64 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals + +from calibre.web.feeds.news import BasicNewsRecipe + +class pravo(BasicNewsRecipe): + __author__ = 'bubak' + title = 'Právo' + language = 'cs' + + remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'}) + remove_tags_after = dict(name='td', attrs={'class':'rubrika'}) + remove_tags = [dict(name='td', attrs={'width':'273'}) + ,dict(name='td', attrs={'class':'rubrika'}) + ,dict(name='div', attrs={'class':'rubrika-ostat'}) + ] + extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}' + cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif' + cover_margins = (0, 100, '#ffffff') + conversion_options = {'linearize_tables' : True} + + no_stylesheets = True + + # our variables + seen_titles = set([]) + # only yesterday's articles are online + parent_url = 'http://pravo.novinky.cz/minule/' + feeds = [ + ('Hlavní 
stránka', 'http://pravo.novinky.cz/minule/index.php'), + ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'), + ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'), + ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php') + ] + + + def parse_index(self): + articles = [] + + for feed in self.feeds: + articles.append(self.parse_page(feed)) + return articles + + def parse_page(self, (feed_title, url)): + articles = [] + + soup = self.index_to_soup(url) + titles = soup.findAll('a', attrs={'class':'nadpis'}) + if titles is None: + raise ValueError('Could not find any articles on page ' + url) + + articles = [] + for article in titles: + title = article.string + if title in self.seen_titles: + continue + self.seen_titles.add(title) + url = article['href'] + if not url.startswith('http'): + url = self.parent_url + url + self.log('\tFound article:', title, 'at', url) + articles.append({'title':title.string, 'url':url, 'description':'', + 'date':''}) + return (feed_title, articles) + diff --git a/recipes/prawica_net.recipe b/recipes/prawica_net.recipe index 96fa605af9..edd12695c5 100644 --- a/recipes/prawica_net.recipe +++ b/recipes/prawica_net.recipe @@ -8,7 +8,6 @@ http://prawica.net ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class prawica_recipe(BasicNewsRecipe): title = u'prawica.net' @@ -38,4 +37,4 @@ class prawica_recipe(BasicNewsRecipe): remove_tags_after =[(dict(name = 'div', attrs = {'class' : 'field-label-inline-first'}))] def print_version(self, url): - return url.replace('http://prawica.net/', 'http://prawica.net/print/') \ No newline at end of file + return url.replace('http://prawica.net/', 'http://prawica.net/print/') diff --git a/recipes/respekt.recipe b/recipes/respekt.recipe new file mode 100644 index 0000000000..91aa2edb40 --- /dev/null +++ b/recipes/respekt.recipe @@ -0,0 +1,37 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals 
+from calibre.web.feeds.recipes import BasicNewsRecipe +import re + +class respektRecipe(BasicNewsRecipe): + __author__ = 'bubak' + title = u'Respekt' + publisher = u'Respekt' + description = 'Respekt' + oldest_article = 1 + max_articles_per_feed = 20 + + feeds = [ + (u'Všechny články', u'http://respekt.ihned.cz/index.php?p=R00000_rss') + ,(u'Blogy', u'http://blog.respekt.ihned.cz/?p=Rb00VR_rss') + #,(u'Respekt DJ', u'http://respekt.ihned.cz/index.php?p=R00RDJ_rss') + ] + + + encoding = 'cp1250' + language = 'cs' + cover_url = 'http://respekt.ihned.cz/img/R/respekt_logo.png' + remove_javascript = True + no_stylesheets = True + + remove_tags = [dict(name='div', attrs={'class':['d-tools', 'actions']})] + remove_tags_before = dict(name='div',attrs={'id':['detail']}) + remove_tags_after = dict(name='div',attrs={'class':'d-tools'}) + preprocess_regexps = [(re.compile(r'