diff --git a/.bzrignore b/.bzrignore index 99d307fddc..b0b87a34e6 100644 --- a/.bzrignore +++ b/.bzrignore @@ -35,4 +35,7 @@ nbproject/ .settings/ *.DS_Store calibre_plugins/ -./src/calibre/gui2/catalog/catalog_csv_xml.ui.autosave +recipes/.git +recipes/.gitignore +recipes/README +recipes/katalog_egazeciarz.recipe diff --git a/COPYRIGHT b/COPYRIGHT index 1a2c305ad2..85d70a8aa8 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -47,12 +47,6 @@ License: Apache 2.0 The full text of the Apache 2.0 license is available at: http://www.apache.org/licenses/LICENSE-2.0 -Files: src/sfntly/* -Copyright: Google Inc. -License: Apache 2.0 - The full text of the Apache 2.0 license is available at: - http://www.apache.org/licenses/LICENSE-2.0 - Files: resources/viewer/mathjax/* Copyright: Unknown License: Apache 2.0 diff --git a/Changelog.yaml b/Changelog.yaml index f4c5e25cb4..ebc2e5cad1 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,61 @@ # new recipes: # - title: +- version: 0.9.6 + date: 2012-11-10 + + new features: + - title: "Experimental support for subsetting fonts" + description: "Subsetting a font means reducing the font to contain only the glyphs for the text actually present in the book. This can easily halve the size of the font. calibre can now do this for all embedded fonts during a conversion. Turn it on via the 'Subset all embedded fonts' option under the Look & Feel section of the conversion dialog. calibre can subset both TrueType and OpenType fonts. Note that this code is very new and likely has bugs, so please check the output if you turn on subsetting. The conversion log will have info about the subsetting operations." + type: major + + - title: "EPUB Input: Try to workaround EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption." + + - title: "Allow using identifiers in save to disk templates." + tickets: [1074623] + + - title: "calibredb: Add an option to not notify the GUI" + + - title: "Catalogs: Fix long tags causing catalog generation to fail on windows. Add the ability to cross-reference authors, i.e. to relist the authors for a book with multiple authors separately." + tickets: [1074931] + + - title: "Edit metadata dialog: Add a clear tags button to remove all tags with a single click" + + - title: "Add search to the font family chooser dialog" + + bug fixes: + - title: "Windows: Fix a long standing bug in the device eject code that for some reason only manifested in 0.9.5." + tickets: [1075782] + + - title: "Get Books: Fix Amazon stores, Google Books store and libri.de" + + - title: "Kobo driver: More fixes for on device book matching, and list books as being on device even if the Kobo has not yet indexed them. Also some performance improvements." + tickets: [1069617] + + - title: "EPUB Output: Remove duplicate id and name attributes to eliminate pointless noise from the various epub check utilities" + + - title: "Ask for confirmation before removing plugins" + + - title: "Fix bulk convert queueing dialog becoming very long if any of the books have a very long title." + tickets: [1076191] + + - title: "Fix deleting custom column tags like data from the Tag browser not updating the last modified timestamp for affected books" + tickets: [1075476] + + - title: "When updating a previously broken plugin, do not show an error message because the previous version of the plugin cannot be loaded" + + - title: "Fix regression that broke the Template Editor" + + improved recipes: + - Various updated Polish recipes + - London Review of Books + - Yemen Times + + new recipes: + - title: "Various Polish news sources" + author: Artur Stachecki + + - version: 0.9.5 date: 2012-11-02 diff --git a/manual/faq.rst b/manual/faq.rst index 8163861863..739971c95c 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -327,9 +327,8 @@ You can browse your |app| collection on your Android device is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| - * Set the :guilabel:`Preferred Output Format` in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to Tablet (this will work for phones as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your device to EPUB format by selecting them and clicking the Convert button. + * Set the :guilabel:`Preferred Output Format` in |app| to EPUB for normal Android devices or MOBI for Kindles (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) + * Convert the books you want to read on your device to EPUB/MOBI format by selecting them and clicking the Convert button. * Turn on the Content Server in |app|'s preferences and leave |app| running. Now on your Android device, open the browser and browse to @@ -722,8 +721,8 @@ You can switch |app| to using a backed up library folder by simply clicking the If you want to backup the |app| configuration/plugins, you have to backup the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore. -How do I use purchased EPUB books with |app|? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use purchased EPUB books with |app| (or what do I do with .acsm files)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Most purchased EPUB books have `DRM `_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your ebook reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" ebook. The ebook file will be stored in the folder "My Digital Editions", from where you can add it to |app|. I am getting a "Permission Denied" error? diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index 342aa0d2db..1954fd7803 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -2,7 +2,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class FocusRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' __author__ = u'intromatyk ' language = 'pl' @@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe): publisher = u'Gruner + Jahr Polska' category = u'News' description = u'Newspaper' - category='magazine' - cover_url='' - remove_empty_feeds= True - no_stylesheets=True + category = 'magazine' + cover_url = '' + remove_empty_feeds = True + no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100000 recursions = 0 @@ -27,15 +29,15 @@ class FocusRecipe(BasicNewsRecipe): simultaneous_downloads = 5 r = re.compile('.*(?Phttp:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*') - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'})) - remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'id': 'cll'})) + + remove_tags = [] + remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'})) + remove_tags.append(dict(name='div', attrs={'class': 'txb'})) + remove_tags.append(dict(name='div', attrs={'class': 'h2'})) + remove_tags.append(dict(name='ul', attrs={'class': 'txu'})) + remove_tags.append(dict(name='div', attrs={'class': 'ulc'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe): p.lead {font-weight: bold; text-align: left;} .authordate {font-size: small; color: #696969;} .fot{font-size: x-small; color: #666666;} - ''' + ''' - - feeds = [ - ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'), - ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'), - ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'), - ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'), - ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'), - ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'), - ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'), - ] + feeds = [ + ('Nauka', 'http://www.focus.pl/nauka/rss/'), + ('Historia', 'http://www.focus.pl/historia/rss/'), + ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'), + ('Sport', 'http://www.focus.pl/sport/rss/'), + ('Technika', 'http://www.focus.pl/technika/rss/'), + ('Przyroda', 'http://www.focus.pl/przyroda/rss/'), + ('Technologie', 'http://www.focus.pl/gadzety/rss/') + ] def skip_ad_pages(self, soup): if ('advertisement' in soup.find('title').string.lower()): @@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe): return None def get_cover_url(self): - soup=self.index_to_soup('http://www.focus.pl/magazyn/') - tag=soup.find(name='div', attrs={'class':'clr fl'}) + soup = self.index_to_soup('http://www.focus.pl/magazyn/') + tag = soup.find(name='div', attrs={'class': 'clr fl'}) if tag: - self.cover_url='http://www.focus.pl/' + tag.a['href'] + self.cover_url = 'http://www.focus.pl/' + tag.a['href'] return getattr(self, 'cover_url', self.cover_url) def print_version(self, url): - if url.count ('focus.pl.feedsportal.com'): + if url.count('focus.pl.feedsportal.com'): u = url.find('focus0Bpl') u = 'http://www.focus.pl/' + url[u + 11:] u = u.replace('0C', '/') u = u.replace('A', '') - u = u.replace ('0E','-') + u = u.replace('0E', '-') u = u.replace('/nc/1//story01.htm', '/do-druku/1') - else: - u = url.replace('/nc/1','/do-druku/1') - return u \ No newline at end of file + else: + u = url.replace('/nc/1', '/do-druku/1') + return u diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 5c034b10ab..3d416e444f 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,104 +1,107 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' - __author__ = 'fenuks' - language = 'pl' - description ='news from gazeta.pl' - category='newspaper' + title = u'Gazeta Wyborcza' + __author__ = 'fenuks, Artur Stachecki' + language = 'pl' + description = 'news from gazeta.pl' + category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' - INDEX='http://wyborcza.pl' - remove_empty_feeds= True + masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' + INDEX = 'http://wyborcza.pl' + remove_empty_feeds = True oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = dict(id=['gazeta_article', 'article']) - remove_tags_after = dict(id='gazeta_article_share') - remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])] - - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), - (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), - (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), - (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), - (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), - (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), - #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), - (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), - (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), - (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), - (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), - (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), - (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), - (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), - (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss') - ] + remove_javascript = True + no_stylesheets = True + remove_tags_before = dict(id='k0') + remove_tags_after = dict(id='banP4') + remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), + (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), + (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. - Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + tag = soup.find(name='a', attrs={'class': 'btn'}) + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) return new_soup - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True + loop = False + tag = soup.find('div', attrs={'id': 'Str'}) + if appendtag.find('div', attrs={'id': 'Str'}): + nexturl = tag.findAll('a') + appendtag.find('div', attrs={'id': 'Str'}).extract() + loop = True if appendtag.find(id='source'): appendtag.find(id='source').extract() while loop: - loop=False + loop = False for link in nexturl: if u'następne' in link.string: - url= self.INDEX + link['href'] + url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True + tag = soup2.find('div', attrs={'id': 'Str'}) + nexturl = tag.findAll('a') + loop = True def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') + tag = appendtag.find(id='container_gal') if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] + nexturl = appendtag.find(id='gal_btn_next').a['href'] appendtag.find(id='gal_navi').extract() while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - nexturl=pagetext.find(id='gal_btn_next') + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(id='container_gal') + nexturl = pagetext.find(id='gal_btn_next') if nexturl: - nexturl=nexturl.a['href'] + nexturl = nexturl.a['href'] pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') + rem = appendtag.find(id='gal_navi') if rem: rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + else: + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup def print_version(self, url): - if 'http://wyborcza.biz/biznes/' not in url: - return url + if url.count('rss.feedsportal.com'): + u = url.find('wyborcza0Bpl') + u = 'http://www.wyborcza.pl/' + url[u + 11:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = u.replace('/1,', '/2029020,') + u = u.replace('/story01.htm', '') + print(u) + return u + elif 'http://wyborcza.pl/1' in url: + return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') - cover=soup.find(id='GWmini2') - soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) - self.cover_url='http://wyborcza.pl' + soup.img['src'] + cover = soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) + self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/icons/mateusz_czytania.png b/recipes/icons/mateusz_czytania.png new file mode 100644 index 0000000000..7568139433 Binary files /dev/null and b/recipes/icons/mateusz_czytania.png differ diff --git a/recipes/icons/rushisaband.png b/recipes/icons/rushisaband.png new file mode 100644 index 0000000000..9a7d4237cf Binary files /dev/null and b/recipes/icons/rushisaband.png differ diff --git a/recipes/icons/rynek_infrastruktury.png b/recipes/icons/rynek_infrastruktury.png new file mode 100644 index 0000000000..8d2e6ac27b Binary files /dev/null and b/recipes/icons/rynek_infrastruktury.png differ diff --git a/recipes/icons/rynek_kolejowy.png b/recipes/icons/rynek_kolejowy.png new file mode 100644 index 0000000000..e9dd5fc464 Binary files /dev/null and b/recipes/icons/rynek_kolejowy.png differ diff --git a/recipes/icons/satkurier.png b/recipes/icons/satkurier.png new file mode 100644 index 0000000000..6e71bb2450 Binary files /dev/null and b/recipes/icons/satkurier.png differ diff --git a/recipes/kerrang.recipe b/recipes/kerrang.recipe new file mode 100644 index 0000000000..bbd944eb62 --- /dev/null +++ b/recipes/kerrang.recipe @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class kerrang(BasicNewsRecipe): + title = u'Kerrang!' + __author__ = 'Artur Stachecki ' + language = 'en_GB' + description = u'UK-based magazine devoted to rock music published by Bauer Media Group' + oldest_article = 7 + masthead_url = 'http://images.kerrang.com/design/kerrang/kerrangsite/logo.gif' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs = {'class' : ['headz', 'blktxt']})) + + extra_css = ''' img { display: block; margin-right: auto;} + h1 {text-align: left; font-size: 22px;}''' + + feeds = [(u'News', u'http://www.kerrang.com/blog/rss.xml')] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lequipe.recipe b/recipes/lequipe.recipe new file mode 100644 index 0000000000..c6e9a26880 --- /dev/null +++ b/recipes/lequipe.recipe @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class leequipe(BasicNewsRecipe): + title = u'l\'equipe' + __author__ = 'Artur Stachecki ' + language = 'fr' + description = u'Retrouvez tout le sport en direct sur le site de L\'EQUIPE et suivez l\'actualité du football, rugby, basket, cyclisme, f1, volley, hand, tous les résultats sportifs' + oldest_article = 1 + masthead_url = 'http://static.lequipe.fr/v6/img/logo-lequipe.png' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs={'id': ['article']})) + + remove_tags = [] + remove_tags.append(dict(attrs={'id': ['partage', 'ensavoirplus', 'bloc_bas_breve', 'commentaires', 'tools']})) + remove_tags.append(dict(attrs={'class': ['partage_bis', 'date']})) + + feeds = [(u'Football', u'http://www.lequipe.fr/rss/actu_rss_Football.xml'), + (u'Auto-Moto', u'http://www.lequipe.fr/rss/actu_rss_Auto-Moto.xml'), + (u'Tennis', u'http://www.lequipe.fr/rss/actu_rss_Tennis.xml'), + (u'Golf', u'http://www.lequipe.fr/rss/actu_rss_Golf.xml'), + (u'Rugby', u'http://www.lequipe.fr/rss/actu_rss_Rugby.xml'), + (u'Basket', u'http://www.lequipe.fr/rss/actu_rss_Basket.xml'), + (u'Hand', u'http://www.lequipe.fr/rss/actu_rss_Hand.xml'), + (u'Cyclisme', u'http://www.lequipe.fr/rss/actu_rss_Cyclisme.xml'), + (u'Autres Sports', u'http://pipes.yahoo.com/pipes/pipe.run?_id=2039f7f4f350c70c5e4e8633aa1b37cd&_render=rss') + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lrb.recipe b/recipes/lrb.recipe index 4a203c80ae..6453e78724 100644 --- a/recipes/lrb.recipe +++ b/recipes/lrb.recipe @@ -40,6 +40,6 @@ class LondonReviewOfBooks(BasicNewsRecipe): soup = self.index_to_soup('http://www.lrb.co.uk/') cover_item = soup.find('p',attrs={'class':'cover'}) if cover_item: - cover_url = 'http://www.lrb.co.uk' + cover_item.a.img['src'] + cover_url = cover_item.a.img['src'] return cover_url diff --git a/recipes/mateusz_czytania.recipe b/recipes/mateusz_czytania.recipe new file mode 100644 index 0000000000..ba7c598787 --- /dev/null +++ b/recipes/mateusz_czytania.recipe @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://www.mateusz.pl/czytania +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class czytania_mateusz(BasicNewsRecipe): + title = u'Czytania na ka\u017cdy dzie\u0144' + __author__ = 'teepel ' + description = u'Codzienne czytania z jednego z najstarszych polskich serwisów katolickich.' + language = 'pl' + INDEX='http://www.mateusz.pl/czytania' + oldest_article = 1 + remove_empty_feeds= True + no_stylesheets=True + auto_cleanup = True + remove_javascript = True + simultaneous_downloads = 2 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Czytania', u'http://mateusz.pl/rss/czytania/')] + + remove_tags =[] + remove_tags.append(dict(name = 'p', attrs = {'class' : 'top'})) + + #thanks t3d + def get_article_url(self, article): + link = article.get('link') + if 'kmt.pl' not in link: + return link diff --git a/recipes/money_pl.recipe b/recipes/money_pl.recipe index 075264f8f7..475c2059ff 100644 --- a/recipes/money_pl.recipe +++ b/recipes/money_pl.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class FocusRecipe(BasicNewsRecipe): __license__ = 'GPL v3' - __author__ = u'intromatyk ' + __author__ = u'Artur Stachecki ' language = 'pl' version = 1 diff --git a/recipes/naszdziennik.recipe b/recipes/naszdziennik.recipe new file mode 100644 index 0000000000..4c7b78c199 --- /dev/null +++ b/recipes/naszdziennik.recipe @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class naszdziennik(BasicNewsRecipe): + title = u'Nasz Dziennik' + __author__ = 'Artur Stachecki ' + language = 'pl' + description =u'Nasz Dziennik - Ogólnopolska gazeta codzienna. Podejmuje tematykę dotyczącą życia społecznego, kulturalnego, politycznego i religijnego. Propaguje wartości chrześcijańskie oraz tradycję i kulturę polską.' + masthead_url='http://www.naszdziennik.pl/images/logo-male.png' + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets = True + + keep_only_tags =[dict(attrs = {'id' : 'article'})] + + #definiujemy nową funkcje; musi zwracać listę feedów wraz z artykułami + def parse_index(self): + #adres do parsowania artykułów + soup = self.index_to_soup('http://www.naszdziennik.pl/news') + #deklaracja pustej listy feedów + feeds = [] + #deklaracja pustego słownika artykułów + articles = {} + #deklaracja pustej listy sekcji + sections = [] + #deklaracja pierwszej sekcji jako pusty string + section = '' + + #pętla for, która analizuje po kolei każdy tag "news-article" + for item in soup.findAll(attrs = {'class' : 'news-article'}) : + #w tagu "news-article szukamy pierwszego taga h4" + section = item.find('h4') + #zmiennej sekcja przypisujemy zawartość tekstową taga + section = self.tag_to_string(section) + #sprawdzamy czy w słowniku artykułów istnieje klucz dotyczący sekcji + #jeśli nie istnieje to : + if not articles.has_key(section) : + #do listy sekcji dodajemy nową sekcje + sections.append(section) + #deklarujemy nową sekcje w słowniku artykułów przypisując jej klucz odpowiadający nowej sekcji, którego wartością jest pusta lista + articles[section] = [] + #przeszukujemy kolejny tag "title-datetime" + article_title_datetime = item.find(attrs = {'class' : 'title-datetime'}) + #w tagu title-datetime znajdujemy pierwszy link + article_a = article_title_datetime.find('a') + #i tworzymy z niego link absolutny do właściwego artykułu + article_url = 'http://naszdziennik.pl' + article_a['href'] + #jako tytuł użyty będzie tekst pomiędzy tagami + article_title = self.tag_to_string(article_a) + #a data będzie tekstem z pierwszego taga h4 znalezionego w tagu title-datetime + article_date = self.tag_to_string(article_title_datetime.find('h4')) + #zebrane elementy dodajemy do listy zadeklarowanej w linijce 44 + articles[section].append( { 'title' : article_title, 'url' : article_url, 'date' : article_date }) + #po dodaniu wszystkich artykułów dodajemy sekcje do listy feedów, korzystając z list sekcji znajdujących się w słowniku + for section in sections: + feeds.append((section, articles[section])) + #zwracamy listę feedów, której parsowaniem zajmie się calibre + return feeds \ No newline at end of file diff --git a/recipes/rushisaband.recipe b/recipes/rushisaband.recipe new file mode 100644 index 0000000000..f1ab83b313 --- /dev/null +++ b/recipes/rushisaband.recipe @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'MrStefan ' + +''' +www.rushisaband.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class rushisaband(BasicNewsRecipe): + title = u'Rushisaband' + __author__ = 'MrStefan ' + language = 'en_GB' + description =u'A blog devoted to the band RUSH and its members, Neil Peart, Geddy Lee and Alex Lifeson' + remove_empty_feeds= True + oldest_article = 7 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h4')) + keep_only_tags.append(dict(name = 'h5')) + keep_only_tags.append(dict(name = 'p')) + + feeds = [(u'Rush is a Band', u'http://feeds2.feedburner.com/rushisaband/blog')] diff --git a/recipes/rynek_infrastruktury.recipe b/recipes/rynek_infrastruktury.recipe new file mode 100644 index 0000000000..98529379c5 --- /dev/null +++ b/recipes/rynek_infrastruktury.recipe @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://www.rynekinfrastruktury.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class prawica_recipe(BasicNewsRecipe): + title = u'Rynek Infrastruktury' + __author__ = 'teepel ' + language = 'pl' + description =u'Portal "Rynek Infrastruktury" to źródło informacji o kluczowych elementach polskiej gospodarki: drogach, kolei, lotniskach, portach, telekomunikacji, energetyce, prawie i polityce, wzmocnione eksperckimi komentarzami kluczowych analityków.' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + feeds = [ + (u'Drogi', u'http://www.rynekinfrastruktury.pl/rss/41'), + (u'Lotniska', u'http://www.rynekinfrastruktury.pl/rss/42'), + (u'Kolej', u'http://www.rynekinfrastruktury.pl/rss/37'), + (u'Energetyka', u'http://www.rynekinfrastruktury.pl/rss/30'), + (u'Telekomunikacja', u'http://www.rynekinfrastruktury.pl/rss/31'), + (u'Porty', u'http://www.rynekinfrastruktury.pl/rss/32'), + (u'Prawo i polityka', u'http://www.rynekinfrastruktury.pl/rss/47'), + (u'Komentarze', u'http://www.rynekinfrastruktury.pl/rss/38'), + ] + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'articleContent'})) + + remove_tags =[] + remove_tags.append(dict(name = 'span', attrs = {'class' : 'date'})) + + def print_version(self, url): + return url.replace('http://www.rynekinfrastruktury.pl/artykul/', 'http://www.rynekinfrastruktury.pl/artykul/drukuj/') diff --git a/recipes/rynek_kolejowy.recipe b/recipes/rynek_kolejowy.recipe new file mode 100644 index 0000000000..f68b33f84b --- /dev/null +++ b/recipes/rynek_kolejowy.recipe @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +rynek-kolejowy.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class rynek_kolejowy(BasicNewsRecipe): + title = u'Rynek Kolejowy' + __author__ = 'teepel ' + language = 'pl' + description =u'Rynek Kolejowy - kalendarium wydarzeń branży kolejowej, konferencje, sympozja, targi kolejowe, krajowe i zagraniczne.' + masthead_url='http://p.wnp.pl/images/i/partners/rynek_kolejowy.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'mainContent'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'right no-print'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'font-size'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'no-print'})) + + extra_css = '''.wiadomosc_title{ font-size: 1.4em; font-weight: bold; }''' + + feeds = [(u'Wiadomości', u'http://www.rynek-kolejowy.pl/rss/rss.php')] + + def print_version(self, url): + segment = url.split('/') + urlPart = segment[3] + return 'http://www.rynek-kolejowy.pl/drukuj.php?id=' + urlPart + diff --git a/recipes/rzeczpospolita.recipe b/recipes/rzeczpospolita.recipe index 4ab27d8437..40cb4db3ac 100644 --- a/recipes/rzeczpospolita.recipe +++ b/recipes/rzeczpospolita.recipe @@ -34,16 +34,20 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'story'})) remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleLeftBox'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'socialNewTools'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'socialTools'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxTop'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'clr'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'recommendations'})) - remove_tags.append(dict(name = 'div', attrs = {'id' : 'editorPicks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'editorPicks editorPicksFirst'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightText'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'articleCopyrightButton'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'articleToolBoxBottom'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'more'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'addRecommendation'})) + remove_tags.append(dict(name = 'h3', attrs = {'id' : 'tags'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -67,3 +71,4 @@ class RzeczpospolitaRecipe(BasicNewsRecipe): return start + '/' + index + '?print=tak' + diff --git a/recipes/satkurier.recipe b/recipes/satkurier.recipe new file mode 100644 index 0000000000..382f7f8180 --- /dev/null +++ b/recipes/satkurier.recipe @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class SATKurier(BasicNewsRecipe): + title = u'SATKurier.pl' + __author__ = 'Artur Stachecki ' + language = 'pl' + description = u'Największy i najstarszy serwis poświęcony\ + telewizji cyfrowej, przygotowywany przez wydawcę\ + miesięcznika SAT Kurier. Bieżące wydarzenia\ + z rynku mediów i nowych technologii.' + oldest_article = 7 + masthead_url = 'http://satkurier.pl/img/header_sk_logo.gif' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'id': ['single_news', 'content']})) + + remove_tags = [] + remove_tags.append(dict(attrs={'id': ['news_info', 'comments']})) + remove_tags.append(dict(attrs={'href': '#czytaj'})) + remove_tags.append(dict(attrs={'align': 'center'})) + remove_tags.append(dict(attrs={'class': ['date', 'category', 'right mini-add-comment', 'socialLinks', 'commentlist']})) + + remove_tags_after = [(dict(id='entry'))] + + feeds = [(u'Najnowsze wiadomości', u'http://feeds.feedburner.com/satkurierpl?format=xml'), + (u'Sport w telewizji', u'http://feeds.feedburner.com/satkurier/sport?format=xml'), + (u'Blog', u'http://feeds.feedburner.com/satkurier/blog?format=xml')] + + def preprocess_html(self, soup): + image = soup.find(attrs={'id': 'news_mini_photo'}) + if image: + image.extract() + header = soup.find('h1') + header.replaceWith(header.prettify() + image.prettify()) + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/tvn24.recipe b/recipes/tvn24.recipe index ae5b44c570..a5f5111770 100644 --- a/recipes/tvn24.recipe +++ b/recipes/tvn24.recipe @@ -1,34 +1,50 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.utils.magick import Image class tvn24(BasicNewsRecipe): title = u'TVN24' oldest_article = 7 max_articles_per_feed = 100 - __author__ = 'fenuks' + __author__ = 'fenuks, Artur Stachecki' description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata' category = 'news' language = 'pl' - #masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' - cover_url= 'http://www.userlogos.org/files/logos/Struna/TVN24.jpg' - extra_css = 'ul {list-style:none;} \ - li {list-style:none; float: left; margin: 0 0.15em;} \ - h2 {font-size: medium} \ - .date60m {float: left; margin: 0 10px 0 5px;}' + masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' remove_empty_feeds = True remove_javascript = True no_stylesheets = True - use_embedded_content = False - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags=[dict(name='h1', attrs={'class':['size30 mt10 pb10', 'size38 mt10 pb15']}), dict(name='figure', attrs={'class':'articleMainPhoto articleMainPhotoWide'}), dict(name='article', attrs={'class':['mb20', 'mb20 textArticleDefault']}), dict(name='ul', attrs={'class':'newsItem'})] - remove_tags = [dict(name='aside', attrs={'class':['innerArticleModule onRight cols externalContent', 'innerArticleModule center']}), dict(name='div', attrs={'class':['thumbsGallery', 'articleTools', 'article right rd7', 'heading', 'quizContent']}), dict(name='a', attrs={'class':'watchMaterial text'}), dict(name='section', attrs={'class':['quiz toCenter', 'quiz toRight']})] - - feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), - (u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')] + keep_only_tags=[ +# dict(name='h1', attrs={'class':'size38 mt20 pb20'}), + dict(name='div', attrs={'class':'mainContainer'}), +# dict(name='p'), +# dict(attrs={'class':['size18 mt10 mb15', 'bold topicSize1', 'fromUsers content', 'textArticleDefault']}) + ] + remove_tags=[ + dict(attrs={'class':['commentsInfo', 'textSize', 'related newsNews align-right', 'box', 'watchMaterial text', 'related galleryGallery align-center', 'advert block-alignment-right', 'userActions', 'socialBookmarks', 'im yourArticle fl', 'dynamicButton addComment fl', 'innerArticleModule onRight cols externalContent', 'thumbsGallery', 'relatedObject customBlockquote align-right', 'lead', 'mainRightColumn', 'articleDateContainer borderGreyBottom', 'socialMediaContainer onRight loaded', 'quizContent', 'twitter', 'facebook', 'googlePlus', 'share', 'voteResult', 'reportTitleBar bgBlue_v4 mb15', 'innerVideoModule center']}), + dict(name='article', attrs={'class':['singleArtPhotoCenter', 'singleArtPhotoRight', 'singleArtPhotoLeft']}), + dict(name='section', attrs={'id':['forum', 'innerArticle', 'quiz toCenter', 'mb20']}), + dict(name='div', attrs={'class':'socialMediaContainer big p20 mb20 borderGrey loaded'}) + ] + remove_tags_after=[dict(name='li', attrs={'class':'share'})] + feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), ] + #(u'Polska', u'www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')] def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - tag = soup.find(name='ul', attrs={'class':'newsItem'}) - if tag: - tag.name='div' - tag.li.name='div' + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup + + def postprocess_html(self, soup, first): + #process all the images + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + if img < 0: + raise RuntimeError('Out of memory') + img.type = "GrayscaleType" + img.save(iurl) return soup diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index b271665125..2adac1e113 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -3,6 +3,8 @@ __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' __copyright__ = 'Modified 2011, Mariusz Wolek ' +__copyright__ = 'Modified 2012, Artur Stachecki ' + from calibre.web.feeds.news import BasicNewsRecipe import re @@ -11,7 +13,7 @@ class Wprost(BasicNewsRecipe): EDITION = 0 FIND_LAST_FULL_ISSUE = True EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' title = u'Wprost' __author__ = 'matek09' @@ -20,6 +22,7 @@ class Wprost(BasicNewsRecipe): no_stylesheets = True language = 'pl' remove_javascript = True + recursions = 0 remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) @@ -35,13 +38,15 @@ class Wprost(BasicNewsRecipe): (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), - (re.compile(r'\
'), lambda match: '')] + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\