diff --git a/.bzrignore b/.bzrignore index aaacc9f58a..f14ff947f6 100644 --- a/.bzrignore +++ b/.bzrignore @@ -35,3 +35,49 @@ nbproject/ .settings/ *.DS_Store calibre_plugins/ +recipes/.git +recipes/.gitignore +recipes/README +recipes/katalog_egazeciarz.recipe +recipes/tv_axnscifi.recipe +recipes/tv_comedycentral.recipe +recipes/tv_discoveryscience.recipe +recipes/tv_foxlife.recipe +recipes/tv_fox.recipe +recipes/tv_hbo.recipe +recipes/tv_kinopolska.recipe +recipes/tv_nationalgeographic.recipe +recipes/tv_polsat2.recipe +recipes/tv_polsat.recipe +recipes/tv_tv4.recipe +recipes/tv_tvn7.recipe +recipes/tv_tvn.recipe +recipes/tv_tvp1.recipe +recipes/tv_tvp2.recipe +recipes/tv_tvphd.recipe +recipes/tv_tvphistoria.recipe +recipes/tv_tvpkultura.recipe +recipes/tv_tvppolonia.recipe +recipes/tv_tvpuls.recipe +recipes/tv_viasathistory.recipe +recipes/icons/tv_axnscifi.png +recipes/icons/tv_comedycentral.png +recipes/icons/tv_discoveryscience.png +recipes/icons/tv_foxlife.png +recipes/icons/tv_fox.png +recipes/icons/tv_hbo.png +recipes/icons/tv_kinopolska.png +recipes/icons/tv_nationalgeographic.png +recipes/icons/tv_polsat2.png +recipes/icons/tv_polsat.png +recipes/icons/tv_tv4.png +recipes/icons/tv_tvn7.png +recipes/icons/tv_tvn.png +recipes/icons/tv_tvp1.png +recipes/icons/tv_tvp2.png +recipes/icons/tv_tvphd.png +recipes/icons/tv_tvphistoria.png +recipes/icons/tv_tvpkultura.png +recipes/icons/tv_tvppolonia.png +recipes/icons/tv_tvpuls.png +recipes/icons/tv_viasathistory.png diff --git a/COPYRIGHT b/COPYRIGHT index 1a2c305ad2..85d70a8aa8 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -47,12 +47,6 @@ License: Apache 2.0 The full text of the Apache 2.0 license is available at: http://www.apache.org/licenses/LICENSE-2.0 -Files: src/sfntly/* -Copyright: Google Inc. 
-License: Apache 2.0 - The full text of the Apache 2.0 license is available at: - http://www.apache.org/licenses/LICENSE-2.0 - Files: resources/viewer/mathjax/* Copyright: Unknown License: Apache 2.0 diff --git a/Changelog.yaml b/Changelog.yaml index 31908315f3..ebc2e5cad1 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,110 @@ # new recipes: # - title: +- version: 0.9.6 + date: 2012-11-10 + + new features: + - title: "Experimental support for subsetting fonts" + description: "Subsetting a font means reducing the font to contain only the glyphs for the text actually present in the book. This can easily halve the size of the font. calibre can now do this for all embedded fonts during a conversion. Turn it on via the 'Subset all embedded fonts' option under the Look & Feel section of the conversion dialog. calibre can subset both TrueType and OpenType fonts. Note that this code is very new and likely has bugs, so please check the output if you turn on subsetting. The conversion log will have info about the subsetting operations." + type: major + + - title: "EPUB Input: Try to workaround EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption." + + - title: "Allow using identifiers in save to disk templates." + tickets: [1074623] + + - title: "calibredb: Add an option to not notify the GUI" + + - title: "Catalogs: Fix long tags causing catalog generation to fail on windows. Add the ability to cross-reference authors, i.e. to relist the authors for a book with multiple authors separately." + tickets: [1074931] + + - title: "Edit metadata dialog: Add a clear tags button to remove all tags with a single click" + + - title: "Add search to the font family chooser dialog" + + bug fixes: + - title: "Windows: Fix a long standing bug in the device eject code that for some reason only manifested in 0.9.5." 
+ tickets: [1075782] + + - title: "Get Books: Fix Amazon stores, Google Books store and libri.de" + + - title: "Kobo driver: More fixes for on device book matching, and list books as being on device even if the Kobo has not yet indexed them. Also some performance improvements." + tickets: [1069617] + + - title: "EPUB Output: Remove duplicate id and name attributes to eliminate pointless noise from the various epub check utilities" + + - title: "Ask for confirmation before removing plugins" + + - title: "Fix bulk convert queueing dialog becoming very long if any of the books have a very long title." + tickets: [1076191] + + - title: "Fix deleting custom column tags like data from the Tag browser not updating the last modified timestamp for affected books" + tickets: [1075476] + + - title: "When updating a previously broken plugin, do not show an error message because the previous version of the plugin cannot be loaded" + + - title: "Fix regression that broke the Template Editor" + + improved recipes: + - Various updated Polish recipes + - London Review of Books + - Yemen Times + + new recipes: + - title: "Various Polish news sources" + author: Artur Stachecki + + +- version: 0.9.5 + date: 2012-11-02 + + new features: + - title: "Font embedding: Add support for the CSS 3 Fonts module, which means you can embed font families that have more than the usual four faces, with the full set of font-stretch and font-weight variations. Of course, whether the fonts actually show up on a reader will depend on the readers' support for CSS 3." + + - title: "Sharing by email: Allow specifying an 'alias' or friendly name by which to identify each email recipient." + tickets: [1069076] + + - title: "Embedding fonts: Allow adding ttf/otf font files to calibre directly to be used for embedding. That way the fonts do not have to be installed system wide. You can add a font to calibre via the 'Add fonts' button in the font chooser dialog for embedding fonts." 
+ + - title: "E-book viewer: Add the ability to rotate images to the popup image viewer." + tickets: [1073513] + + - title: "Generate cover: Speed up searching the system for a font that can render special characters" + + - title: "A new custom font scanner to locate all fonts on the system. Faster and less crash prone than fontconfig/freetype" + + - title: "Font family chooser: Show the faces available for a family when clicking on the family" + + bug fixes: + - title: "Get Books: Fix eHarlequin and Kobo stores." + tickets: [1072702] + + - title: "Kobo driver: Fix a bug that could cause the on device book matching to fail in certain circumstances." + tickets: [1072437] + + - title: "Kobo driver: When using a SD card do not delete shelves that contain no books on the card (there might be books in the shelf in the main memory)." + tickets: [1073792] + + - title: "Workaround for bug in the windows API CreateHardLink function that breaks using calibre libraries on some networked filesystems." + + - title: "Template editor: Use dummy metadata instead of blank/unknown values" + + - title: "Windows: abort setting of title/author if any of the books' files are in use. Results in less surprising behavior than before, when the title/author would be changed, but the on disk location would not." + + improved recipes: + - Financial Times UK + - Science AAAS + - The Atlantic + + new recipes: + - title: "Pravda in English, Italian and Portuguese" + author: Darko Miletic + + - title: "Delco Times" + author: Krittika Goyal + + - version: 0.9.4 date: 2012-10-26 diff --git a/manual/develop.rst b/manual/develop.rst index a6f1a1308e..b9fba195d3 100644 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -72,13 +72,21 @@ After installing Bazaar, you can get the |app| source code with the command:: bzr branch lp:calibre -On Windows you will need the complete path name, that will be something like :file:`C:\\Program Files\\Bazaar\\bzr.exe`. 
To update a branch -to the latest code, use the command:: +On Windows you will need the complete path name, that will be something like :file:`C:\\Program Files\\Bazaar\\bzr.exe`. + +To update a branch to the latest code, use the command:: bzr merge -The calibre repository is huge so the branch operation above takes along time (about an hour). If you want to get the code faster, the sourcecode for the latest release is always available as an -`archive `_. +|app| is a very large project with a very long source control history, so the +above can take a while (10 minutes to an hour depending on your internet speed). + +If you want to get the code faster, the source code for the latest release is +always available as an `archive `_. +You can also use bzr to just download the source code, without the history, +using:: + + bzr branch --stacked lp:calibre Submitting your changes to be included ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -109,7 +117,7 @@ Whenever you commit changes to your branch with the command:: bzr commit -m "Comment describing your change" Kovid can merge it directly from your branch into the main |app| source tree. You should also keep an eye on the |app| -`development forum `. Before making major changes, you should +`development forum `_. Before making major changes, you should discuss them in the forum or contact Kovid directly (his email address is all over the source code). Windows development environment diff --git a/manual/faq.rst b/manual/faq.rst index d46011d8d8..109aff440d 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -327,9 +327,8 @@ You can browse your |app| collection on your Android device is by using the calibre content server, which makes your collection available over the net. 
First perform the following steps in |app| - * Set the :guilabel:`Preferred Output Format` in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to Tablet (this will work for phones as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your device to EPUB format by selecting them and clicking the Convert button. + * Set the :guilabel:`Preferred Output Format` in |app| to EPUB for normal Android devices or MOBI for Kindles (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) + * Convert the books you want to read on your device to EPUB/MOBI format by selecting them and clicking the Convert button. * Turn on the Content Server in |app|'s preferences and leave |app| running. Now on your Android device, open the browser and browse to @@ -650,17 +649,24 @@ If it still wont launch, start a command prompt (press the windows key and R; th Post any output you see in a help message on the `Forum `_. -|app| freezes when I click on anything? +|app| freezes/crashes occasionally? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are three possible things I know of, that can cause this: - * You recently connected an external monitor or TV to your computer. In this case, whenever |app| opens a new window like the edit metadata window or the conversion dialog, it appears on the second monitor where you dont notice it and so you think |app| has frozen. Disconnect your second monitor and restart calibre. + * You recently connected an external monitor or TV to your computer. In + this case, whenever |app| opens a new window like the edit metadata + window or the conversion dialog, it appears on the second monitor where + you dont notice it and so you think |app| has frozen. Disconnect your + second monitor and restart calibre. - * You are using a Wacom branded mouse. 
There is an incompatibility between Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom mouse. - - * Sometimes if some software has installed lots of new files in your fonts folder, |app| can crash until it finishes indexing them. Just start |app|, then leave it alone for about 20 minutes, without clicking on anything. After that you should be able to use |app| as normal. + * You are using a Wacom branded mouse. There is an incompatibility between + Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom + mouse. + * If you use RoboForm, it is known to cause |app| to crash. Add |app| to + the blacklist of programs inside RoboForm to fix this. Or uninstall + RoboForm. |app| is not starting on OS X? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -719,8 +725,8 @@ You can switch |app| to using a backed up library folder by simply clicking the If you want to backup the |app| configuration/plugins, you have to backup the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore. -How do I use purchased EPUB books with |app|? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use purchased EPUB books with |app| (or what do I do with .acsm files)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Most purchased EPUB books have `DRM `_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your ebook reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. 
This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" ebook. The ebook file will be stored in the folder "My Digital Editions", from where you can add it to |app|. I am getting a "Permission Denied" error? diff --git a/recipes/autosport.recipe b/recipes/autosport.recipe new file mode 100644 index 0000000000..df98125f25 --- /dev/null +++ b/recipes/autosport.recipe @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'MrStefan ' + +''' +www.autosport.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class autosport(BasicNewsRecipe): + title = u'Autosport' + __author__ = 'MrStefan ' + language = 'en_GB' + description =u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...' + masthead_url='http://cdn.images.autosport.com/asdotcom.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'news_headline'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_author'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_date'})) + keep_only_tags.append(dict(name = 'p')) + + feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')] diff --git a/recipes/blognexto.recipe b/recipes/blognexto.recipe new file mode 100644 index 0000000000..b5ced2cf50 --- /dev/null +++ b/recipes/blognexto.recipe @@ -0,0 +1,28 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class blognexto(BasicNewsRecipe): + title = 'BLOG.NEXTO.pl' + __author__ = 'MrStefan ' + language = 'pl' + description ='o e-publikacjach prawie wszystko' + masthead_url='http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg' + remove_empty_feeds= True + oldest_article = 7 + max_articles_per_feed = 100 + 
remove_javascript=True + no_stylesheets=True + + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'content'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'comment-cloud'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'post-date1'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fb-like'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'tags'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'postnavi'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'commments-box'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'respond'})) + + feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')] diff --git a/recipes/brewiarz.recipe b/recipes/brewiarz.recipe new file mode 100644 index 0000000000..5d16278b00 --- /dev/null +++ b/recipes/brewiarz.recipe @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe +import datetime, re + + +class brewiarz(BasicNewsRecipe): + title = u'Brewiarz' + __author__ = 'Artur Stachecki ' + language = 'pl' + description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.' 
+ masthead_url = 'http://brewiarz.pl/images/logo2.gif' + max_articles_per_feed = 100 + remove_javascript = True + no_stylesheets = True + publication_type = 'newspaper' + next_days = 1 + + def parse_index(self): + dec2rom_dict = {"01": "i", "02": "ii", "03": "iii", "04": "iv", + "05": "v", "06": "vi", "07": "vii", "08": "viii", + "09": "ix", "10": "x", "11": "xi", "12": "xii"} + + weekday_dict = {"Sunday": "Niedziela", "Monday": "Poniedziałek", "Tuesday": "Wtorek", + "Wednesday": "Środa", "Thursday": "Czwartek", "Friday": "Piątek", "Saturday": "Sobota"} + + now = datetime.datetime.now() + + feeds = [] + for i in range(0, self.next_days): + url_date = now + datetime.timedelta(days=i) + url_date_month = url_date.strftime("%m") + url_date_month_roman = dec2rom_dict[url_date_month] + url_date_day = url_date.strftime("%d") + url_date_year = url_date.strftime("%Y")[2:] + url_date_weekday = url_date.strftime("%A") + url_date_weekday_pl = weekday_dict[url_date_weekday] + + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/index.php3" + articles = self.parse_pages(url) + if articles: + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year + feeds.append((title, articles)) + else: + sectors = self.get_sectors(url) + for subpage in sectors: + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." 
+ url_date_year + " - " + subpage.string + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/" + subpage['href'] + print(url) + articles = self.parse_pages(url) + if articles: + feeds.append((title, articles)) + return feeds + + def get_sectors(self, url): + sectors = [] + soup = self.index_to_soup(url) + sectors_table = soup.find(name='table', attrs={'width': '490'}) + sector_links = sectors_table.findAll(name='a') + for sector_links_modified in sector_links: + link_parent_text = sector_links_modified.findParent(name='div').text + if link_parent_text: + sector_links_modified.text = link_parent_text.text + sectors.append(sector_links_modified) + return sectors + + def parse_pages(self, url): + current_articles = [] + soup = self.index_to_soup(url) + www = soup.find(attrs={'class': 'www'}) + if www: + box_title = www.find(text='Teksty LG') + article_box_parent = box_title.findParent('ul') + article_box_sibling = article_box_parent.findNextSibling('ul') + for li in article_box_sibling.findAll('li'): + link = li.find(name='a') + ol = link.findNextSibling(name='ol') + if ol: + sublinks = ol.findAll(name='a') + for sublink in sublinks: + link_title = self.tag_to_string(link) + " - " + self.tag_to_string(sublink) + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', sublink['href']) + link_url = url[:-10] + link_url_print + current_articles.append({'title': link_title, + 'url': link_url, 'description': '', 'date': ''}) + else: + if link.findParent(name = 'ol'): + continue + else: + link_title = self.tag_to_string(link) + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', link['href']) + link_url = url[:-10] + link_url_print + current_articles.append({'title': link_title, + 'url': link_url, 'description': '', 'date': ''}) + return current_articles + else: + return None + + def preprocess_html(self, soup): + footer = soup.find(name='a', attrs={'href': 'http://brewiarz.pl'}) + footer_parent = 
footer.findParent('div') + footer_parent.extract() + + header = soup.find(text='http://brewiarz.pl') + header_parent = header.findParent('div') + header_parent.extract() + + subheader = soup.find(text='Kolor szat:').findParent('div') + subheader.extract() + + color = soup.find('b') + color.extract() + + cleaned = self.strip_tags(soup) + + div = cleaned.findAll(name='div') + div[1].extract() + div[2].extract() + div[3].extract() + + return cleaned + + def strip_tags(self, soup_dirty): + VALID_TAGS = ['p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body'] + + for tag in soup_dirty.findAll(True): + if tag.name not in VALID_TAGS: + for i, x in enumerate(tag.parent.contents): + if x == tag: + break + else: + print "Can't find", tag, "in", tag.parent + continue + for r in reversed(tag.contents): + tag.parent.insert(i, r) + tag.extract() + + return soup_dirty diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 5254694d24..a4e24ac61b 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -6,7 +6,6 @@ class Dobreprogramy_pl(BasicNewsRecipe): __author__ = 'fenuks' __licence__ ='GPL v3' category = 'IT' - language = 'pl' masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' @@ -29,4 +28,4 @@ class Dobreprogramy_pl(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index 342aa0d2db..1954fd7803 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -2,7 +2,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class FocusRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' __author__ = u'intromatyk ' language = 'pl' 
@@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe): publisher = u'Gruner + Jahr Polska' category = u'News' description = u'Newspaper' - category='magazine' - cover_url='' - remove_empty_feeds= True - no_stylesheets=True + category = 'magazine' + cover_url = '' + remove_empty_feeds = True + no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100000 recursions = 0 @@ -27,15 +29,15 @@ class FocusRecipe(BasicNewsRecipe): simultaneous_downloads = 5 r = re.compile('.*(?Phttp:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*') - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'})) - remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'id': 'cll'})) + + remove_tags = [] + remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'})) + remove_tags.append(dict(name='div', attrs={'class': 'txb'})) + remove_tags.append(dict(name='div', attrs={'class': 'h2'})) + remove_tags.append(dict(name='ul', attrs={'class': 'txu'})) + remove_tags.append(dict(name='div', attrs={'class': 'ulc'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe): p.lead {font-weight: bold; text-align: left;} .authordate {font-size: small; color: #696969;} .fot{font-size: x-small; color: #666666;} - ''' + ''' - - feeds = [ - ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'), - ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'), - ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'), - ('Sport', 
'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'), - ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'), - ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'), - ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'), - ] + feeds = [ + ('Nauka', 'http://www.focus.pl/nauka/rss/'), + ('Historia', 'http://www.focus.pl/historia/rss/'), + ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'), + ('Sport', 'http://www.focus.pl/sport/rss/'), + ('Technika', 'http://www.focus.pl/technika/rss/'), + ('Przyroda', 'http://www.focus.pl/przyroda/rss/'), + ('Technologie', 'http://www.focus.pl/gadzety/rss/') + ] def skip_ad_pages(self, soup): if ('advertisement' in soup.find('title').string.lower()): @@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe): return None def get_cover_url(self): - soup=self.index_to_soup('http://www.focus.pl/magazyn/') - tag=soup.find(name='div', attrs={'class':'clr fl'}) + soup = self.index_to_soup('http://www.focus.pl/magazyn/') + tag = soup.find(name='div', attrs={'class': 'clr fl'}) if tag: - self.cover_url='http://www.focus.pl/' + tag.a['href'] + self.cover_url = 'http://www.focus.pl/' + tag.a['href'] return getattr(self, 'cover_url', self.cover_url) def print_version(self, url): - if url.count ('focus.pl.feedsportal.com'): + if url.count('focus.pl.feedsportal.com'): u = url.find('focus0Bpl') u = 'http://www.focus.pl/' + url[u + 11:] u = u.replace('0C', '/') u = u.replace('A', '') - u = u.replace ('0E','-') + u = u.replace('0E', '-') u = u.replace('/nc/1//story01.htm', '/do-druku/1') - else: - u = url.replace('/nc/1','/do-druku/1') - return u \ No newline at end of file + else: + u = url.replace('/nc/1', '/do-druku/1') + return u diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe new file mode 100644 index 0000000000..0f35e536f6 --- /dev/null +++ b/recipes/gazeta_pl_krakow.recipe @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +__license__ = 
'GPL v3' +__copyright__ = 'teepel based on GW from fenuks' + +''' +krakow.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class gw_krakow(BasicNewsRecipe): + title = u'Gazeta.pl Kraków' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description =u'Wiadomości z Krakowa na portalu Gazeta.pl.' + category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif' + INDEX='http://krakow.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) + + remove_tags_after = [dict(name = 'div', attrs = {'id' : 
'gazeta_article_share'})] + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', attrs={'id':'Str'}) + if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe new file mode 100644 index 0000000000..7a43931db4 --- /dev/null +++ b/recipes/gazeta_pl_warszawa.recipe @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel based on GW from fenuks' + +''' +warszawa.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class 
gw_wawa(BasicNewsRecipe): + title = u'Gazeta.pl Warszawa' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description ='Wiadomości z Warszawy na portalu Gazeta.pl.' + category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif' + INDEX='http://warszawa.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', 
attrs={'id':'Str'}) + if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 5c034b10ab..633b80444a 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,104 +1,107 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' - __author__ = 'fenuks' - language = 'pl' - description ='news from gazeta.pl' - category='newspaper' + title = u'Gazeta.pl' + __author__ = 'fenuks, Artur Stachecki' + language = 'pl' + description = 'news from gazeta.pl' + category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' - INDEX='http://wyborcza.pl' - remove_empty_feeds= True + masthead_url = 
'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' + INDEX = 'http://wyborcza.pl' + remove_empty_feeds = True oldest_article = 3 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = dict(id=['gazeta_article', 'article']) - remove_tags_after = dict(id='gazeta_article_share') - remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])] - - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), - (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), - (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), - (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), - (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), - (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), - #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), - (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), - (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), - (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), - (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), - (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), - (u'We wt. 
- Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), - (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), - (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss') - ] + remove_javascript = True + no_stylesheets = True + remove_tags_before = dict(id='k0') + remove_tags_after = dict(id='banP4') + remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), + (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), + (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. 
- Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + tag = soup.find(name='a', attrs={'class': 'btn'}) + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) return new_soup - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True + loop = False + tag = soup.find('div', attrs={'id': 'Str'}) + if appendtag.find('div', attrs={'id': 'Str'}): + nexturl = tag.findAll('a') + appendtag.find('div', attrs={'id': 'Str'}).extract() + loop = True if appendtag.find(id='source'): appendtag.find(id='source').extract() while loop: - loop=False + loop = False for link in nexturl: if u'następne' in link.string: - url= self.INDEX + link['href'] + url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True + tag = soup2.find('div', attrs={'id': 'Str'}) + nexturl = tag.findAll('a') + loop = True def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') + tag = appendtag.find(id='container_gal') if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] + nexturl = appendtag.find(id='gal_btn_next').a['href'] appendtag.find(id='gal_navi').extract() while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - 
nexturl=pagetext.find(id='gal_btn_next') + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(id='container_gal') + nexturl = pagetext.find(id='gal_btn_next') if nexturl: - nexturl=nexturl.a['href'] + nexturl = nexturl.a['href'] pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') + rem = appendtag.find(id='gal_navi') if rem: rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + else: + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup def print_version(self, url): - if 'http://wyborcza.biz/biznes/' not in url: - return url + if url.count('rss.feedsportal.com'): + u = url.find('wyborcza0Bpl') + u = 'http://www.wyborcza.pl/' + url[u + 11:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = u.replace('/1,', '/2029020,') + u = u.replace('/story01.htm', '') + print(u) + return u + elif 'http://wyborcza.pl/1' in url: + return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') - cover=soup.find(id='GWmini2') - soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) - self.cover_url='http://wyborcza.pl' + soup.img['src'] + cover = soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) + self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) diff --git 
a/recipes/icons/autosport.png b/recipes/icons/autosport.png new file mode 100644 index 0000000000..0c84c96a0b Binary files /dev/null and b/recipes/icons/autosport.png differ diff --git a/recipes/icons/blognexto.png b/recipes/icons/blognexto.png new file mode 100644 index 0000000000..2a1ae4a4ae Binary files /dev/null and b/recipes/icons/blognexto.png differ diff --git a/recipes/icons/brewiarz.png b/recipes/icons/brewiarz.png new file mode 100644 index 0000000000..b47dfc95f6 Binary files /dev/null and b/recipes/icons/brewiarz.png differ diff --git a/recipes/icons/gazeta_pl_krakow.png b/recipes/icons/gazeta_pl_krakow.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_krakow.png differ diff --git a/recipes/icons/gazeta_pl_szczecin.png b/recipes/icons/gazeta_pl_szczecin.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_szczecin.png differ diff --git a/recipes/icons/gazeta_pl_warszawa.png b/recipes/icons/gazeta_pl_warszawa.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_warszawa.png differ diff --git a/recipes/icons/gazeta_wyborcza.png b/recipes/icons/gazeta_wyborcza.png index 9e480cc41d..119afbba3a 100644 Binary files a/recipes/icons/gazeta_wyborcza.png and b/recipes/icons/gazeta_wyborcza.png differ diff --git a/recipes/icons/mateusz_czytania.png b/recipes/icons/mateusz_czytania.png new file mode 100644 index 0000000000..7568139433 Binary files /dev/null and b/recipes/icons/mateusz_czytania.png differ diff --git a/recipes/icons/naszdziennik.png b/recipes/icons/naszdziennik.png new file mode 100644 index 0000000000..b557a7835e Binary files /dev/null and b/recipes/icons/naszdziennik.png differ diff --git a/recipes/icons/pravda_en.png b/recipes/icons/pravda_en.png new file mode 100644 index 0000000000..f91f59c554 Binary files /dev/null and b/recipes/icons/pravda_en.png differ diff --git 
a/recipes/icons/pravda_it.png b/recipes/icons/pravda_it.png new file mode 100644 index 0000000000..f91f59c554 Binary files /dev/null and b/recipes/icons/pravda_it.png differ diff --git a/recipes/icons/pravda_por.png b/recipes/icons/pravda_por.png new file mode 100644 index 0000000000..f91f59c554 Binary files /dev/null and b/recipes/icons/pravda_por.png differ diff --git a/recipes/icons/pravda_ru.png b/recipes/icons/pravda_ru.png new file mode 100644 index 0000000000..f91f59c554 Binary files /dev/null and b/recipes/icons/pravda_ru.png differ diff --git a/recipes/icons/rushisaband.png b/recipes/icons/rushisaband.png new file mode 100644 index 0000000000..9a7d4237cf Binary files /dev/null and b/recipes/icons/rushisaband.png differ diff --git a/recipes/icons/rynek_infrastruktury.png b/recipes/icons/rynek_infrastruktury.png new file mode 100644 index 0000000000..8d2e6ac27b Binary files /dev/null and b/recipes/icons/rynek_infrastruktury.png differ diff --git a/recipes/icons/rynek_kolejowy.png b/recipes/icons/rynek_kolejowy.png new file mode 100644 index 0000000000..e9dd5fc464 Binary files /dev/null and b/recipes/icons/rynek_kolejowy.png differ diff --git a/recipes/icons/satkurier.png b/recipes/icons/satkurier.png new file mode 100644 index 0000000000..6e71bb2450 Binary files /dev/null and b/recipes/icons/satkurier.png differ diff --git a/recipes/icons/wprost.png b/recipes/icons/wprost.png new file mode 100644 index 0000000000..f81878f2d2 Binary files /dev/null and b/recipes/icons/wprost.png differ diff --git a/recipes/kerrang.recipe b/recipes/kerrang.recipe new file mode 100644 index 0000000000..bbd944eb62 --- /dev/null +++ b/recipes/kerrang.recipe @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class kerrang(BasicNewsRecipe): + title = u'Kerrang!' 
+ __author__ = 'Artur Stachecki ' + language = 'en_GB' + description = u'UK-based magazine devoted to rock music published by Bauer Media Group' + oldest_article = 7 + masthead_url = 'http://images.kerrang.com/design/kerrang/kerrangsite/logo.gif' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs = {'class' : ['headz', 'blktxt']})) + + extra_css = ''' img { display: block; margin-right: auto;} + h1 {text-align: left; font-size: 22px;}''' + + feeds = [(u'News', u'http://www.kerrang.com/blog/rss.xml')] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lequipe.recipe b/recipes/lequipe.recipe new file mode 100644 index 0000000000..c6e9a26880 --- /dev/null +++ b/recipes/lequipe.recipe @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class leequipe(BasicNewsRecipe): + title = u'l\'equipe' + __author__ = 'Artur Stachecki ' + language = 'fr' + description = u'Retrouvez tout le sport en direct sur le site de L\'EQUIPE et suivez l\'actualité du football, rugby, basket, cyclisme, f1, volley, hand, tous les résultats sportifs' + oldest_article = 1 + masthead_url = 'http://static.lequipe.fr/v6/img/logo-lequipe.png' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs={'id': ['article']})) + + remove_tags = [] + remove_tags.append(dict(attrs={'id': ['partage', 'ensavoirplus', 'bloc_bas_breve', 'commentaires', 'tools']})) + remove_tags.append(dict(attrs={'class': ['partage_bis', 'date']})) + + feeds = [(u'Football', 
u'http://www.lequipe.fr/rss/actu_rss_Football.xml'), + (u'Auto-Moto', u'http://www.lequipe.fr/rss/actu_rss_Auto-Moto.xml'), + (u'Tennis', u'http://www.lequipe.fr/rss/actu_rss_Tennis.xml'), + (u'Golf', u'http://www.lequipe.fr/rss/actu_rss_Golf.xml'), + (u'Rugby', u'http://www.lequipe.fr/rss/actu_rss_Rugby.xml'), + (u'Basket', u'http://www.lequipe.fr/rss/actu_rss_Basket.xml'), + (u'Hand', u'http://www.lequipe.fr/rss/actu_rss_Hand.xml'), + (u'Cyclisme', u'http://www.lequipe.fr/rss/actu_rss_Cyclisme.xml'), + (u'Autres Sports', u'http://pipes.yahoo.com/pipes/pipe.run?_id=2039f7f4f350c70c5e4e8633aa1b37cd&_render=rss') + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lrb.recipe b/recipes/lrb.recipe index 4a203c80ae..6453e78724 100644 --- a/recipes/lrb.recipe +++ b/recipes/lrb.recipe @@ -40,6 +40,6 @@ class LondonReviewOfBooks(BasicNewsRecipe): soup = self.index_to_soup('http://www.lrb.co.uk/') cover_item = soup.find('p',attrs={'class':'cover'}) if cover_item: - cover_url = 'http://www.lrb.co.uk' + cover_item.a.img['src'] + cover_url = cover_item.a.img['src'] return cover_url diff --git a/recipes/mateusz_czytania.recipe b/recipes/mateusz_czytania.recipe new file mode 100644 index 0000000000..ba7c598787 --- /dev/null +++ b/recipes/mateusz_czytania.recipe @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://www.mateusz.pl/czytania +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class czytania_mateusz(BasicNewsRecipe): + title = u'Czytania na ka\u017cdy dzie\u0144' + __author__ = 'teepel ' + description = u'Codzienne czytania z jednego z najstarszych polskich serwisów katolickich.' 
+ language = 'pl' + INDEX='http://www.mateusz.pl/czytania' + oldest_article = 1 + remove_empty_feeds= True + no_stylesheets=True + auto_cleanup = True + remove_javascript = True + simultaneous_downloads = 2 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Czytania', u'http://mateusz.pl/rss/czytania/')] + + remove_tags =[] + remove_tags.append(dict(name = 'p', attrs = {'class' : 'top'})) + + #thanks t3d + def get_article_url(self, article): + link = article.get('link') + if 'kmt.pl' not in link: + return link diff --git a/recipes/money_pl.recipe b/recipes/money_pl.recipe index 075264f8f7..475c2059ff 100644 --- a/recipes/money_pl.recipe +++ b/recipes/money_pl.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class FocusRecipe(BasicNewsRecipe): __license__ = 'GPL v3' - __author__ = u'intromatyk ' + __author__ = u'Artur Stachecki ' language = 'pl' version = 1 diff --git a/recipes/naszdziennik.recipe b/recipes/naszdziennik.recipe new file mode 100644 index 0000000000..4c7b78c199 --- /dev/null +++ b/recipes/naszdziennik.recipe @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class naszdziennik(BasicNewsRecipe): + title = u'Nasz Dziennik' + __author__ = 'Artur Stachecki ' + language = 'pl' + description =u'Nasz Dziennik - Ogólnopolska gazeta codzienna. Podejmuje tematykę dotyczącą życia społecznego, kulturalnego, politycznego i religijnego. Propaguje wartości chrześcijańskie oraz tradycję i kulturę polską.' 
+ masthead_url='http://www.naszdziennik.pl/images/logo-male.png' + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets = True + + keep_only_tags =[dict(attrs = {'id' : 'article'})] + + #definiujemy nową funkcje; musi zwracać listę feedów wraz z artykułami + def parse_index(self): + #adres do parsowania artykułów + soup = self.index_to_soup('http://www.naszdziennik.pl/news') + #deklaracja pustej listy feedów + feeds = [] + #deklaracja pustego słownika artykułów + articles = {} + #deklaracja pustej listy sekcji + sections = [] + #deklaracja pierwszej sekcji jako pusty string + section = '' + + #pętla for, która analizuje po kolei każdy tag "news-article" + for item in soup.findAll(attrs = {'class' : 'news-article'}) : + #w tagu "news-article szukamy pierwszego taga h4" + section = item.find('h4') + #zmiennej sekcja przypisujemy zawartość tekstową taga + section = self.tag_to_string(section) + #sprawdzamy czy w słowniku artykułów istnieje klucz dotyczący sekcji + #jeśli nie istnieje to : + if not articles.has_key(section) : + #do listy sekcji dodajemy nową sekcje + sections.append(section) + #deklarujemy nową sekcje w słowniku artykułów przypisując jej klucz odpowiadający nowej sekcji, którego wartością jest pusta lista + articles[section] = [] + #przeszukujemy kolejny tag "title-datetime" + article_title_datetime = item.find(attrs = {'class' : 'title-datetime'}) + #w tagu title-datetime znajdujemy pierwszy link + article_a = article_title_datetime.find('a') + #i tworzymy z niego link absolutny do właściwego artykułu + article_url = 'http://naszdziennik.pl' + article_a['href'] + #jako tytuł użyty będzie tekst pomiędzy tagami + article_title = self.tag_to_string(article_a) + #a data będzie tekstem z pierwszego taga h4 znalezionego w tagu title-datetime + article_date = self.tag_to_string(article_title_datetime.find('h4')) + #zebrane elementy dodajemy do listy zadeklarowanej w linijce 44 + articles[section].append( { 'title' : article_title, 'url' : 
article_url, 'date' : article_date }) + #po dodaniu wszystkich artykułów dodajemy sekcje do listy feedów, korzystając z list sekcji znajdujących się w słowniku + for section in sections: + feeds.append((section, articles[section])) + #zwracamy listę feedów, której parsowaniem zajmie się calibre + return feeds \ No newline at end of file diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 9eeb8b31ee..c5f1b0aff2 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -22,9 +22,9 @@ class NewYorker(BasicNewsRecipe): masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif' extra_css = """ body {font-family: "Times New Roman",Times,serif} - .articleauthor{color: #9F9F9F; + .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; - font-size: small; + font-size: small; text-transform: uppercase} .rubric,.dd,h6#credit{color: #CD0021; font-family: Arial, sans-serif; @@ -63,11 +63,11 @@ class NewYorker(BasicNewsRecipe): return url.strip() def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('http://www.newyorker.com/magazine/toc/') - cover_item = soup.find('img',attrs={'id':'inThisIssuePhoto'}) + cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg" + soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine') + cover_item = soup.find('div',attrs={'id':'media-count-1'}) if cover_item: - cover_url = 'http://www.newyorker.com' + cover_item['src'].strip() + cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip() return cover_url def preprocess_html(self, soup): diff --git a/recipes/pravda_en.recipe b/recipes/pravda_en.recipe new file mode 100644 index 0000000000..85e8bc91fe --- /dev/null +++ b/recipes/pravda_en.recipe @@ -0,0 +1,53 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +english.pravda.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pravda_eng(BasicNewsRecipe): + title = 'Pravda in English' + 
__author__ = 'Darko Miletic' + description = 'News from Russia and rest of the world' + publisher = 'PRAVDA.Ru' + category = 'news, politics, Russia' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_RU' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://english.pravda.ru/pix/logo.gif' + extra_css = """ + body{font-family: Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes=['lang', 'style'] + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + + feeds = [ + (u'World' , u'http://english.pravda.ru/world/export-articles.xml' ) + ,(u'Russia' , u'http://english.pravda.ru/russia/export-articles.xml' ) + ,(u'Society' , u'http://english.pravda.ru/society/export-articles.xml' ) + ,(u'Incidents', u'http://english.pravda.ru/hotspots/export-articles.xml' ) + ,(u'Opinion' , u'http://english.pravda.ru/opinion/export-articles.xml' ) + ,(u'Science' , u'http://english.pravda.ru/science/export-articles.xml' ) + ,(u'Business' , u'http://english.pravda.ru/business/export-articles.xml' ) + ,(u'Economics', u'http://english.pravda.ru/russia/economics/export-articles.xml') + ,(u'Politics' , u'http://english.pravda.ru/russia/politics/export-articles.xml' ) + ] + + def print_version(self, url): + return url + '?mode=print' diff --git a/recipes/pravda_it.recipe b/recipes/pravda_it.recipe new file mode 100644 index 0000000000..67ec52ed19 --- /dev/null +++ b/recipes/pravda_it.recipe @@ -0,0 +1,52 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +italia.pravda.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pravda_ita(BasicNewsRecipe): + title = 'Pravda in Italiano' + __author__ = 'Darko Miletic' + description = 'News from Russia and rest of 
the world' + publisher = 'PRAVDA.Ru' + category = 'news, politics, Russia' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'it' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://italia.pravda.ru/pix/logo.gif' + extra_css = """ + body{font-family: Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes=['lang', 'style'] + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + + feeds = [ + (u'Dal mondo' , u'http://italia.pravda.ru/world/export-articles.xml' ) + ,(u'Russia' , u'http://italia.pravda.ru/russia/export-articles.xml' ) + ,(u'Societa' , u'http://italia.pravda.ru/society/export-articles.xml' ) + ,(u'Avvenimenti', u'http://italia.pravda.ru/hotspots/export-articles.xml' ) + ,(u'Opinioni' , u'http://italia.pravda.ru/opinion/export-articles.xml' ) + ,(u'Scienza' , u'http://italia.pravda.ru/science/export-articles.xml' ) + ,(u'Economia' , u'http://italia.pravda.ru/russia/economics/export-articles.xml') + ,(u'Politica' , u'http://italia.pravda.ru/russia/politics/export-articles.xml' ) + ] + + def print_version(self, url): + return url + '?mode=print' diff --git a/recipes/pravda_por.recipe b/recipes/pravda_por.recipe new file mode 100644 index 0000000000..9022817f59 --- /dev/null +++ b/recipes/pravda_por.recipe @@ -0,0 +1,51 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +port.pravda.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pravda_port(BasicNewsRecipe): + title = u'Pravda em português' + __author__ = 'Darko Miletic' + description = 'News from Russia and rest of the world' + publisher = 'PRAVDA.Ru' + category = 'news, politics, Russia' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + 
encoding = 'utf8' + use_embedded_content = False + language = 'pt' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://port.pravda.ru/pix/logo.gif' + extra_css = """ + body{font-family: Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes=['lang', 'style'] + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + + feeds = [ + (u'Mundo' , u'http://port.pravda.ru/mundo/export-articles.xml' ) + ,(u'Russia' , u'http://port.pravda.ru/russa/export-articles.xml' ) + ,(u'Sociedade' , u'http://port.pravda.ru/sociedade/export-articles.xml' ) + ,(u'Cultura' , u'http://port.pravda.ru/sociedade/cultura/export-articles.xml') + ,(u'Ciencia' , u'http://port.pravda.ru/science/export-articles.xml' ) + ,(u'Desporto' , u'http://port.pravda.ru/desporto/export-articles.xml' ) + ,(u'CPLP' , u'http://port.pravda.ru/cplp/export-articles.xml' ) + ] + + def print_version(self, url): + return url + '?mode=print' diff --git a/recipes/pravda_ru.recipe b/recipes/pravda_ru.recipe new file mode 100644 index 0000000000..4d62c84638 --- /dev/null +++ b/recipes/pravda_ru.recipe @@ -0,0 +1,50 @@ +__license__ = 'GPL v3' +__copyright__ = '2012, Darko Miletic ' +''' +www.pravda.ru +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pravda_ru(BasicNewsRecipe): + title = u'Правда' + __author__ = 'Darko Miletic' + description = u'Правда.Ру: Аналитика и новости' + publisher = 'PRAVDA.Ru' + category = 'news, politics, Russia' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'ru' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.pravda.ru/pix/logo.gif' + extra_css = """ + body{font-family: Arial,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + 
+ conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_attributes=['lang', 'style'] + keep_only_tags = [dict(name='div', attrs={'id':'article'})] + + feeds = [ + (u'Мир' , u'http://www.pravda.ru/world/export.xml' ) + ,(u'Религия' , u'http://www.pravda.ru/faith/export.xml' ) + ,(u'Общество' , u'http://www.pravda.ru/society/export.xml' ) + ,(u'Происшествия', u'http://www.pravda.ru/accidents/export.xml') + ,(u'Наука' , u'http://www.pravda.ru/science/export.xml' ) + ,(u'Экономика' , u'http://www.pravda.ru/economics/export.xml') + ,(u'Политика' , u'http://www.pravda.ru/politics/export.xml' ) + ] + + def print_version(self, url): + return url + '?mode=print' diff --git a/recipes/red_voltaire.recipe b/recipes/red_voltaire.recipe new file mode 100644 index 0000000000..1763125a8e --- /dev/null +++ b/recipes/red_voltaire.recipe @@ -0,0 +1,32 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class RedVoltaireRecipe(BasicNewsRecipe): + title = u'Red Voltaire' + __author__ = 'atordo' + description = u'Red de prensa no alineada, especializada en el an\u00e1lisis de las relaciones internacionales' + oldest_article = 7 + max_articles_per_feed = 30 + auto_cleanup = False + no_stylesheets = True + language = 'es' + use_embedded_content = False + remove_javascript = True + cover_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + masthead_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + + preprocess_regexps = [ + (re.compile(r'(?P<titulo>.+).+

'+match.group('titulo')+'

. (?P.+).+', re.IGNORECASE|re.DOTALL) + ,lambda match:''+match.group('fecha')+'') + ,(re.compile(r'