diff --git a/.bzrignore b/.bzrignore index aaacc9f58a..f14ff947f6 100644 --- a/.bzrignore +++ b/.bzrignore @@ -35,3 +35,49 @@ nbproject/ .settings/ *.DS_Store calibre_plugins/ +recipes/.git +recipes/.gitignore +recipes/README +recipes/katalog_egazeciarz.recipe +recipes/tv_axnscifi.recipe +recipes/tv_comedycentral.recipe +recipes/tv_discoveryscience.recipe +recipes/tv_foxlife.recipe +recipes/tv_fox.recipe +recipes/tv_hbo.recipe +recipes/tv_kinopolska.recipe +recipes/tv_nationalgeographic.recipe +recipes/tv_polsat2.recipe +recipes/tv_polsat.recipe +recipes/tv_tv4.recipe +recipes/tv_tvn7.recipe +recipes/tv_tvn.recipe +recipes/tv_tvp1.recipe +recipes/tv_tvp2.recipe +recipes/tv_tvphd.recipe +recipes/tv_tvphistoria.recipe +recipes/tv_tvpkultura.recipe +recipes/tv_tvppolonia.recipe +recipes/tv_tvpuls.recipe +recipes/tv_viasathistory.recipe +recipes/icons/tv_axnscifi.png +recipes/icons/tv_comedycentral.png +recipes/icons/tv_discoveryscience.png +recipes/icons/tv_foxlife.png +recipes/icons/tv_fox.png +recipes/icons/tv_hbo.png +recipes/icons/tv_kinopolska.png +recipes/icons/tv_nationalgeographic.png +recipes/icons/tv_polsat2.png +recipes/icons/tv_polsat.png +recipes/icons/tv_tv4.png +recipes/icons/tv_tvn7.png +recipes/icons/tv_tvn.png +recipes/icons/tv_tvp1.png +recipes/icons/tv_tvp2.png +recipes/icons/tv_tvphd.png +recipes/icons/tv_tvphistoria.png +recipes/icons/tv_tvpkultura.png +recipes/icons/tv_tvppolonia.png +recipes/icons/tv_tvpuls.png +recipes/icons/tv_viasathistory.png diff --git a/Changelog.yaml b/Changelog.yaml index f4c5e25cb4..ebc2e5cad1 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,61 @@ # new recipes: # - title: +- version: 0.9.6 + date: 2012-11-10 + + new features: + - title: "Experimental support for subsetting fonts" + description: "Subsetting a font means reducing the font to contain only the glyphs for the text actually present in the book. This can easily halve the size of the font. 
calibre can now do this for all embedded fonts during a conversion. Turn it on via the 'Subset all embedded fonts' option under the Look & Feel section of the conversion dialog. calibre can subset both TrueType and OpenType fonts. Note that this code is very new and likely has bugs, so please check the output if you turn on subsetting. The conversion log will have info about the subsetting operations." + type: major + + - title: "EPUB Input: Try to work around EPUBs that have missing or damaged ZIP central directories. calibre should now be able to read/convert such an EPUB file, provided it does not suffer from further corruption." + + - title: "Allow using identifiers in save to disk templates." + tickets: [1074623] + + - title: "calibredb: Add an option to not notify the GUI" + + - title: "Catalogs: Fix long tags causing catalog generation to fail on Windows. Add the ability to cross-reference authors, i.e. to relist the authors for a book with multiple authors separately." + tickets: [1074931] + + - title: "Edit metadata dialog: Add a clear tags button to remove all tags with a single click" + + - title: "Add search to the font family chooser dialog" + + bug fixes: + - title: "Windows: Fix a long-standing bug in the device eject code that for some reason only manifested in 0.9.5." + tickets: [1075782] + + - title: "Get Books: Fix Amazon stores, Google Books store and libri.de" + + - title: "Kobo driver: More fixes for on device book matching, and list books as being on device even if the Kobo has not yet indexed them. Also some performance improvements." + tickets: [1069617] + + - title: "EPUB Output: Remove duplicate id and name attributes to eliminate pointless noise from the various epub check utilities" + + - title: "Ask for confirmation before removing plugins" + + - title: "Fix bulk convert queueing dialog becoming very long if any of the books have a very long title." 
+ tickets: [1076191] + + - title: "Fix deleting custom column tags like data from the Tag browser not updating the last modified timestamp for affected books" + tickets: [1075476] + + - title: "When updating a previously broken plugin, do not show an error message because the previous version of the plugin cannot be loaded" + + - title: "Fix regression that broke the Template Editor" + + improved recipes: + - Various updated Polish recipes + - London Review of Books + - Yemen Times + + new recipes: + - title: "Various Polish news sources" + author: Artur Stachecki + + - version: 0.9.5 date: 2012-11-02 diff --git a/manual/faq.rst b/manual/faq.rst index 8163861863..109aff440d 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -327,9 +327,8 @@ You can browse your |app| collection on your Android device is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| - * Set the :guilabel:`Preferred Output Format` in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to Tablet (this will work for phones as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your device to EPUB format by selecting them and clicking the Convert button. + * Set the :guilabel:`Preferred Output Format` in |app| to EPUB for normal Android devices or MOBI for Kindles (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) + * Convert the books you want to read on your device to EPUB/MOBI format by selecting them and clicking the Convert button. * Turn on the Content Server in |app|'s preferences and leave |app| running. Now on your Android device, open the browser and browse to @@ -650,20 +649,24 @@ If it still wont launch, start a command prompt (press the windows key and R; th Post any output you see in a help message on the `Forum `_. 
-|app| freezes when I click on anything? +|app| freezes/crashes occasionally? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are three possible things I know of, that can cause this: - * You recently connected an external monitor or TV to your computer. In this case, whenever |app| opens a new window like the edit metadata window or the conversion dialog, it appears on the second monitor where you dont notice it and so you think |app| has frozen. Disconnect your second monitor and restart calibre. + * You recently connected an external monitor or TV to your computer. In + this case, whenever |app| opens a new window like the edit metadata + window or the conversion dialog, it appears on the second monitor where + you don't notice it and so you think |app| has frozen. Disconnect your + second monitor and restart calibre. - * You are using a Wacom branded mouse. There is an incompatibility between Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom mouse. + * You are using a Wacom branded mouse. There is an incompatibility between + Wacom mice and the graphics toolkit |app| uses. Try using a non-Wacom + mouse. * If you use RoboForm, it is known to cause |app| to crash. Add |app| to - the blacklist of programs inside RoboForm to fix this. - - * Sometimes if some software has installed lots of new files in your fonts folder, |app| can crash until it finishes indexing them. Just start |app|, then leave it alone for about 20 minutes, without clicking on anything. After that you should be able to use |app| as normal. - + the blacklist of programs inside RoboForm to fix this. Or uninstall + RoboForm. |app| is not starting on OS X? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -722,8 +725,8 @@ You can switch |app| to using a backed up library folder by simply clicking the If you want to backup the |app| configuration/plugins, you have to backup the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. 
Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore. -How do I use purchased EPUB books with |app|? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use purchased EPUB books with |app| (or what do I do with .acsm files)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Most purchased EPUB books have `DRM `_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your ebook reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" ebook. The ebook file will be stored in the folder "My Digital Editions", from where you can add it to |app|. I am getting a "Permission Denied" error? 
diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe new file mode 100644 index 0000000000..c2576191dd --- /dev/null +++ b/recipes/antyweb.recipe @@ -0,0 +1,48 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class AntywebRecipe(BasicNewsRecipe): + encoding = 'utf-8' + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + title = u'Antyweb' + category = u'News' + description = u'Blog o internecie i nowych technologiach' + cover_url='' + remove_empty_feeds= True + auto_cleanup = False + no_stylesheets=True + use_embedded_content = False + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'})) + + + remove_tags =[] + remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'})) + remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'})) + remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'})) + + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + ''' + + feeds = [ + (u'Artykuly', u'feed://feeds.feedburner.com/Antyweb?format=xml'), + ] + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/autosport.recipe b/recipes/autosport.recipe new file mode 100644 index 0000000000..df98125f25 --- /dev/null +++ b/recipes/autosport.recipe @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +__license__ = 'GPL 
v3' +__author__ = 'MrStefan ' + +''' +www.autosport.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class autosport(BasicNewsRecipe): + title = u'Autosport' + __author__ = 'MrStefan ' + language = 'en_GB' + description =u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...' + masthead_url='http://cdn.images.autosport.com/asdotcom.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'news_headline'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_author'})) + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_date'})) + keep_only_tags.append(dict(name = 'p')) + + feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')] diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe new file mode 100644 index 0000000000..8a68d844b3 --- /dev/null +++ b/recipes/bankier_pl.recipe @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +bankier.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class bankier(BasicNewsRecipe): + title = u'Bankier.pl' + __author__ = 'teepel ' + language = 'pl' + description ='Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.' 
+ masthead_url='http://www.bankier.pl/gfx/hd-mid-02.gif' + INDEX='http://bankier.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + simultaneous_downloads = 5 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'align' : 'left'})) + + remove_tags =[] + remove_tags.append(dict(name = 'table', attrs = {'cellspacing' : '2'})) + remove_tags.append(dict(name = 'div', attrs = {'align' : 'center'})) + remove_tags.append(dict(name = 'img', attrs = {'src' : '/gfx/hd-mid-02.gif'})) + #remove_tags.append(dict(name = 'a', attrs = {'target' : '_blank'})) + #remove_tags.append(dict(name = 'br', attrs = {'clear' : 'all'})) + + feeds = [ + (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), + (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), + (u'Firma', u'http://feeds.feedburner.com/bankier-firma'), + (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), + (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'), + (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), + ] + def print_version(self, url): + segment = url.split('.') + urlPart = segment[2] + segments = urlPart.split('-') + urlPart2 = segments[-1] + return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + urlPart2 + diff --git a/recipes/blognexto.recipe b/recipes/blognexto.recipe new file mode 100644 index 0000000000..b5ced2cf50 --- /dev/null +++ b/recipes/blognexto.recipe @@ -0,0 +1,28 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class blognexto(BasicNewsRecipe): + title = 'BLOG.NEXTO.pl' + __author__ = 'MrStefan ' + language = 'pl' + description ='o e-publikacjach prawie wszystko' + masthead_url='http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg' + remove_empty_feeds= True + oldest_article = 7 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + 
+ keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'content'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'comment-cloud'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'post-date1'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fb-like'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'tags'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'postnavi'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'commments-box'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'respond'})) + + feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')] diff --git a/recipes/brewiarz.recipe b/recipes/brewiarz.recipe new file mode 100644 index 0000000000..5d16278b00 --- /dev/null +++ b/recipes/brewiarz.recipe @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe +import datetime, re + + +class brewiarz(BasicNewsRecipe): + title = u'Brewiarz' + __author__ = 'Artur Stachecki ' + language = 'pl' + description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.' 
+ masthead_url = 'http://brewiarz.pl/images/logo2.gif' + max_articles_per_feed = 100 + remove_javascript = True + no_stylesheets = True + publication_type = 'newspaper' + next_days = 1 + + def parse_index(self): + dec2rom_dict = {"01": "i", "02": "ii", "03": "iii", "04": "iv", + "05": "v", "06": "vi", "07": "vii", "08": "viii", + "09": "ix", "10": "x", "11": "xi", "12": "xii"} + + weekday_dict = {"Sunday": "Niedziela", "Monday": "Poniedziałek", "Tuesday": "Wtorek", + "Wednesday": "Środa", "Thursday": "Czwartek", "Friday": "Piątek", "Saturday": "Sobota"} + + now = datetime.datetime.now() + + feeds = [] + for i in range(0, self.next_days): + url_date = now + datetime.timedelta(days=i) + url_date_month = url_date.strftime("%m") + url_date_month_roman = dec2rom_dict[url_date_month] + url_date_day = url_date.strftime("%d") + url_date_year = url_date.strftime("%Y")[2:] + url_date_weekday = url_date.strftime("%A") + url_date_weekday_pl = weekday_dict[url_date_weekday] + + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/index.php3" + articles = self.parse_pages(url) + if articles: + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year + feeds.append((title, articles)) + else: + sectors = self.get_sectors(url) + for subpage in sectors: + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." 
+ url_date_year + " - " + subpage.string + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/" + subpage['href'] + print(url) + articles = self.parse_pages(url) + if articles: + feeds.append((title, articles)) + return feeds + + def get_sectors(self, url): + sectors = [] + soup = self.index_to_soup(url) + sectors_table = soup.find(name='table', attrs={'width': '490'}) + sector_links = sectors_table.findAll(name='a') + for sector_links_modified in sector_links: + link_parent_text = sector_links_modified.findParent(name='div').text + if link_parent_text: + sector_links_modified.text = link_parent_text.text + sectors.append(sector_links_modified) + return sectors + + def parse_pages(self, url): + current_articles = [] + soup = self.index_to_soup(url) + www = soup.find(attrs={'class': 'www'}) + if www: + box_title = www.find(text='Teksty LG') + article_box_parent = box_title.findParent('ul') + article_box_sibling = article_box_parent.findNextSibling('ul') + for li in article_box_sibling.findAll('li'): + link = li.find(name='a') + ol = link.findNextSibling(name='ol') + if ol: + sublinks = ol.findAll(name='a') + for sublink in sublinks: + link_title = self.tag_to_string(link) + " - " + self.tag_to_string(sublink) + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', sublink['href']) + link_url = url[:-10] + link_url_print + current_articles.append({'title': link_title, + 'url': link_url, 'description': '', 'date': ''}) + else: + if link.findParent(name = 'ol'): + continue + else: + link_title = self.tag_to_string(link) + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', link['href']) + link_url = url[:-10] + link_url_print + current_articles.append({'title': link_title, + 'url': link_url, 'description': '', 'date': ''}) + return current_articles + else: + return None + + def preprocess_html(self, soup): + footer = soup.find(name='a', attrs={'href': 'http://brewiarz.pl'}) + footer_parent = 
footer.findParent('div') + footer_parent.extract() + + header = soup.find(text='http://brewiarz.pl') + header_parent = header.findParent('div') + header_parent.extract() + + subheader = soup.find(text='Kolor szat:').findParent('div') + subheader.extract() + + color = soup.find('b') + color.extract() + + cleaned = self.strip_tags(soup) + + div = cleaned.findAll(name='div') + div[1].extract() + div[2].extract() + div[3].extract() + + return cleaned + + def strip_tags(self, soup_dirty): + VALID_TAGS = ['p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body'] + + for tag in soup_dirty.findAll(True): + if tag.name not in VALID_TAGS: + for i, x in enumerate(tag.parent.contents): + if x == tag: + break + else: + print "Can't find", tag, "in", tag.parent + continue + for r in reversed(tag.contents): + tag.parent.insert(i, r) + tag.extract() + + return soup_dirty diff --git a/recipes/buchreport.recipe b/recipes/buchreport.recipe new file mode 100644 index 0000000000..68d98d0622 --- /dev/null +++ b/recipes/buchreport.recipe @@ -0,0 +1,45 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the Buchreport to an ebook.''' + +class Buchreport(BasicNewsRecipe) : + __author__ = 'a.peter' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + description = 'Buchreport' + version = 4 + title = u'Buchreport' + timefmt = ' [%d.%m.%Y]' + encoding = 'cp1252' + language = 'de_DE' + + + extra_css = 'body { margin-left: 0.00em; margin-right: 0.00em; } \ + article, articledate, articledescription { text-align: left; } \ + h1 { text-align: left; font-size: 140%; font-weight: bold; } \ + h2 { text-align: left; font-size: 100%; font-weight: bold; font-style: italic; } \ + h3 { text-align: left; font-size: 100%; font-weight: regular; font-style: italic; } \ + h4, h5, h6 { text-align: left; font-size: 100%; font-weight: bold; }' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + 
publication_type = 'newspaper' + + remove_tags_before = dict(name='h2') + remove_tags_after = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}) + ] + remove_tags = [ + dict(name='div', attrs={'style':["padding-top:10px;clear:both"]}), + dict(name='iframe'), + dict(name='img') + ] + + feeds = [ + (u'Buchreport', u'http://www.buchreport.de/index.php?id=5&type=100') + ] + + def get_masthead_url(self): + return 'http://www.buchreport.de/fileadmin/template/img/buchreport_logo.jpg' diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index badca48733..a61c32aa42 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' +__copyright__ = '2009-2012, Darko Miletic ' ''' www.business-standard.com ''' @@ -14,10 +14,12 @@ class BusinessStandard(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + auto_cleanup = False encoding = 'cp1252' publisher = 'Business Standard Limited' category = 'news, business, money, india, world' language = 'en_IN' + masthead_url = 'http://feeds.business-standard.com/images/logo_08.jpg' conversion_options = { 'comments' : description @@ -26,7 +28,7 @@ class BusinessStandard(BasicNewsRecipe): ,'publisher' : publisher ,'linearize_tables': True } - keep_only_tags=[dict(attrs={'class':'TableClas'})] + #keep_only_tags=[dict(name='td', attrs={'class':'TableClas'})] remove_tags = [ dict(name=['object','link','script','iframe','base','meta']) ,dict(attrs={'class':'rightDiv2'}) @@ -45,3 +47,8 @@ class BusinessStandard(BasicNewsRecipe): ,(u'Management & Mktg' , u'http://feeds.business-standard.com/rss/7_0.xml' ) ,(u'Opinion' , u'http://feeds.business-standard.com/rss/5_0.xml' ) ] + + def print_version(self, url): + l, s, tp = url.rpartition('/') + t, k, autono = l.rpartition('/') + return 'http://www.business-standard.com/india/printpage.php?autono=' + 
autono + '&tp=' + tp diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe index 5254694d24..a4e24ac61b 100644 --- a/recipes/dobreprogamy.recipe +++ b/recipes/dobreprogamy.recipe @@ -6,7 +6,6 @@ class Dobreprogramy_pl(BasicNewsRecipe): __author__ = 'fenuks' __licence__ ='GPL v3' category = 'IT' - language = 'pl' masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' description = u'Aktualności i blogi z dobreprogramy.pl' @@ -29,4 +28,4 @@ class Dobreprogramy_pl(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup \ No newline at end of file + return soup diff --git a/recipes/f1_ultra.recipe b/recipes/f1_ultra.recipe new file mode 100644 index 0000000000..ada82542fc --- /dev/null +++ b/recipes/f1_ultra.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class f1ultra(BasicNewsRecipe): + title = u'Formuła 1 - F1 ultra' + __license__ = 'GPL v3' + __author__ = 'MrStefan , Artur Stachecki ' + language = 'pl' + description =u'Formuła 1, Robert Kubica, F3, GP2 oraz inne serie wyścigowe.' 
+ masthead_url='http://www.f1ultra.pl/templates/f1ultra/images/logo.gif' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[(dict(name = 'div', attrs = {'id' : 'main'}))] + remove_tags_after =[dict(attrs = {'style' : 'margin-top:5px;margin-bottom:5px;display: inline;'})] + remove_tags =[(dict(attrs = {'class' : ['buttonheading', 'avPlayerContainer', 'createdate']}))] + remove_tags.append(dict(attrs = {'title' : ['PDF', 'Drukuj', 'Email']})) + remove_tags.append(dict(name = 'form', attrs = {'method' : 'post'})) + remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'})) + + preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''), + (re.compile(r'align="right"'), lambda match: ''), + (re.compile(r'width=\"*\"'), lambda match: ''), + (re.compile(r'\'), lambda match: '')] + + + extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; } + img { display: block; clear: both;} + ''' + remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align'] + + feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')] diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe index 342aa0d2db..1954fd7803 100644 --- a/recipes/focus_pl.recipe +++ b/recipes/focus_pl.recipe @@ -2,7 +2,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class FocusRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' __author__ = u'intromatyk ' language = 'pl' @@ -12,10 +14,10 @@ class FocusRecipe(BasicNewsRecipe): publisher = u'Gruner + Jahr Polska' category = u'News' description = u'Newspaper' - category='magazine' - cover_url='' - remove_empty_feeds= True - no_stylesheets=True + category = 'magazine' + cover_url = '' + remove_empty_feeds = True + no_stylesheets = True oldest_article = 7 max_articles_per_feed = 100000 recursions = 0 @@ -27,15 +29,15 @@ class 
FocusRecipe(BasicNewsRecipe): simultaneous_downloads = 5 r = re.compile('.*(?Phttp:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*') - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'})) - - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'})) - remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'})) - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'id': 'cll'})) + + remove_tags = [] + remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'})) + remove_tags.append(dict(name='div', attrs={'class': 'txb'})) + remove_tags.append(dict(name='div', attrs={'class': 'h2'})) + remove_tags.append(dict(name='ul', attrs={'class': 'txu'})) + remove_tags.append(dict(name='div', attrs={'class': 'ulc'})) extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} @@ -44,18 +46,17 @@ class FocusRecipe(BasicNewsRecipe): p.lead {font-weight: bold; text-align: left;} .authordate {font-size: small; color: #696969;} .fot{font-size: x-small; color: #666666;} - ''' + ''' - - feeds = [ - ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'), - ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'), - ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'), - ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'), - ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'), - ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'), - ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'), - ] + feeds = [ + ('Nauka', 'http://www.focus.pl/nauka/rss/'), + ('Historia', 'http://www.focus.pl/historia/rss/'), + 
('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'), + ('Sport', 'http://www.focus.pl/sport/rss/'), + ('Technika', 'http://www.focus.pl/technika/rss/'), + ('Przyroda', 'http://www.focus.pl/przyroda/rss/'), + ('Technologie', 'http://www.focus.pl/gadzety/rss/') + ] def skip_ad_pages(self, soup): if ('advertisement' in soup.find('title').string.lower()): @@ -65,20 +66,20 @@ class FocusRecipe(BasicNewsRecipe): return None def get_cover_url(self): - soup=self.index_to_soup('http://www.focus.pl/magazyn/') - tag=soup.find(name='div', attrs={'class':'clr fl'}) + soup = self.index_to_soup('http://www.focus.pl/magazyn/') + tag = soup.find(name='div', attrs={'class': 'clr fl'}) if tag: - self.cover_url='http://www.focus.pl/' + tag.a['href'] + self.cover_url = 'http://www.focus.pl/' + tag.a['href'] return getattr(self, 'cover_url', self.cover_url) def print_version(self, url): - if url.count ('focus.pl.feedsportal.com'): + if url.count('focus.pl.feedsportal.com'): u = url.find('focus0Bpl') u = 'http://www.focus.pl/' + url[u + 11:] u = u.replace('0C', '/') u = u.replace('A', '') - u = u.replace ('0E','-') + u = u.replace('0E', '-') u = u.replace('/nc/1//story01.htm', '/do-druku/1') - else: - u = url.replace('/nc/1','/do-druku/1') - return u \ No newline at end of file + else: + u = url.replace('/nc/1', '/do-druku/1') + return u diff --git a/recipes/gazeta_pl_krakow.recipe b/recipes/gazeta_pl_krakow.recipe new file mode 100644 index 0000000000..59188a5d6a --- /dev/null +++ b/recipes/gazeta_pl_krakow.recipe @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = 'teepel based on GW from fenuks' + +''' +krakow.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class gw_krakow(BasicNewsRecipe): + title = u'Gazeta.pl Kraków' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description =u'Wiadomości z Krakowa na portalu Gazeta.pl.' 
+ category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif' + INDEX='http://krakow.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'})) + + remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})] + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', attrs={'id':'Str'}) 
+ if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_pl_warszawa.recipe b/recipes/gazeta_pl_warszawa.recipe new file mode 100644 index 0000000000..2d95bcc06f --- /dev/null +++ b/recipes/gazeta_pl_warszawa.recipe @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel based on GW from fenuks' + +''' +warszawa.gazeta.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class gw_wawa(BasicNewsRecipe): + title = u'Gazeta.pl Warszawa' + __author__ = 'teepel based on GW from fenuks' + language = 'pl' + description ='Wiadomości z Warszawy na portalu Gazeta.pl.' 
+ category='newspaper' + publication_type = 'newspaper' + masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif' + INDEX='http://warszawa.gazeta.pl/' + remove_empty_feeds= True + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})) + remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'})) + remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'})) + remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'})) + + feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')] + + def skip_ad_pages(self, soup): + tag=soup.find(name='a', attrs={'class':'btn'}) + if tag: + new_soup=self.index_to_soup(tag['href'], raw=True) + return new_soup + + + def append_page(self, soup, appendtag): + loop=False + tag = soup.find('div', attrs={'id':'Str'}) + if appendtag.find('div', attrs={'id':'Str'}): + nexturl=tag.findAll('a') + appendtag.find('div', attrs={'id':'Str'}).extract() + loop=True + if 
appendtag.find(id='source'): + appendtag.find(id='source').extract() + while loop: + loop=False + for link in nexturl: + if u'następne' in link.string: + url= self.INDEX + link['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='artykul') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('div', attrs={'id':'Str'}) + nexturl=tag.findAll('a') + loop=True + + def gallery_article(self, appendtag): + tag=appendtag.find(id='container_gal') + if tag: + nexturl=appendtag.find(id='gal_btn_next').a['href'] + appendtag.find(id='gal_navi').extract() + while nexturl: + soup2=self.index_to_soup(nexturl) + pagetext=soup2.find(id='container_gal') + nexturl=pagetext.find(id='gal_btn_next') + if nexturl: + nexturl=nexturl.a['href'] + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(id='gal_navi') + if rem: + rem.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup + diff --git a/recipes/gazeta_wyborcza.recipe b/recipes/gazeta_wyborcza.recipe index 5c034b10ab..633b80444a 100644 --- a/recipes/gazeta_wyborcza.recipe +++ b/recipes/gazeta_wyborcza.recipe @@ -1,104 +1,107 @@ # -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class Gazeta_Wyborcza(BasicNewsRecipe): - title = u'Gazeta Wyborcza' - __author__ = 'fenuks' - language = 'pl' - description ='news from gazeta.pl' - category='newspaper' + title = u'Gazeta.pl' + __author__ = 'fenuks, Artur Stachecki' + language = 'pl' + description = 'news from gazeta.pl' + category = 'newspaper' publication_type = 'newspaper' - masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' - INDEX='http://wyborcza.pl' - remove_empty_feeds= True + masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg' + INDEX = 'http://wyborcza.pl' + remove_empty_feeds = True oldest_article = 3 max_articles_per_feed = 100 - 
remove_javascript=True - no_stylesheets=True - ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = dict(id=['gazeta_article', 'article']) - remove_tags_after = dict(id='gazeta_article_share') - remove_tags = [dict(attrs={'class':['artReadMore', 'gazeta_article_related_new', 'txt_upl']}), dict(id=['gazeta_article_likes', 'gazeta_article_tools', 'rel', 'gazeta_article_tags', 'gazeta_article_share', 'gazeta_article_brand', 'gazeta_article_miniatures'])] - - feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), - (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), - (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), - (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), - (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), - (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), - (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), - #(u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), - (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), - (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), - (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), - (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), - (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), - (u'We wt. 
- Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), - (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), - (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss') - ] + remove_javascript = True + no_stylesheets = True + remove_tags_before = dict(id='k0') + remove_tags_after = dict(id='banP4') + remove_tags = [dict(name='div', attrs={'class':'rel_box'}), dict(attrs={'class':['date', 'zdjP', 'zdjM', 'pollCont', 'rel_video', 'brand', 'txt_upl']}), dict(name='div', attrs={'id':'footer'})] + feeds = [(u'Kraj', u'http://rss.feedsportal.com/c/32739/f/530266/index.rss'), (u'\u015awiat', u'http://rss.feedsportal.com/c/32739/f/530270/index.rss'), + (u'Wyborcza.biz', u'http://wyborcza.biz/pub/rss/wyborcza_biz_wiadomosci.htm'), + (u'Komentarze', u'http://rss.feedsportal.com/c/32739/f/530312/index.rss'), + (u'Kultura', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_kultura.xml'), + (u'Nauka', u'http://rss.feedsportal.com/c/32739/f/530269/index.rss'), (u'Opinie', u'http://rss.gazeta.pl/pub/rss/opinie.xml'), (u'Gazeta \u015awi\u0105teczna', u'http://rss.feedsportal.com/c/32739/f/530431/index.rss'), (u'Du\u017cy Format', u'http://rss.feedsportal.com/c/32739/f/530265/index.rss'), (u'Witamy w Polsce', u'http://rss.feedsportal.com/c/32739/f/530476/index.rss'), (u'M\u0119ska Muzyka', u'http://rss.feedsportal.com/c/32739/f/530337/index.rss'), (u'Lata Lec\u0105', u'http://rss.feedsportal.com/c/32739/f/530326/index.rss'), (u'Solidarni z Tybetem', u'http://rss.feedsportal.com/c/32739/f/530461/index.rss'), (u'W pon. - \u017bakowski', u'http://rss.feedsportal.com/c/32739/f/530491/index.rss'), (u'We wt. 
- Kolenda-Zalewska', u'http://rss.feedsportal.com/c/32739/f/530310/index.rss'), (u'\u015aroda w \u015brod\u0119', u'http://rss.feedsportal.com/c/32739/f/530428/index.rss'), (u'W pi\u0105tek - Olejnik', u'http://rss.feedsportal.com/c/32739/f/530364/index.rss'), (u'Nekrologi', u'http://rss.feedsportal.com/c/32739/f/530358/index.rss') + ] def skip_ad_pages(self, soup): - tag=soup.find(name='a', attrs={'class':'btn'}) - if tag: - new_soup=self.index_to_soup(tag['href'], raw=True) + tag = soup.find(name='a', attrs={'class': 'btn'}) + if tag: + new_soup = self.index_to_soup(tag['href'], raw=True) return new_soup - def append_page(self, soup, appendtag): - loop=False - tag = soup.find('div', attrs={'id':'Str'}) - if appendtag.find('div', attrs={'id':'Str'}): - nexturl=tag.findAll('a') - appendtag.find('div', attrs={'id':'Str'}).extract() - loop=True + loop = False + tag = soup.find('div', attrs={'id': 'Str'}) + if appendtag.find('div', attrs={'id': 'Str'}): + nexturl = tag.findAll('a') + appendtag.find('div', attrs={'id': 'Str'}).extract() + loop = True if appendtag.find(id='source'): appendtag.find(id='source').extract() while loop: - loop=False + loop = False for link in nexturl: if u'następne' in link.string: - url= self.INDEX + link['href'] + url = self.INDEX + link['href'] soup2 = self.index_to_soup(url) pagetext = soup2.find(id='artykul') pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - tag = soup2.find('div', attrs={'id':'Str'}) - nexturl=tag.findAll('a') - loop=True + tag = soup2.find('div', attrs={'id': 'Str'}) + nexturl = tag.findAll('a') + loop = True def gallery_article(self, appendtag): - tag=appendtag.find(id='container_gal') + tag = appendtag.find(id='container_gal') if tag: - nexturl=appendtag.find(id='gal_btn_next').a['href'] + nexturl = appendtag.find(id='gal_btn_next').a['href'] appendtag.find(id='gal_navi').extract() while nexturl: - soup2=self.index_to_soup(nexturl) - pagetext=soup2.find(id='container_gal') - 
nexturl=pagetext.find(id='gal_btn_next') + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(id='container_gal') + nexturl = pagetext.find(id='gal_btn_next') if nexturl: - nexturl=nexturl.a['href'] + nexturl = nexturl.a['href'] pos = len(appendtag.contents) appendtag.insert(pos, pagetext) - rem=appendtag.find(id='gal_navi') + rem = appendtag.find(id='gal_navi') if rem: rem.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - if soup.find(id='container_gal'): - self.gallery_article(soup.body) - return soup + if soup.find(attrs={'class': 'piano_btn_1'}): + return None + else: + self.append_page(soup, soup.body) + if soup.find(id='container_gal'): + self.gallery_article(soup.body) + return soup def print_version(self, url): - if 'http://wyborcza.biz/biznes/' not in url: - return url + if url.count('rss.feedsportal.com'): + u = url.find('wyborcza0Bpl') + u = 'http://www.wyborcza.pl/' + url[u + 11:] + u = u.replace('0C', '/') + u = u.replace('A', '') + u = u.replace('0E', '-') + u = u.replace('0H', ',') + u = u.replace('0I', '_') + u = u.replace('0B', '.') + u = u.replace('/1,', '/2029020,') + u = u.replace('/story01.htm', '') + print(u) + return u + elif 'http://wyborcza.pl/1' in url: + return url.replace('http://wyborcza.pl/1', 'http://wyborcza.pl/2029020') else: - return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') + return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020') def get_cover_url(self): soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html') - cover=soup.find(id='GWmini2') - soup = self.index_to_soup('http://wyborcza.pl/'+ cover.contents[3].a['href']) - self.cover_url='http://wyborcza.pl' + soup.img['src'] + cover = soup.find(id='GWmini2') + soup = self.index_to_soup('http://wyborcza.pl/' + cover.contents[3].a['href']) + self.cover_url = 'http://wyborcza.pl' + soup.img['src'] return getattr(self, 'cover_url', self.cover_url) diff --git 
a/recipes/icons/antyweb.png b/recipes/icons/antyweb.png new file mode 100644 index 0000000000..8ca9870f60 Binary files /dev/null and b/recipes/icons/antyweb.png differ diff --git a/recipes/icons/autosport.png b/recipes/icons/autosport.png new file mode 100644 index 0000000000..0c84c96a0b Binary files /dev/null and b/recipes/icons/autosport.png differ diff --git a/recipes/icons/bankier_pl.png b/recipes/icons/bankier_pl.png new file mode 100644 index 0000000000..c26f006a57 Binary files /dev/null and b/recipes/icons/bankier_pl.png differ diff --git a/recipes/icons/blognexto.png b/recipes/icons/blognexto.png new file mode 100644 index 0000000000..2a1ae4a4ae Binary files /dev/null and b/recipes/icons/blognexto.png differ diff --git a/recipes/icons/brewiarz.png b/recipes/icons/brewiarz.png new file mode 100644 index 0000000000..b47dfc95f6 Binary files /dev/null and b/recipes/icons/brewiarz.png differ diff --git a/recipes/icons/business_standard.png b/recipes/icons/business_standard.png index 1edff420c0..f4c04e566a 100644 Binary files a/recipes/icons/business_standard.png and b/recipes/icons/business_standard.png differ diff --git a/recipes/icons/f1_ultra.png b/recipes/icons/f1_ultra.png new file mode 100644 index 0000000000..f45a94f53a Binary files /dev/null and b/recipes/icons/f1_ultra.png differ diff --git a/recipes/icons/gazeta_pl_krakow.png b/recipes/icons/gazeta_pl_krakow.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_krakow.png differ diff --git a/recipes/icons/gazeta_pl_szczecin.png b/recipes/icons/gazeta_pl_szczecin.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_szczecin.png differ diff --git a/recipes/icons/gazeta_pl_warszawa.png b/recipes/icons/gazeta_pl_warszawa.png new file mode 100644 index 0000000000..119afbba3a Binary files /dev/null and b/recipes/icons/gazeta_pl_warszawa.png differ diff --git a/recipes/icons/gazeta_wyborcza.png 
b/recipes/icons/gazeta_wyborcza.png index 9e480cc41d..119afbba3a 100644 Binary files a/recipes/icons/gazeta_wyborcza.png and b/recipes/icons/gazeta_wyborcza.png differ diff --git a/recipes/icons/mateusz_czytania.png b/recipes/icons/mateusz_czytania.png new file mode 100644 index 0000000000..7568139433 Binary files /dev/null and b/recipes/icons/mateusz_czytania.png differ diff --git a/recipes/icons/myapple_pl.png b/recipes/icons/myapple_pl.png new file mode 100644 index 0000000000..a68cf4e7ef Binary files /dev/null and b/recipes/icons/myapple_pl.png differ diff --git a/recipes/icons/naszdziennik.png b/recipes/icons/naszdziennik.png new file mode 100644 index 0000000000..b557a7835e Binary files /dev/null and b/recipes/icons/naszdziennik.png differ diff --git a/recipes/icons/rushisaband.png b/recipes/icons/rushisaband.png new file mode 100644 index 0000000000..9a7d4237cf Binary files /dev/null and b/recipes/icons/rushisaband.png differ diff --git a/recipes/icons/rynek_infrastruktury.png b/recipes/icons/rynek_infrastruktury.png new file mode 100644 index 0000000000..8d2e6ac27b Binary files /dev/null and b/recipes/icons/rynek_infrastruktury.png differ diff --git a/recipes/icons/rynek_kolejowy.png b/recipes/icons/rynek_kolejowy.png new file mode 100644 index 0000000000..e9dd5fc464 Binary files /dev/null and b/recipes/icons/rynek_kolejowy.png differ diff --git a/recipes/icons/satkurier.png b/recipes/icons/satkurier.png new file mode 100644 index 0000000000..6e71bb2450 Binary files /dev/null and b/recipes/icons/satkurier.png differ diff --git a/recipes/icons/telepolis_pl.png b/recipes/icons/telepolis_pl.png new file mode 100644 index 0000000000..0b94658d94 Binary files /dev/null and b/recipes/icons/telepolis_pl.png differ diff --git a/recipes/icons/wprost.png b/recipes/icons/wprost.png new file mode 100644 index 0000000000..f81878f2d2 Binary files /dev/null and b/recipes/icons/wprost.png differ diff --git a/recipes/kerrang.recipe b/recipes/kerrang.recipe new file mode 
100644 index 0000000000..bbd944eb62 --- /dev/null +++ b/recipes/kerrang.recipe @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class kerrang(BasicNewsRecipe): + title = u'Kerrang!' + __author__ = 'Artur Stachecki ' + language = 'en_GB' + description = u'UK-based magazine devoted to rock music published by Bauer Media Group' + oldest_article = 7 + masthead_url = 'http://images.kerrang.com/design/kerrang/kerrangsite/logo.gif' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs = {'class' : ['headz', 'blktxt']})) + + extra_css = ''' img { display: block; margin-right: auto;} + h1 {text-align: left; font-size: 22px;}''' + + feeds = [(u'News', u'http://www.kerrang.com/blog/rss.xml')] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/lequipe.recipe b/recipes/lequipe.recipe new file mode 100644 index 0000000000..c6e9a26880 --- /dev/null +++ b/recipes/lequipe.recipe @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class leequipe(BasicNewsRecipe): + title = u'l\'equipe' + __author__ = 'Artur Stachecki ' + language = 'fr' + description = u'Retrouvez tout le sport en direct sur le site de L\'EQUIPE et suivez l\'actualité du football, rugby, basket, cyclisme, f1, volley, hand, tous les résultats sportifs' + oldest_article = 1 + masthead_url = 'http://static.lequipe.fr/v6/img/logo-lequipe.png' + max_articles_per_feed = 100 + simultaneous_downloads = 5 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + recursions = 0 + + keep_only_tags = [] + keep_only_tags.append(dict(attrs={'id': ['article']})) + + 
remove_tags = [] + remove_tags.append(dict(attrs={'id': ['partage', 'ensavoirplus', 'bloc_bas_breve', 'commentaires', 'tools']})) + remove_tags.append(dict(attrs={'class': ['partage_bis', 'date']})) + + feeds = [(u'Football', u'http://www.lequipe.fr/rss/actu_rss_Football.xml'), + (u'Auto-Moto', u'http://www.lequipe.fr/rss/actu_rss_Auto-Moto.xml'), + (u'Tennis', u'http://www.lequipe.fr/rss/actu_rss_Tennis.xml'), + (u'Golf', u'http://www.lequipe.fr/rss/actu_rss_Golf.xml'), + (u'Rugby', u'http://www.lequipe.fr/rss/actu_rss_Rugby.xml'), + (u'Basket', u'http://www.lequipe.fr/rss/actu_rss_Basket.xml'), + (u'Hand', u'http://www.lequipe.fr/rss/actu_rss_Hand.xml'), + (u'Cyclisme', u'http://www.lequipe.fr/rss/actu_rss_Cyclisme.xml'), + (u'Autres Sports', u'http://pipes.yahoo.com/pipes/pipe.run?_id=2039f7f4f350c70c5e4e8633aa1b37cd&_render=rss') + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/mateusz_czytania.recipe b/recipes/mateusz_czytania.recipe new file mode 100644 index 0000000000..ba7c598787 --- /dev/null +++ b/recipes/mateusz_czytania.recipe @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__author__ = 'teepel ' + +''' +http://www.mateusz.pl/czytania +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class czytania_mateusz(BasicNewsRecipe): + title = u'Czytania na ka\u017cdy dzie\u0144' + __author__ = 'teepel ' + description = u'Codzienne czytania z jednego z najstarszych polskich serwisów katolickich.' 
+ language = 'pl' + INDEX='http://www.mateusz.pl/czytania' + oldest_article = 1 + remove_empty_feeds= True + no_stylesheets=True + auto_cleanup = True + remove_javascript = True + simultaneous_downloads = 2 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Czytania', u'http://mateusz.pl/rss/czytania/')] + + remove_tags =[] + remove_tags.append(dict(name = 'p', attrs = {'class' : 'top'})) + + #thanks t3d + def get_article_url(self, article): + link = article.get('link') + if 'kmt.pl' not in link: + return link diff --git a/recipes/money_pl.recipe b/recipes/money_pl.recipe index 075264f8f7..475c2059ff 100644 --- a/recipes/money_pl.recipe +++ b/recipes/money_pl.recipe @@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class FocusRecipe(BasicNewsRecipe): __license__ = 'GPL v3' - __author__ = u'intromatyk ' + __author__ = u'Artur Stachecki ' language = 'pl' version = 1 diff --git a/recipes/myapple_pl.recipe b/recipes/myapple_pl.recipe new file mode 100644 index 0000000000..df5708a325 --- /dev/null +++ b/recipes/myapple_pl.recipe @@ -0,0 +1,49 @@ + +from calibre.web.feeds.news import BasicNewsRecipe + +class MyAppleRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = u'Artur Stachecki ' + language = 'pl' + version = 1 + + title = u'MyApple.pl' + category = u'News' + description = u' Największy w Polsce serwis zajmujący się tematyką związaną z Apple i wszelkimi produktami tej firmy.' 
+ cover_url='' + remove_empty_feeds= True + no_stylesheets=True + oldest_article = 7 + max_articles_per_feed = 100000 + recursions = 0 + + no_stylesheets = True + remove_javascript = True + simultaneous_downloads = 3 + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article_content'})) + + remove_tags =[] + remove_tags.append(dict(name = 'div', attrs = {'class' : 'article_author_date_comment_container'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fullwidth'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'cmslinks'})) + remove_tags.append(dict(name = 'div', attrs = {'class' : 'googleads-468'})) + remove_tags.append(dict(name = 'div', attrs = {'id' : 'comments'})) + + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} + td.contentheading{font-size: large; font-weight: bold;} + ''' + + feeds = [ + ('News', 'feed://myapple.pl/external.php?do=rss&type=newcontent§ionid=1&days=120&count=10'), + ] + + def preprocess_html(self, soup): + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) + return soup diff --git a/recipes/naszdziennik.recipe b/recipes/naszdziennik.recipe new file mode 100644 index 0000000000..4c7b78c199 --- /dev/null +++ b/recipes/naszdziennik.recipe @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class naszdziennik(BasicNewsRecipe): + title = u'Nasz Dziennik' + __author__ = 'Artur Stachecki ' + language = 'pl' + description =u'Nasz Dziennik - Ogólnopolska gazeta codzienna. Podejmuje tematykę dotyczącą życia społecznego, kulturalnego, politycznego i religijnego. Propaguje wartości chrześcijańskie oraz tradycję i kulturę polską.' 
+ masthead_url='http://www.naszdziennik.pl/images/logo-male.png' + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets = True + + keep_only_tags =[dict(attrs = {'id' : 'article'})] + + #definiujemy nową funkcje; musi zwracać listę feedów wraz z artykułami + def parse_index(self): + #adres do parsowania artykułów + soup = self.index_to_soup('http://www.naszdziennik.pl/news') + #deklaracja pustej listy feedów + feeds = [] + #deklaracja pustego słownika artykułów + articles = {} + #deklaracja pustej listy sekcji + sections = [] + #deklaracja pierwszej sekcji jako pusty string + section = '' + + #pętla for, która analizuje po kolei każdy tag "news-article" + for item in soup.findAll(attrs = {'class' : 'news-article'}) : + #w tagu "news-article szukamy pierwszego taga h4" + section = item.find('h4') + #zmiennej sekcja przypisujemy zawartość tekstową taga + section = self.tag_to_string(section) + #sprawdzamy czy w słowniku artykułów istnieje klucz dotyczący sekcji + #jeśli nie istnieje to : + if not articles.has_key(section) : + #do listy sekcji dodajemy nową sekcje + sections.append(section) + #deklarujemy nową sekcje w słowniku artykułów przypisując jej klucz odpowiadający nowej sekcji, którego wartością jest pusta lista + articles[section] = [] + #przeszukujemy kolejny tag "title-datetime" + article_title_datetime = item.find(attrs = {'class' : 'title-datetime'}) + #w tagu title-datetime znajdujemy pierwszy link + article_a = article_title_datetime.find('a') + #i tworzymy z niego link absolutny do właściwego artykułu + article_url = 'http://naszdziennik.pl' + article_a['href'] + #jako tytuł użyty będzie tekst pomiędzy tagami + article_title = self.tag_to_string(article_a) + #a data będzie tekstem z pierwszego taga h4 znalezionego w tagu title-datetime + article_date = self.tag_to_string(article_title_datetime.find('h4')) + #zebrane elementy dodajemy do listy zadeklarowanej w linijce 44 + articles[section].append( { 'title' : article_title, 'url' : 
article_url, 'date' : article_date }) + #po dodaniu wszystkich artykułów dodajemy sekcje do listy feedów, korzystając z list sekcji znajdujących się w słowniku + for section in sections: + feeds.append((section, articles[section])) + #zwracamy listę feedów, której parsowaniem zajmie się calibre + return feeds \ No newline at end of file diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 9eeb8b31ee..c5f1b0aff2 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -22,9 +22,9 @@ class NewYorker(BasicNewsRecipe): masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif' extra_css = """ body {font-family: "Times New Roman",Times,serif} - .articleauthor{color: #9F9F9F; + .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; - font-size: small; + font-size: small; text-transform: uppercase} .rubric,.dd,h6#credit{color: #CD0021; font-family: Arial, sans-serif; @@ -63,11 +63,11 @@ class NewYorker(BasicNewsRecipe): return url.strip() def get_cover_url(self): - cover_url = None - soup = self.index_to_soup('http://www.newyorker.com/magazine/toc/') - cover_item = soup.find('img',attrs={'id':'inThisIssuePhoto'}) + cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg" + soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine') + cover_item = soup.find('div',attrs={'id':'media-count-1'}) if cover_item: - cover_url = 'http://www.newyorker.com' + cover_item['src'].strip() + cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip() return cover_url def preprocess_html(self, soup): diff --git a/recipes/red_voltaire.recipe b/recipes/red_voltaire.recipe new file mode 100644 index 0000000000..1763125a8e --- /dev/null +++ b/recipes/red_voltaire.recipe @@ -0,0 +1,32 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class RedVoltaireRecipe(BasicNewsRecipe): + title = u'Red Voltaire' + __author__ = 'atordo' + description = u'Red de prensa no alineada, 
especializada en el an\u00e1lisis de las relaciones internacionales' + oldest_article = 7 + max_articles_per_feed = 30 + auto_cleanup = False + no_stylesheets = True + language = 'es' + use_embedded_content = False + remove_javascript = True + cover_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + masthead_url = u'http://www.voltairenet.org/squelettes/elements/images/logo-voltairenet-org.png' + + preprocess_regexps = [ + (re.compile(r'(?P<titulo>.+).+

'+match.group('titulo')+'

. (?P.+).+', re.IGNORECASE|re.DOTALL) + ,lambda match:''+match.group('fecha')+'') + ,(re.compile(r'
'), lambda match: ''), (re.compile(r'\'), lambda match: ''), - (re.compile(r'\
'), lambda match: '')] + (re.compile(r'\'), lambda match: ''), + (re.compile(r'\