diff --git a/Changelog.yaml b/Changelog.yaml
index ec01df0107..ae7802da1b 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,49 @@
 # new recipes:
 #  - title:
+- version: 0.9.19
+  date: 2013-02-15
+
+  new features:
+  - title: "New tool: \"Polish books\" that allows you to perform various automated cleanup actions on EPUB and AZW3 files without doing a full conversion."
+    type: major
+    description: "Polishing books is all about putting the shine of perfection on your ebook files. You can use it to subset embedded fonts, update the metadata in the book files from the metadata in the calibre library, manipulate the book jacket, etc. More features will be added in the future. To use this tool, go to Preferences->Toolbar and add the Polish books tool to the main toolbar. Then simply select the books you want polished and click the Polish books button. Polishing, unlike conversion, does not change the internal structure/markup of your book; it performs only the minimal set of actions needed to achieve its goals. Note that Polish books is a completely new codebase, so there may well be bugs. Polishing a book backs up the original as ORIGINAL_EPUB or ORIGINAL_AZW3, unless you have turned off this feature in Preferences->Tweaks, in which case you should back up your files manually. You can also use this tool from the command line with ebook-polish.exe."
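For example, subsetting fonts and refreshing metadata from the command line might look like this (the option names below are assumptions inferred from the feature list above, not switches confirmed by this changelog; run ``ebook-polish --help`` for the real ones)::

    ebook-polish --subset-fonts --update-metadata book.epub polished.epub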
+
+  - title: "Driver for the Trekstor Pyrus Mini."
+    tickets: [1124120]
+
+  - title: "E-book viewer: Add an option to change the minimum font size."
+    tickets: [1122333]
+
+  - title: "PDF Output: Add support for converting documents with math typesetting, as described here: http://manual.calibre-ebook.com/typesetting_math.html"
+
+  - title: "Column coloring/icons: Add more conditions when using date based columns with reference to 'today'."
+
+  bug fixes:
+  - title: "Transforming to titlecase - handle typographic hyphens in all caps phrases"
+
+  - title: "Don't ignore file open events that occur before the GUI is initialized on OS X"
+    tickets: [1122713]
+
+  - title: "News download: Handle feeds that have entries with empty ids"
+
+  - title: "Fix a regression that broke using the template editor"
+
+  - title: "Do not block startup while scanning the computer for available network interfaces. Speeds up startup time on some Windows computers with lots of spurious network interfaces."
+
+  improved recipes:
+  - New Yorker
+  - Kommersant
+  - Le Monde (Subscription version)
+  - NZ Herald
+
+  new recipes:
+  - title: Navegalo
+    author: Douglas Delgado
+
+  - title: El Guardian and More Intelligent Life
+    author: Darko Miletic
+
 - version: 0.9.18
   date: 2013-02-08
diff --git a/manual/faq.rst b/manual/faq.rst
index b5f8f382b1..51dc768611 100644
--- a/manual/faq.rst
+++ b/manual/faq.rst
@@ -250,42 +250,71 @@
 If you don't want to uninstall it altogether, there are a couple of tricks you can use. The
 simplest is to simply re-name the executable file that launches the library program. More detail `in the forums `_.

-How do I use |app| with my iPad/iPhone/iTouch?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+How do I use |app| with my iPad/iPhone/iPod touch?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Over the air
 ^^^^^^^^^^^^^^

-The easiest way to browse your |app| collection on your Apple device (iPad/iPhone/iPod) is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app|
+The easiest way to browse your |app| collection on your Apple device
+(iPad/iPhone/iPod) is by using the |app| content server, which makes your
+collection available over the net. First perform the following steps in |app|

-  * Set the Preferred Output Format in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`)
-  * Set the output profile to iPad (this will work for iPhone/iPods as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup`
-  * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button.
-  * Turn on the Content Server in |app|'s preferences and leave |app| running.
+  * Set the Preferred Output Format in |app| to EPUB (The output format can be
+    set under :guilabel:`Preferences->Interface->Behavior`)
+  * Set the output profile to iPad (this will work for iPhone/iPods as well),
+    under :guilabel:`Preferences->Conversion->Common Options->Page Setup`
+  * Convert the books you want to read on your iDevice to EPUB format by
+    selecting them and clicking the Convert button.
+  * Turn on the Content Server by clicking the :guilabel:`Connect/Share` button
+    and leave |app| running. You can also tell |app| to automatically start the
+    content server via :guilabel:`Preferences->Sharing over the net`.

-Now on your iPad/iPhone you have two choices, use either iBooks (version 1.2 and later) or Stanza (version 3.0 and later). Both are available free from the app store.
+There are many apps for your iDevice that can connect to |app|. Here we
+describe using two of them, iBooks and Stanza.

 Using Stanza
 ***************

-Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPad/iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. In the Add Book Source screen enter whatever name you like and in the URL field, enter the following::
+You should be able to access your books on your iPhone by opening Stanza. Go to
+"Get Books" and then click the "Shared" tab. Under Shared you will see an entry
+"Books in calibre". If you don't, make sure your iPad/iPhone is connected using
+the WiFi network in your house, not 3G. If the |app| catalog is still not
+detected in Stanza, you can add it manually in Stanza. To do this, click the
+"Shared" tab, then click the "Edit" button and then click "Add book source" to
+add a new book source. In the Add Book Source screen enter whatever name you
+like and in the URL field, enter the following::

     http://192.168.1.2:8080/

-Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done.
+Replace ``192.168.1.2`` with the local IP address of the computer running
+|app|. If you have changed the port the |app| content server is running on, you
+will have to change ``8080`` as well to the new port. The local IP address is
+the IP address your computer is assigned on your home network. A quick Google
+search will tell you how to find out your local IP address.
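If you are not sure what the local IP address is, the operating system's own tools will show it; these are standard system commands, not part of |app|::

    ipconfig      (Windows)
    ifconfig      (OS X and Linux)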
+Now click "Save" and you are done.

-If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout.
+If you get timeout errors while browsing the calibre catalog in Stanza, try
+increasing the connection timeout value in the Stanza settings. Go to
+Info->Settings and increase the value of Download Timeout.

 Using iBooks
 **************

-Start the Safari browser and type in the IP address and port of the computer running the calibre server, like this::
+Start the Safari browser and type in the IP address and port of the computer
+running the calibre server, like this::

     http://192.168.1.2:8080/

-Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address.
+Replace ``192.168.1.2`` with the local IP address of the computer running
+|app|. If you have changed the port the |app| content server is running on, you
+will have to change ``8080`` as well to the new port. The local IP address is
+the IP address your computer is assigned on your home network. A quick Google
+search will tell you how to find out your local IP address.

-You will see a list of books in Safari, just click on the epub link for whichever book you want to read, Safari will then prompt you to open it with iBooks.
+You will see a list of books in Safari. Just click on the epub link for
+whichever book you want to read; Safari will then prompt you to open it with
+iBooks.
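The content server can also be started on its own from a terminal, which is a quick way to verify that it is reachable from the device at all; a minimal invocation, assuming the default port used in the examples above (see ``calibre-server --help`` for the full option list)::

    calibre-server --port 8080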
With the USB cable + iTunes diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 2224937f3c..b02460695e 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe): max_articles_per_feed = 100 cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' - use_embedded_content=False + use_embedded_content = False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), (re.compile(r''), lambda match: ''), (re.compile(r''), lambda match: '')] @@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe): extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): + '''def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') tag=soup.find(name='channel') @@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe): for feed in feeds: for article in feed.articles[:]: article.title=titles[feed.articles.index(article)] - return feeds + return feeds''' '''def get_cover_url(self): cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] return getattr(self, 'cover_url', self.cover_url)''' - + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + article.title = result.group(1) + else: + result = soup.body.find('strong') + if result: + article.title = result.string def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer middle-border'})
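Both this recipe and bash_org_pl.recipe below defer title extraction to ``populate_article_metadata``, a ``BasicNewsRecipe`` hook that is called once per downloaded article. A minimal sketch of the pattern (the class name and the ``h1`` selector are illustrative, not taken from this commit)::

    from calibre.web.feeds.news import BasicNewsRecipe

    class Example(BasicNewsRecipe):
        title = 'Example'
        feeds = [('News', 'http://example.com/feed')]

        def populate_article_metadata(self, article, soup, first):
            # soup is the parsed article page, so the real headline
            # can replace whatever stub title came from the feed.
            tag = soup.find('h1')
            if tag and tag.string:
                article.title = tag.string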
diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe new file mode 100644 index 0000000000..01499f6369 --- /dev/null +++ b/recipes/badania_net.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class BadaniaNet(BasicNewsRecipe): + title = u'badania.net' + __author__ = 'fenuks' + description = u'chcesz wiedzieć więcej?' + category = 'science' + language = 'pl' + cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] + remove_tags_after = dict(attrs={'class':'omc-single-tags'}) + keep_only_tags = [dict(id='omc-full-article')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/feed/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..a04f267ca3 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', attrs={'class':'qid click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index 2b0933b58d..21d3b607d2 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False - remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})] + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/eso_pl.recipe b/recipes/eso_pl.recipe new file mode 100644 index 0000000000..5ebb420396 --- /dev/null +++ b/recipes/eso_pl.recipe @@ -0,0 +1,23 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ESO(BasicNewsRecipe): + title = u'ESO PL' + __author__ = 'fenuks' + description = u'ESO, Europejskie Obserwatorium Południowe, buduje i obsługuje najbardziej zaawansowane naziemne teleskopy astronomiczne na świecie' + category = 'astronomy' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1922519424/eso-twitter-logo.png' + keep_only_tags = [dict(attrs={'class':'subcl'})] + remove_tags = [dict(id='lang_row'), dict(attrs={'class':['pr_typeid', 'pr_news_feature_link', 'outreach_usage', 'hidden']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.eso.org/public/poland/news/feed/'), (u'Og\u0142oszenia', u'http://www.eso.org/public/poland/announcements/feed/'), (u'Zdj\u0119cie tygodnia', u'http://www.eso.org/public/poland/images/potw/feed/')] + + def preprocess_html(self, soup): + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.eso.org' + a['href'] + return soup
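New recipes like these are normally given a trial run from the command line before being committed; a typical check (``--test`` fetches only a couple of articles per feed, ``-vv`` makes the output verbose) might be::

    ebook-convert eso_pl.recipe .epub --test -vv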
diff --git a/recipes/icons/badania_net.png b/recipes/icons/badania_net.png new file mode 100644 index 0000000000..de915de8d1 Binary files /dev/null and b/recipes/icons/badania_net.png differ diff --git a/recipes/icons/eso_pl.png b/recipes/icons/eso_pl.png new file mode 100644 index 0000000000..4f3319fece Binary files /dev/null and b/recipes/icons/eso_pl.png differ diff --git a/recipes/icons/kurier_galicyjski.png b/recipes/icons/kurier_galicyjski.png new file mode 100644 index 0000000000..4d66a15122 Binary files /dev/null and b/recipes/icons/kurier_galicyjski.png differ diff --git a/recipes/icons/nauka_w_polsce.png b/recipes/icons/nauka_w_polsce.png new file mode 100644 index 0000000000..0d872ce682 Binary files /dev/null and b/recipes/icons/nauka_w_polsce.png differ diff --git a/recipes/icons/osworld_pl.png b/recipes/icons/osworld_pl.png new file mode 100644 index 0000000000..97a7d0dd55 Binary files /dev/null and b/recipes/icons/osworld_pl.png differ diff --git a/recipes/icons/ubuntu_pomoc_org.png b/recipes/icons/ubuntu_pomoc_org.png new file mode 100644 index 0000000000..a143846630 Binary files /dev/null and b/recipes/icons/ubuntu_pomoc_org.png differ diff --git a/recipes/icons/wprost_rss.png b/recipes/icons/wprost_rss.png new file mode 100644 index 0000000000..5ce1b5563d Binary files /dev/null and b/recipes/icons/wprost_rss.png differ diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'
<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'
[second pattern, the remainder of the informacje_usa.recipe hunk, and the header of the new recipes/kurier_galicyjski.recipe lost to markup stripping; the fragment below is the tail of its preprocess_html]
')) + img.insert(len(img.contents), bs('<br />
')) + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://kuriergalicyjski.com' + a['href'] + return soup diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 56156166dc..dc9fa9d36f 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,166 +1,94 @@ -#!/usr/bin/env python - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +__author__ = 'Sylvain Durand ' __license__ = 'GPL v3' -__copyright__ = '2012, 2013, Rémi Vanicat ' -''' -Lemonde.fr: Version abonnée -''' +import time -import os, zipfile, re, time -from urllib2 import HTTPError -from calibre.constants import preferred_encoding - +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile +from urllib2 import HTTPError -class LeMondeAbonne(BasicNewsRecipe): +class LeMonde(BasicNewsRecipe): - title = u'Le Monde: Édition abonnés' - __author__ = u'Rémi Vanicat' - description = u'Actualités' - category = u'Actualités, France, Monde' - publisher = 'Le Monde' - language = 'fr' - needs_subscription = True - no_stylesheets = True - smarten_punctuation = True - remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height'] - extra_css = ''' li{margin:6pt 0} - ul{margin:0} + title = u'Le Monde: Édition abonnés' + __author__ = 'Sylvain Durand' + description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.' + language = 'fr' + encoding = 'utf8' - div.photo img{max-width:100%; border:0px transparent solid;} - div.photo{font-family:inherit; color:#333; text-align:center;} - div.photo p{text-align:justify;font-size:.9em; line-height:.9em;} + needs_subscription = True - @page{margin:10pt} - .ar-txt {color:#000; text-align:justify;} - h1{text-align:left; font-size:1.25em;} + date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' + login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' + masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png' + couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' - .auteur{text-align:right; font-weight:bold} - .feed{text-align:right; font-weight:bold} - .po-ti2{font-weight:bold} - .fen-tt{font-weight:bold;font-size:1.1em} - ''' + extra_css = ''' + img{max-width:100%} + h1{font-size:1.2em !important; line-height:1.2em !important; } + h2{font-size:1em !important; line-height:1em !important; } + h3{font-size:1em !important; text-transform:uppercase !important; color:#666;} + #photo{text-align:center !important; margin:10px 0 -8px;} + #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} ''' - zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' - coverurl_format = '/img/%y%m%d01.jpg' - path_format = "%y%m%d" - login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])] - keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 
'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] - - - remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })] - - article_id_pattern = re.compile("[0-9]+\\.html") - article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' + def __init__(self, options, log, progress_reporter): + BasicNewsRecipe.__init__(self, options, log, progress_reporter) + br = BasicNewsRecipe.get_browser(self) + second = time.time() + 24*60*60 + for i in range(7): + self.date = time.gmtime(second) + try: + br.open(time.strftime(self.date_url,self.date)) + break + except HTTPError: + second -= 24*60*60 + self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ') def get_browser(self): br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open(self.login_url) - br.select_form(nr=0) - br['login'] = self.username - br['password'] = self.password - br.submit() + br.open(self.login_url) + br.select_form(nr=0) + br['login'] = self.username + br['password'] = self.password + br.submit() return br - decalage = 24 * 60 * 60 # today Monde has tomorow date - def get_cover_url(self): - url = time.strftime(self.coverurl_format, self.ltime) - return self.articles_path + url + url = time.strftime(self.couverture_url,self.date) + return url def parse_index(self): - browser = self.get_browser() - - second = time.time() - second += self.decalage - - for i in range(7): - self.ltime = time.gmtime(second) - self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding) - url = time.strftime(self.zipurl_format,self.ltime) - try: - response = browser.open(url) - continue - except HTTPError: - second -= 24*60*60 - - tmp = PersistentTemporaryFile(suffix='.zip') - self.report_progress(0.1,_('downloading zip file')) - tmp.write(response.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0.1,_('extracting zip file')) - - zfile.extractall(self.output_dir) - zfile.close() - - path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data") - - self.articles_path = path - - files = os.listdir(path) - - nb_index_files = len([ name for name in files if re.match("frame_gauche_[0-9]+.html", name) ]) - - flux = [] - - article_url = time.strftime(self.article_url_format, self.ltime) - - for i in range(nb_index_files): - filename = os.path.join(path, "selection_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES) - title=soup.find('span').contents[0] - if title=="Une": - title="À la une" - if title=="Evenement": - title="L'événement" - if title=="Planete": - title="Planète" - if title=="Economie - Entreprises": - title="Économie" - if title=="L'Oeil du Monde": - title="L'œil du Monde" - if title=="Enquete": - title="Enquête" - if title=="Editorial - Analyses": - title="Analyses" - if title=="Le Monde Economie": - title="Économie" - if title=="Le Monde Culture et idées": - title="Idées" - if title=="Le Monde Géo et politique": - title="Géopolitique" - tmp.close() - - filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup = BeautifulSoup(tmp) + url = time.strftime(self.journal_url,self.date) + soup = self.index_to_soup(url).sommaire + sections = [] + for sec in soup.findAll("section"): articles = [] - for link in soup.findAll("a"): - article_file = link['href'] - 
article_id=self.article_id_pattern.search(article_file).group() - article = { - 'title': link.contents[0], - 'url': article_url + article_id, - 'description': '', - 'content': '' - } - articles.append(article) - tmp.close() + if sec['cahier'] != "Le Monde": + for col in sec.findAll("fnts"): + col.extract() + if sec['cahier']=="Le Monde Magazine": + continue + for art in sec.findAll("art"): + if art.txt.string and art.ttr.string: + if art.find(['url']): + art.insert(6,'
<div id="photo"><img src="'+art.find(['url']).string+'"/></div>') + if art.find(['lgd']) and art.find(['lgd']).string: + art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>
') + article = "<html><head></head><body>"+unicode(art)+"</body></html>" + article = article.replace('','').replace(' oC ','°C ') + article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>') + f = PersistentTemporaryFile() + f.write(article) + articles.append({'title':art.ttr.string,'url':"file:///"+f.name}) + sections.append((sec['nom'], articles)) + return sections - flux.append((title, articles)) + def preprocess_html(self, soup): + for lgd in soup.findAll(id="lgd"): + lgd.contents[-1].extract() + return soup - return flux - - - -# Local Variables: -# mode: python -# End: diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index 741397d08a..1eaa08d23a 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -1,5 +1,5 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): title = u'Młody technik' @@ -9,7 +9,19 @@ class Mlody_technik(BasicNewsRecipe): language = 'pl' cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + preprocess_regexps = [(re.compile(r"
<h4>Podobne</h4>
", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe new file mode 100644 index 0000000000..c524c18b26 --- /dev/null +++ b/recipes/nauka_w_polsce.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class NaukawPolsce(BasicNewsRecipe): + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' + category = 'science' + language = 'pl' + cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + index = 'http://www.naukawpolsce.pl' + keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] + remove_tags = [dict(name='div', attrs={'class':'tagi'})] + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + for i in soup.findAll(name='div', attrs={'class':'aktualnosci-margines lista-depesz information-content'}): + title = i.h1.a.string + url = self.index + i.h1.a['href'] + date = '' #i.span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Historia i kultura", self.find_articles('http://www.naukawpolsce.pl/historia-i-kultura/'))) + feeds.append((u"Kosmos", self.find_articles('http://www.naukawpolsce.pl/kosmos/'))) + feeds.append((u"Przyroda", self.find_articles('http://www.naukawpolsce.pl/przyroda/'))) + feeds.append((u"Społeczeństwo", self.find_articles('http://www.naukawpolsce.pl/spoleczenstwo/'))) + feeds.append((u"Technologie", self.find_articles('http://www.naukawpolsce.pl/technologie/'))) + feeds.append((u"Uczelnie", self.find_articles('http://www.naukawpolsce.pl/uczelnie/'))) + feeds.append((u"Nauki medyczne", self.find_articles('http://www.naukawpolsce.pl/zdrowie/'))) + + return feeds + + def preprocess_html(self, soup): + for p in soup.findAll(name='p', text=re.compile(' ')): + p.extract() + return soup diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index c5f1b0aff2..2730b45d6d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = 
'2008-2013, Darko Miletic ' ''' newyorker.com ''' @@ -44,20 +44,18 @@ class NewYorker(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [ - dict(name='div', attrs={'class':'headers'}) ,dict(name='div', attrs={'id':['articleheads','items-container','articleRail','articletext','photocredits']}) - ] + keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})] remove_tags = [ dict(name=['meta','iframe','base','link','embed','object']) - ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] }) + ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] }) ,dict(attrs={'id':['show-header','show-footer'] }) ] + remove_tags_after = dict(attrs={'class':'entry-content'}) remove_attributes = ['lang'] feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')] def print_version(self, url): - return url + '?printable=true' + return url + '?printable=true&currentPage=all' def image_url_processor(self, baseurl, url): return url.strip() diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe new file mode 100644 index 0000000000..7784a271e0 --- /dev/null +++ b/recipes/osworld_pl.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OSWorld(BasicNewsRecipe): + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' + category = 'OS, IT, open source, Linux' + language = 'pl' + cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id=['dzial', 'posts'])] + remove_tags = [dict(attrs={'class':'post-comments'})] + remove_tags_after = dict(attrs={'class':'entry clr'}) + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + + def append_page(self, soup, appendtag): + tag = appendtag.find(attrs={'id':'paginacja'}) + if tag: + for nexturl in tag.findAll('a'): + soup2 = self.index_to_soup(nexturl['href']) + pagetext = soup2.find(attrs={'class':'entry clr'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'id':'paginacja'}): + r.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ ] remove_tags = [ - dict(name='div', attrs={'class':['chapters']}) ,dict(name='div', 
attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..b593d6b837 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff 
--git a/recipes/ubuntu_pomoc_org.recipe b/recipes/ubuntu_pomoc_org.recipe new file mode 100644 index 0000000000..1a78649dfc --- /dev/null +++ b/recipes/ubuntu_pomoc_org.recipe @@ -0,0 +1,22 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class UbuntuPomoc(BasicNewsRecipe): + title = u'Ubuntu-pomoc.org' + __author__ = 'fenuks' + description = u'Strona poświęcona systemowi Ubuntu Linux. Znajdziesz tutaj przydatne i sprawdzone poradniki oraz sposoby rozwiązywania wielu popularnych problemów. Ten blog rozwiąże każdy Twój problem - jeśli nie teraz, to wkrótce! :)' + category = 'Linux, Ubuntu, open source' + language = 'pl' + cover_url = 'http://www.ubuntu-pomoc.org/grafika/ubuntupomoc.png' + preprocess_regexps = [(re.compile(r'
.+', re.IGNORECASE|re.DOTALL), lambda m: '')] + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'post'})] + remove_tags_after = dict(attrs={'class':'underEntry'}) + remove_tags = [dict(attrs={'class':['underPostTitle', 'yarpp-related', 'underEntry', 'social', 'tags', 'commentlist', 'youtube_sc']}), dict(id=['wp_rp_first', 'commentReply'])] + feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'), + (u'Gry', u'http://feeds.feedburner.com/GryUbuntu-pomoc')] diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index 2adac1e113..90dde251ca 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): - EDITION = 0 - FIND_LAST_FULL_ISSUE = True - EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + EDITION = 0 + FIND_LAST_FULL_ISSUE = True + EXCLUDE_LOCKED = True + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + title = u'Wprost' + __author__ = 'matek09' + description = 'Weekly magazine' + encoding = 'ISO-8859-2' + no_stylesheets = True + language = 'pl' + remove_javascript = True + recursions = 0 + remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + ''' + keep_only_tags =[] + keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) + ''' - title = u'Wprost' - __author__ = 'matek09' - description = 'Weekly magazine' - encoding = 'ISO-8859-2' - no_stylesheets = True - language = 'pl' - remove_javascript = True - recursions = 0 - - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - - '''keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), + preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\