diff --git a/Changelog.yaml b/Changelog.yaml index ec01df0107..38d59e0770 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,117 @@ # new recipes: # - title: +- version: 0.9.20 + date: 2013-02-22 + + new features: + - title: "Book polishing: Add an option to smarten punctuation in the book when polishing" + + - title: "Book polishing: Add an option to delete all saved settings to the load saved settings button" + + - title: "Book polishing: Remember the last used settings" + + - title: "Book polishing: Add a checkbox to enable/disable the detailed polishing report" + + - title: "Add a separate tweak in Preferences-Tweaks for saving backups of files when polishing. That way you can have calibre save backups while converting EPUB->EPUB and not while polishing, if you so desire." + + - title: "Content server: Allow clicking on the book cover to download it. Useful on small screen devices where clicking the Get button may be difficult" + + - title: "Driver for Energy Systems C4 Touch." + tickets: [1127477] + + bug fixes: + - title: "E-book viewer: Fix a bug that could cause the back button in the viewer to skip a location" + + - title: "When tweaking/polishing an azw3 file that does not have an identified content ToC, do not auto-generate one." + tickets: [1130729] + + - title: "Book polishing: Use the actual cover image dimensions when creating the svg wrapper for the cover image." + tickets: [1127273] + + - title: "Book polishing: Do not error out on epub files containing an iTunesMetadata.plist file." + tickets: [1127308] + + - title: "Book polishing: Fix trying to polish more than 5 books at a time not working" + + - title: "Content server: Add workaround for bug in latest release of Google Chrome that causes it to not work with book lists containing some utf-8 characters" + tickets: [1130478] + + - title: "E-book viewer: When viewing EPUB files, do not parse html as xhtml even if it has svg tags embedded. 
This allows malformed XHTML files to still be viewed." + + - title: "Bulk metadata edit Search & replace: Update the sample values when changing the type of identifier to search on" + + - title: "Fix recipes with the / character in their names not useable from the command line" + tickets: [1127666] + + - title: "News download: Fix regression that broke downloading of images in gif format" + + - title: "EPUB/AZW3 Output: When splitting the output html on page breaks, handle page-break-after rules correctly, the pre split point html should contain the full element" + + - title: "Fix stdout/stderr redirection temp files not being deleted when restarting calibre from within calibre on windows" + + - title: "E-book viewer: When viewing epub files that have their cover marked as non-linear, show the cover at the start of the book instead of the end." + tickets: [1126030] + + - title: "EPUB Input: Fix handling of cover references with fragments in the urls" + + improved recipes: + - Fronda + - Various Polish news sources + + new recipes: + - title: Pravda + author: Darko Miletic + + - title: PNN + author: n.kucklaender + + - title: Various Polish news sources + author: fenuks + +- version: 0.9.19 + date: 2013-02-15 + + new features: + - title: "New tool: \"Polish books\" that allows you to perform various automated cleanup actions on EPUB and AZW3 files without doing a full conversion." + type: major + description: "Polishing books is all about putting the shine of perfection on your ebook files. You can use it to subset embedded fonts, update the metadata in the book files from the metadata in the calibre library, manipulate the book jacket, etc. More features will be added in the future. To use this tool, go to Preferences->Toolbar and add the Polish books tool to the main toolbar. Then simply select the books you want to be polished and click the Polish books button. 
Polishing, unlike conversion, does not change the internal structure/markup of your book, it performs only the minimal set of actions needed to achieve its goals. Note that polish books is a completely new codebase, so there may well be bugs, polishing a book backs up the original as ORIGINAL_EPUB or ORIGINAL_AZW3, unless you have turned off this feature in Preferences->Tweaks, in which case you should back up your files manually. You can also use this tool from the command line with ebook-polish.exe." + + - title: "Driver for the Trekstor Pyrus Mini." + tickets: [1124120] + + - title: "E-book viewer: Add an option to change the minimum font size." + tickets: [1122333] + + - title: "PDF Output: Add support for converting documents with math typesetting, as described here: http://manual.calibre-ebook.com/typesetting_math.html" + + - title: "Column coloring/icons: Add more conditions when using date based columns with reference to 'today'." + + bug fixes: + - title: "Transforming to titlecase - handle typographic hyphens in all caps phrases" + + - title: "Don't ignore file open events that occur before the GUI is initialized on OS X" + tickets: [1122713] + + - title: "News download: Handle feeds that have entries with empty ids" + + - title: "Fix a regression that broke using the template editor" + + - title: "Do not block startup while scanning the computer for available network interfaces. Speeds up startup time on some windows computers with lots of spurious network interfaces." + + improved recipes: + - New Yorker + - Kommersant + - Le Monde (Subscription version) + - NZ Herald + + new recipes: + - title: Navegalo + author: Douglas Delgado + + - title: El Guardian and More Intelligent Life + author: Darko Miletic + - version: 0.9.18 date: 2013-02-08 diff --git a/manual/develop.rst b/manual/develop.rst index 719c876b33..823a31b5c2 100644 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -39,27 +39,27 @@ All the |app| python code is in the ``calibre`` package. 
This package contains t * devices - All the device drivers. Just look through some of the built-in drivers to get an idea for how they work. - * For details, see: devices.interface which defines the interface supported by device drivers and devices.usbms which + * For details, see: devices.interface which defines the interface supported by device drivers and ``devices.usbms`` which defines a generic driver that connects to a USBMS device. All USBMS based drivers in |app| inherit from it. * ebooks - All the ebook conversion/metadata code. A good starting point is ``calibre.ebooks.conversion.cli`` which is the - module powering the :command:`ebook-convert` command. The conversion process is controlled via conversion.plumber. - The format independent code is all in ebooks.oeb and the format dependent code is in ebooks.format_name. + module powering the :command:`ebook-convert` command. The conversion process is controlled via ``conversion.plumber``. + The format independent code is all in ``ebooks.oeb`` and the format dependent code is in ``ebooks.format_name``. - * Metadata reading, writing, and downloading is all in ebooks.metadata + * Metadata reading, writing, and downloading is all in ``ebooks.metadata`` * Conversion happens in a pipeline, for the structure of the pipeline, see :ref:`conversion-introduction`. The pipeline consists of an input plugin, various transforms and an output plugin. The that code constructs - and drives the pipeline is in plumber.py. The pipeline works on a + and drives the pipeline is in :file:`plumber.py`. The pipeline works on a representation of an ebook that is like an unzipped epub, with manifest, spine, toc, guide, html content, etc. The - class that manages this representation is OEBBook in oeb/base.py. The + class that manages this representation is OEBBook in ``ebooks.oeb.base``. The various transformations that are applied to the book during - conversions live in `oeb/transforms/*.py`. 
And the input and output - plugins live in `conversion/plugins/*.py`. + conversions live in :file:`oeb/transforms/*.py`. And the input and output + plugins live in :file:`conversion/plugins/*.py`. - * library - The database back-end and the content server. See library.database2 for the interface to the |app| library. library.server is the |app| Content Server. - * gui2 - The Graphical User Interface. GUI initialization happens in gui2.main and gui2.ui. The ebook-viewer is in gui2.viewer. + * library - The database back-end and the content server. See ``library.database2`` for the interface to the |app| library. ``library.server`` is the |app| Content Server. + * gui2 - The Graphical User Interface. GUI initialization happens in ``gui2.main`` and ``gui2.ui``. The ebook-viewer is in ``gui2.viewer``. If you need help understanding the code, post in the `development forum `_ and you will most likely get help from one of |app|'s many developers. diff --git a/manual/faq.rst b/manual/faq.rst index b5f8f382b1..24774c8c7d 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -250,42 +250,71 @@ If you don't want to uninstall it altogether, there are a couple of tricks you c simplest is to simply re-name the executable file that launches the library program. More detail `in the forums `_. -How do I use |app| with my iPad/iPhone/iTouch? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use |app| with my iPad/iPhone/iPod touch? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Over the air ^^^^^^^^^^^^^^ -The easiest way to browse your |app| collection on your Apple device (iPad/iPhone/iPod) is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| +The easiest way to browse your |app| collection on your Apple device +(iPad/iPhone/iPod) is by using the |app| content server, which makes your +collection available over the net. 
First perform the following steps in |app| - * Set the Preferred Output Format in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to iPad (this will work for iPhone/iPods as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button. - * Turn on the Content Server in |app|'s preferences and leave |app| running. + * Set the Preferred Output Format in |app| to EPUB (The output format can be + set under :guilabel:`Preferences->Interface->Behavior`) + * Set the output profile to iPad (this will work for iPhone/iPods as well), + under :guilabel:`Preferences->Conversion->Common Options->Page Setup` + * Convert the books you want to read on your iDevice to EPUB format by + selecting them and clicking the Convert button. + * Turn on the Content Server by clicking the :guilabel:`Connect/Share` button + and leave |app| running. You can also tell |app| to automatically start the + content server via :guilabel:`Preferences->Sharing over the net`. -Now on your iPad/iPhone you have two choices, use either iBooks (version 1.2 and later) or Stanza (version 3.0 and later). Both are available free from the app store. +There are many apps for your iDevice that can connect to |app|. Here we +describe using two of them, iBooks and Stanza. Using Stanza *************** -Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPad/iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. 
In the Add Book Source screen enter whatever name you like and in the URL field, enter the following:: +You should be able to access your books on your iPhone by opening Stanza. Go to +"Get Books" and then click the "Shared" tab. Under Shared you will see an entry +"Books in calibre". If you don't, make sure your iPad/iPhone is connected using +the WiFi network in your house, not 3G. If the |app| catalog is still not +detected in Stanza, you can add it manually in Stanza. To do this, click the +"Shared" tab, then click the "Edit" button and then click "Add book source" to +add a new book source. In the Add Book Source screen enter whatever name you +like and in the URL field, enter the following:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. Now click "Save" +and you are done. -If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout. +If you get timeout errors while browsing the calibre catalog in Stanza, try +increasing the connection timeout value in the Stanza settings. Go to +Info->Settings and increase the value of Download Timeout. 
Using iBooks ************** -Start the Safari browser and type in the IP address and port of the computer running the calibre server, like this:: +Start the Safari browser and type in the IP address and port of the computer +running the calibre server, like this:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. -You will see a list of books in Safari, just click on the epub link for whichever book you want to read, Safari will then prompt you to open it with iBooks. +You will see a list of books in Safari, just click on the epub link for +whichever book you want to read, Safari will then prompt you to open it with +iBooks. 
With the USB cable + iTunes @@ -663,7 +692,7 @@ Post any output you see in a help message on the `Forum Komentarze", re.IGNORECASE), lambda m: ''), (re.compile(r''), lambda match: ''), (re.compile(r''), lambda match: '')] @@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe): extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): + '''def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') tag=soup.find(name='channel') @@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe): for feed in feeds: for article in feed.articles[:]: article.title=titles[feed.articles.index(article)] - return feeds + return feeds''' '''def get_cover_url(self): @@ -42,16 +42,25 @@ class Adventure_zone(BasicNewsRecipe): cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] return getattr(self, 'cover_url', self.cover_url)''' - + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + article.title = result.group(1) + else: + result = soup.body.find('strong') + if result: + article.title = result.string def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and 
r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer middle-border'}) diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe new file mode 100644 index 0000000000..01499f6369 --- /dev/null +++ b/recipes/badania_net.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class BadaniaNet(BasicNewsRecipe): + title = u'badania.net' + __author__ = 'fenuks' + description = u'chcesz wiedzieć więcej?' + category = 'science' + language = 'pl' + cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] + remove_tags_after = dict(attrs={'class':'omc-single-tags'}) + keep_only_tags = [dict(id='omc-full-article')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..a04f267ca3 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', attrs={'class':'qid 
click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', @@ -44,6 +44,8 @@ class Bash_org_pl(BasicNewsRecipe): }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] diff --git a/recipes/discover_magazine.recipe b/recipes/discover_magazine.recipe index 02cdb952b5..a7f080bb5f 100644 --- a/recipes/discover_magazine.recipe +++ b/recipes/discover_magazine.recipe @@ -33,6 +33,21 @@ class DiscoverMagazine(BasicNewsRecipe): remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})] + # Login stuff + needs_subscription = True + use_javascript_to_login = True + requires_version = (0, 9, 20) + + def javascript_login(self, br, username, password): + br.visit('http://discovermagazine.com', timeout=120) + f = br.select_form('div.login.section div.form') + f['username'] = username + f['password'] = password + br.submit('input[id="signInButton"]', timeout=120) + br.run_for_a_time(20) + # End login stuff + + def append_page(self, soup, appendtag, position): pager = soup.find('span',attrs={'class':'next'}) if pager: diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index 2b0933b58d..21d3b607d2 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False - remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})] + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git 
a/recipes/el_malpensante.recipe b/recipes/el_malpensante.recipe new file mode 100644 index 0000000000..7a014735b6 --- /dev/null +++ b/recipes/el_malpensante.recipe @@ -0,0 +1,27 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.elmalpensante.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'El Malpensante' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://elmalpensante.com/img/layout/logo.gif' + description = 'El Malpensante' + oldest_article = 30 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Artículos', u'http://www.elmalpensante.com/articulosRSS.php'), + (u'Malpensantías', u'http://www.elmalpensante.com/malpensantiasRSS.php'), + (u'Margaritas', u'http://www.elmalpensante.com/margaritasRSS.php'), +# This one is almost the same as articulos so we leave articles +# (u'Noticias', u'http://www.elmalpensante.com/noticiasRSS.php'), + ] diff --git a/recipes/elguardian.recipe b/recipes/elguardian.recipe new file mode 100644 index 0000000000..f5d035dd21 --- /dev/null +++ b/recipes/elguardian.recipe @@ -0,0 +1,93 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +elguardian.com.ar +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ElGuardian(BasicNewsRecipe): + title = 'El Guardian' + __author__ = 'Darko Miletic' + description = "Semanario con todas las tendencias de un pais" + publisher = 'Editorial Apache SA' + category = 'news,politics,Argentina' + oldest_article = 8 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'es_AR' + remove_empty_feeds = True + publication_type = 'magazine' + issn = '1666-7476' + masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png' + extra_css = """ + 
body{font-family: Arial,sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'series' : title + , 'isbn' : issn + } + + keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})] + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [ + (u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' ) + ,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' ) + ,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' ) + ,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' ) + ,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' ) + ,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' ) + ,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' ) + ,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' ) + ,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' ) + ,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' ) + ,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml') + ,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' ) + ,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' ) + ] + + def get_cover_url(self): + soup = self.index_to_soup('http://elguardian.com.ar/') + udata = soup.find('div', attrs={'class':'datosNumero'}) + if udata: + sdata = udata.find('div') + if sdata: + stra = re.findall(r'\d+', self.tag_to_string(sdata)) + self.conversion_options.update({'series_index':int(stra[1])}) + unumero = soup.find('div', attrs={'class':'ultimoNumero'}) + if unumero: + img = unumero.find('img', src=True) + if img: + return img['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + 
item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/eso_pl.recipe b/recipes/eso_pl.recipe new file mode 100644 index 0000000000..5ebb420396 --- /dev/null +++ b/recipes/eso_pl.recipe @@ -0,0 +1,23 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ESO(BasicNewsRecipe): + title = u'ESO PL' + __author__ = 'fenuks' + description = u'ESO, Europejskie Obserwatorium Południowe, buduje i obsługuje najbardziej zaawansowane naziemne teleskopy astronomiczne na świecie' + category = 'astronomy' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1922519424/eso-twitter-logo.png' + keep_only_tags = [dict(attrs={'class':'subcl'})] + remove_tags = [dict(id='lang_row'), dict(attrs={'class':['pr_typeid', 'pr_news_feature_link', 'outreach_usage', 'hidden']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.eso.org/public/poland/news/feed/'), (u'Og\u0142oszenia', u'http://www.eso.org/public/poland/announcements/feed/'), (u'Zdj\u0119cie tygodnia', u'http://www.eso.org/public/poland/images/potw/feed/')] + + def preprocess_html(self, soup): + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.eso.org' + a['href'] + return soup diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index 6755770329..d0177b998e 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -23,7 +23,6 @@ class Fronda(BasicNewsRecipe): extra_css = ''' h1 {font-size:150%} .body {text-align:left;} - div.headline {font-weight:bold} ''' earliest_date = date.today() - timedelta(days=oldest_article) @@ -72,7 +71,7 @@ class Fronda(BasicNewsRecipe): feeds.append((genName, 
articles[genName])) return feeds - keep_only_tags = [ + keep_only_tags = [ dict(name='div', attrs={'class':'yui-g'}) ] @@ -84,5 +83,7 @@ class Fronda(BasicNewsRecipe): dict(name='ul', attrs={'class':'comment-list'}), dict(name='ul', attrs={'class':'category'}), dict(name='p', attrs={'id':'comments-disclaimer'}), + dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}), + dict(name='div', attrs={'style':'text-align: left; margin-top: 15px;'}), dict(name='div', attrs={'id':'comment-form'}) ] diff --git a/recipes/hnonline.recipe b/recipes/hnonline.recipe new file mode 100644 index 0000000000..d9faafd0f1 --- /dev/null +++ b/recipes/hnonline.recipe @@ -0,0 +1,68 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class HNonlineRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'lacike' + language = 'sk' + version = 1 + + title = u'HNonline' + publisher = u'HNonline' + category = u'News, Newspaper' + description = u'News from Slovakia' + cover_url = u'http://hnonline.sk/img/sk/_relaunch/logo2.png' + + oldest_article = 1 + max_articles_per_feed = 100 + use_embedded_content = False + remove_empty_feeds = True + + no_stylesheets = True + remove_javascript = True + + # Feeds from: http://rss.hnonline.sk, for listing see http://rss.hnonline.sk/prehlad + feeds = [] + feeds.append((u'HNonline|Ekonomika a firmy', u'http://rss.hnonline.sk/?p=kC1000')) + feeds.append((u'HNonline|Slovensko', u'http://rss.hnonline.sk/?p=kC2000')) + feeds.append((u'HNonline|Svet', u'http://rss.hnonline.sk/?p=kC3000')) + feeds.append((u'HNonline|\u0160port', u'http://rss.hnonline.sk/?p=kC4000')) + feeds.append((u'HNonline|Online rozhovor', u'http://rss.hnonline.sk/?p=kCR000')) + + feeds.append((u'FinWeb|Spr\u00E1vy zo sveta financi\u00ED', u'http://rss.finweb.hnonline.sk/spravodajstvo')) + feeds.append((u'FinWeb|Koment\u00E1re a anal\u00FDzy', u'http://rss.finweb.hnonline.sk/?p=kPC200')) + feeds.append((u'FinWeb|Invest\u00EDcie', 
u'http://rss.finweb.hnonline.sk/?p=kPC300')) + feeds.append((u'FinWeb|Svet akci\u00ED', u'http://rss.finweb.hnonline.sk/?p=kPC400')) + feeds.append((u'FinWeb|Rozhovory', u'http://rss.finweb.hnonline.sk/?p=kPC500')) + feeds.append((u'FinWeb|T\u00E9ma t\u00FD\u017Ed\u0148a', u'http://rss.finweb.hnonline.sk/?p=kPC600')) + feeds.append((u'FinWeb|Rebr\u00ED\u010Dky', u'http://rss.finweb.hnonline.sk/?p=kPC700')) + + feeds.append((u'HNstyle|Kult\u00FAra', u'http://style.hnonline.sk/?p=kTC100')) + feeds.append((u'HNstyle|Auto-moto', u'http://style.hnonline.sk/?p=kTC200')) + feeds.append((u'HNstyle|Digit\u00E1l', u'http://style.hnonline.sk/?p=kTC300')) + feeds.append((u'HNstyle|Veda', u'http://style.hnonline.sk/?p=kTCV00')) + feeds.append((u'HNstyle|Dizajn', u'http://style.hnonline.sk/?p=kTC400')) + feeds.append((u'HNstyle|Cestovanie', u'http://style.hnonline.sk/?p=kTCc00')) + feeds.append((u'HNstyle|V\u00EDkend', u'http://style.hnonline.sk/?p=kTC800')) + feeds.append((u'HNstyle|Gastro', u'http://style.hnonline.sk/?p=kTC600')) + feeds.append((u'HNstyle|M\u00F3da', u'http://style.hnonline.sk/?p=kTC700')) + feeds.append((u'HNstyle|Modern\u00E1 \u017Eena', u'http://style.hnonline.sk/?p=kTCA00')) + feeds.append((u'HNstyle|Pre\u010Do nie?!', u'http://style.hnonline.sk/?p=k7C000')) + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'h1', attrs = {'class': 'detail-titulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-podtitulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-perex'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-text'})) + + remove_tags = [] + #remove_tags.append(dict(name = 'div', attrs = {'id': re.compile('smeplayer.*')})) + + remove_tags_after = [] + #remove_tags_after = [dict(name = 'p', attrs = {'class': 'autor_line'})] + + extra_css = ''' + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + @font-face {font-family: 
"sans1";src:url(res:///opt/sony/ebook/FONT/LiberationSans.ttf)} + body {font-family: sans1, serif1;} + ''' \ No newline at end of file diff --git a/recipes/icons/badania_net.png b/recipes/icons/badania_net.png new file mode 100644 index 0000000000..de915de8d1 Binary files /dev/null and b/recipes/icons/badania_net.png differ diff --git a/recipes/icons/elguardian.png b/recipes/icons/elguardian.png new file mode 100644 index 0000000000..a54b067ee4 Binary files /dev/null and b/recipes/icons/elguardian.png differ diff --git a/recipes/icons/eso_pl.png b/recipes/icons/eso_pl.png new file mode 100644 index 0000000000..4f3319fece Binary files /dev/null and b/recipes/icons/eso_pl.png differ diff --git a/recipes/icons/hnonline.png b/recipes/icons/hnonline.png new file mode 100644 index 0000000000..1e073839ad Binary files /dev/null and b/recipes/icons/hnonline.png differ diff --git a/recipes/icons/kurier_galicyjski.png b/recipes/icons/kurier_galicyjski.png new file mode 100644 index 0000000000..4d66a15122 Binary files /dev/null and b/recipes/icons/kurier_galicyjski.png differ diff --git a/recipes/icons/more_intelligent_life.png b/recipes/icons/more_intelligent_life.png new file mode 100644 index 0000000000..4fcf66e9a1 Binary files /dev/null and b/recipes/icons/more_intelligent_life.png differ diff --git a/recipes/icons/nauka_w_polsce.png b/recipes/icons/nauka_w_polsce.png new file mode 100644 index 0000000000..0d872ce682 Binary files /dev/null and b/recipes/icons/nauka_w_polsce.png differ diff --git a/recipes/icons/osworld_pl.png b/recipes/icons/osworld_pl.png new file mode 100644 index 0000000000..97a7d0dd55 Binary files /dev/null and b/recipes/icons/osworld_pl.png differ diff --git a/recipes/icons/pravda_rs.png b/recipes/icons/pravda_rs.png new file mode 100644 index 0000000000..8c4533a79d Binary files /dev/null and b/recipes/icons/pravda_rs.png differ diff --git a/recipes/icons/ubuntu_pomoc_org.png b/recipes/icons/ubuntu_pomoc_org.png new file mode 100644 index 
0000000000..a143846630 Binary files /dev/null and b/recipes/icons/ubuntu_pomoc_org.png differ diff --git a/recipes/icons/wprost_rss.png b/recipes/icons/wprost_rss.png new file mode 100644 index 0000000000..5ce1b5563d Binary files /dev/null and b/recipes/icons/wprost_rss.png differ diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'

Zobacz:.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

')) + img.insert(len(img.contents), bs('

')) + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://kuriergalicyjski.com' + a['href'] + return soup diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 56156166dc..dc9fa9d36f 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,166 +1,94 @@ -#!/usr/bin/env python - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +__author__ = 'Sylvain Durand ' __license__ = 'GPL v3' -__copyright__ = '2012, 2013, Rémi Vanicat ' -''' -Lemonde.fr: Version abonnée -''' +import time -import os, zipfile, re, time -from urllib2 import HTTPError -from calibre.constants import preferred_encoding - +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile +from urllib2 import HTTPError -class LeMondeAbonne(BasicNewsRecipe): +class LeMonde(BasicNewsRecipe): - title = u'Le Monde: Édition abonnés' - __author__ = u'Rémi Vanicat' - description = u'Actualités' - category = u'Actualités, France, Monde' - publisher = 'Le Monde' - language = 'fr' - needs_subscription = True - no_stylesheets = True - smarten_punctuation = True - remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height'] - extra_css = ''' li{margin:6pt 0} - ul{margin:0} + title = u'Le Monde: Édition abonnés' + __author__ = 'Sylvain Durand' + description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.' 
+ language = 'fr' + encoding = 'utf8' - div.photo img{max-width:100%; border:0px transparent solid;} - div.photo{font-family:inherit; color:#333; text-align:center;} - div.photo p{text-align:justify;font-size:.9em; line-height:.9em;} + needs_subscription = True - @page{margin:10pt} - .ar-txt {color:#000; text-align:justify;} - h1{text-align:left; font-size:1.25em;} + date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' + login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' + masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png' + couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' - .auteur{text-align:right; font-weight:bold} - .feed{text-align:right; font-weight:bold} - .po-ti2{font-weight:bold} - .fen-tt{font-weight:bold;font-size:1.1em} - ''' + extra_css = ''' + img{max-width:100%} + h1{font-size:1.2em !important; line-height:1.2em !important; } + h2{font-size:1em !important; line-height:1em !important; } + h3{font-size:1em !important; text-transform:uppercase !important; color:#666;} + #photo{text-align:center !important; margin:10px 0 -8px;} + #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} ''' - zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' - coverurl_format = '/img/%y%m%d01.jpg' - path_format = "%y%m%d" - login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])] - keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] - 
- - remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })] - - article_id_pattern = re.compile("[0-9]+\\.html") - article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' + def __init__(self, options, log, progress_reporter): + BasicNewsRecipe.__init__(self, options, log, progress_reporter) + br = BasicNewsRecipe.get_browser(self) + second = time.time() + 24*60*60 + for i in range(7): + self.date = time.gmtime(second) + try: + br.open(time.strftime(self.date_url,self.date)) + break + except HTTPError: + second -= 24*60*60 + self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ') def get_browser(self): br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open(self.login_url) - br.select_form(nr=0) - br['login'] = self.username - br['password'] = self.password - br.submit() + br.open(self.login_url) + br.select_form(nr=0) + br['login'] = self.username + br['password'] = self.password + br.submit() return br - decalage = 24 * 60 * 60 # today Monde has tomorow date - def get_cover_url(self): - url = time.strftime(self.coverurl_format, self.ltime) - return self.articles_path + url + url = time.strftime(self.couverture_url,self.date) + return url def parse_index(self): - browser = self.get_browser() - - second = time.time() - second += self.decalage - - for i in range(7): - self.ltime = time.gmtime(second) - self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding) - url = time.strftime(self.zipurl_format,self.ltime) - try: - response = browser.open(url) - continue - except HTTPError: - second -= 24*60*60 - - tmp = PersistentTemporaryFile(suffix='.zip') - self.report_progress(0.1,_('downloading zip file')) - tmp.write(response.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0.1,_('extracting zip file')) - - zfile.extractall(self.output_dir) - 
zfile.close() - - path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data") - - self.articles_path = path - - files = os.listdir(path) - - nb_index_files = len([ name for name in files if re.match("frame_gauche_[0-9]+.html", name) ]) - - flux = [] - - article_url = time.strftime(self.article_url_format, self.ltime) - - for i in range(nb_index_files): - filename = os.path.join(path, "selection_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES) - title=soup.find('span').contents[0] - if title=="Une": - title="À la une" - if title=="Evenement": - title="L'événement" - if title=="Planete": - title="Planète" - if title=="Economie - Entreprises": - title="Économie" - if title=="L'Oeil du Monde": - title="L'œil du Monde" - if title=="Enquete": - title="Enquête" - if title=="Editorial - Analyses": - title="Analyses" - if title=="Le Monde Economie": - title="Économie" - if title=="Le Monde Culture et idées": - title="Idées" - if title=="Le Monde Géo et politique": - title="Géopolitique" - tmp.close() - - filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup = BeautifulSoup(tmp) + url = time.strftime(self.journal_url,self.date) + soup = self.index_to_soup(url).sommaire + sections = [] + for sec in soup.findAll("section"): articles = [] - for link in soup.findAll("a"): - article_file = link['href'] - article_id=self.article_id_pattern.search(article_file).group() - article = { - 'title': link.contents[0], - 'url': article_url + article_id, - 'description': '', - 'content': '' - } - articles.append(article) - tmp.close() + if sec['cahier'] != "Le Monde": + for col in sec.findAll("fnts"): + col.extract() + if sec['cahier']=="Le Monde Magazine": + continue + for art in sec.findAll("art"): + if art.txt.string and art.ttr.string: + if art.find(['url']): + art.insert(6,'

') + if art.find(['lgd']) and art.find(['lgd']).string: + art.insert(7,'
'+art.find(['lgd']).string+'
') + article = ""+unicode(art)+"" + article = article.replace('','').replace(' oC ','°C ') + article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>') + f = PersistentTemporaryFile() + f.write(article) + articles.append({'title':art.ttr.string,'url':"file:///"+f.name}) + sections.append((sec['nom'], articles)) + return sections - flux.append((title, articles)) + def preprocess_html(self, soup): + for lgd in soup.findAll(id="lgd"): + lgd.contents[-1].extract() + return soup - return flux - - - -# Local Variables: -# mode: python -# End: diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index 741397d08a..1eaa08d23a 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -1,5 +1,5 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): title = u'Młody technik' @@ -9,7 +9,19 @@ class Mlody_technik(BasicNewsRecipe): language = 'pl' cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + preprocess_regexps = [(re.compile(r"

Podobne

", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] diff --git a/recipes/more_intelligent_life.recipe b/recipes/more_intelligent_life.recipe new file mode 100644 index 0000000000..e90f883080 --- /dev/null +++ b/recipes/more_intelligent_life.recipe @@ -0,0 +1,67 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +moreintelligentlife.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MoreIntelligentLife(BasicNewsRecipe): + title = 'More Intelligent Life' + __author__ = 'Darko Miletic' + description = "More Intelligent Life (moreintelligentlife.com) is the online version of Intelligent Life, a lifestyle and culture magazine from The Economist. The website offers not only content from the print edition, trickled out over the course of its shelf-life, but also the Editors' Blog, which carries daily posts from the editorial team-quickfire observations and opinions that allow readers to eavesdrop on the conversation in the office." 
+ publisher = 'The Economist Newspaper ltd' + category = 'arts,lifestyle,intelligent life,the economist,ideas,style,culture' + oldest_article = 60 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'website' + extra_css = """ + body{font-family: Arial,"Helvetica neue","Bitstream Vera Sans",sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [dict(attrs={'class':'node'})] + remove_tags_after = dict(attrs={'class':'tags'}) + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [(u'Articles', u'http://feeds.feedburner.com/MoreintelligentlifeTotal')] + + def get_cover_url(self): + soup = self.index_to_soup('http://moreintelligentlife.com/') + for image in soup.findAll('img', src=True): + if image['src'].startswith('http://moreintelligentlife.com/files/covers/current_issue_'): + return image['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe new file mode 100644 index 0000000000..c524c18b26 --- /dev/null +++ b/recipes/nauka_w_polsce.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class NaukawPolsce(BasicNewsRecipe): + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = u'Serwis Nauka w 
Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' + category = 'science' + language = 'pl' + cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + index = 'http://www.naukawpolsce.pl' + keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] + remove_tags = [dict(name='div', attrs={'class':'tagi'})] + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + for i in soup.findAll(name='div', attrs={'class':'aktualnosci-margines lista-depesz information-content'}): + title = i.h1.a.string + url = self.index + i.h1.a['href'] + date = '' #i.span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Historia i kultura", self.find_articles('http://www.naukawpolsce.pl/historia-i-kultura/'))) + feeds.append((u"Kosmos", self.find_articles('http://www.naukawpolsce.pl/kosmos/'))) + feeds.append((u"Przyroda", self.find_articles('http://www.naukawpolsce.pl/przyroda/'))) + feeds.append((u"Społeczeństwo", self.find_articles('http://www.naukawpolsce.pl/spoleczenstwo/'))) + feeds.append((u"Technologie", self.find_articles('http://www.naukawpolsce.pl/technologie/'))) + feeds.append((u"Uczelnie", self.find_articles('http://www.naukawpolsce.pl/uczelnie/'))) + feeds.append((u"Nauki medyczne", self.find_articles('http://www.naukawpolsce.pl/zdrowie/'))) + + return feeds + + def preprocess_html(self, soup): + for p in soup.findAll(name='p', text=re.compile(' ')): + p.extract() + return soup diff --git a/recipes/navegalo.recipe 
b/recipes/navegalo.recipe new file mode 100644 index 0000000000..89f6cde45d --- /dev/null +++ b/recipes/navegalo.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1360354988(BasicNewsRecipe): + title = u'Navegalo.com' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + +from calibre.web.feeds.news import BasicNewsRecipe + +class navegalonews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'Navegalo.com' + publisher = 'Navegalo.com' + description = 'Noticias actualizadas por Navegalo.com. Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre' + category = 'Spanish, Entertainment' + masthead_url = 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQZhML5lwsdss6FFF7CFR0Sf-Ln052Zmhs1TlIOcAL8JWN8a-dPlA' + + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Nacionales', u'http://www.navegalo.com/es/noticias/noticias/noticias-nacionales?format=feed&type=rss'), (u'Internacionales', u'http://direct.navegalo.com/es/noticias/noticias/noticias-internacionales?format=feed&type=rss'), (u'Deportes', u'http://direct.navegalo.com/es/noticias/noticias/deportes-nacionales?format=feed&type=rss'), (u'Solo futbol', u'http://www.navegalo.com/es/noticias/noticias/solo-futbol?format=feed&type=rss'), (u'Entretenimiento', u'http://www.navegalo.com/es/noticias/noticias/entretenimiento?format=feed&type=rss'), (u'Solo para ellas', u'http://www.navegalo.com/es/noticias/noticias/solo-para-ellas?format=feed&type=rss'), (u'Infiltrados', u'http://direct.navegalo.com/es/noticias/noticias/infiltrados?format=feed&type=rss'), (u'Mano a mano', u'http://direct.navegalo.com/es/noticias/noticias/mano-a-mano?format=feed&type=rss')] + + + + + extra_css = ''' + 
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' + diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index c5f1b0aff2..2730b45d6d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = '2008-2013, Darko Miletic ' ''' newyorker.com ''' @@ -44,20 +44,18 @@ class NewYorker(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [ - dict(name='div', attrs={'class':'headers'}) - ,dict(name='div', attrs={'id':['articleheads','items-container','articleRail','articletext','photocredits']}) - ] + keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})] remove_tags = [ dict(name=['meta','iframe','base','link','embed','object']) - ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] }) + ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] }) ,dict(attrs={'id':['show-header','show-footer'] }) ] + remove_tags_after = dict(attrs={'class':'entry-content'}) remove_attributes = ['lang'] feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')] def print_version(self, url): - return url + '?printable=true' + return url + '?printable=true¤tPage=all' def image_url_processor(self, baseurl, url): return url.strip() diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe new file mode 100644 index 0000000000..7784a271e0 --- /dev/null +++ b/recipes/osworld_pl.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OSWorld(BasicNewsRecipe): + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. 
Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' + category = 'OS, IT, open source, Linux' + language = 'pl' + cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id=['dzial', 'posts'])] + remove_tags = [dict(attrs={'class':'post-comments'})] + remove_tags_after = dict(attrs={'class':'entry clr'}) + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + + def append_page(self, soup, appendtag): + tag = appendtag.find(attrs={'id':'paginacja'}) + if tag: + for nexturl in tag.findAll('a'): + soup2 = self.index_to_soup(nexturl['href']) + pagetext = soup2.find(attrs={'class':'entry clr'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'id':'paginacja'}): + r.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', 
attrs={'class':['chapters']}) - ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body 
contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/pnn.recipe b/recipes/pnn.recipe new file mode 100644 index 0000000000..cb36afe88b --- /dev/null +++ b/recipes/pnn.recipe @@ -0,0 +1,55 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the PNN to an ebook.''' + +class SportsIllustratedRecipe(BasicNewsRecipe) : + __author__ = 'n.kucklaender' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + language = 'de' + description = 'PNN RSS' + version = 1 + title = u'PNN' + timefmt = ' [%d.%m.%Y]' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_empty_feeds = True + remove_tags = [dict(attrs={'class':['um-weather um-header-weather','um-has-sub um-mainnav','um-box','ts-products','um-meta-nav','um-box um-last','um-footer','um-footer-links','share hidden','um-buttons']}),dict(id=['dinsContainer'])] + # remove_tags_before = [dict(name='div', attrs={'class':'um-first'})] + # remove_tags_after = [dict(name='div', attrs={'class':'um-metabar'})] + + feeds = [(u'Titelseite', u'http://www.pnn.de/rss.xml'), + (u'Dritte Seite', u'http://www.pnn.de/dritte-seite/rss.xml'), + (u'Politik', u'http://www.pnn.de/politik/rss.xml'), + (u'Meinung', u'http://www.pnn.de/meinung/rss.xml'), + (u'Potsdam', u'http://www.pnn.de/potsdam/rss.xml'), + (u'Havel-Spree', 
u'http://www.pnn.de/havel-spree/rss.xml'), + (u'Potsdam-Mittelmark', u'http://www.pnn.de/pm/rss.xml'), + (u'Berlin-Brandenburg', u'http://www.pnn.de/brandenburg-berlin/rss.xml'), + (u'Wirtschaft', u'http://www.pnn.de/wirtschaft/rss.xml'), + (u'Sport', u'http://www.pnn.de/sport/rss.xml'), + (u'Regionalsport', u'http://www.pnn.de/regionalsport/rss.xml'), + (u'Kultur', u'http://www.pnn.de/kultur/rss.xml'), + (u'Potsdam-Kultur', u'http://www.pnn.de/potsdam-kultur/rss.xml'), + (u'Wissen', u'http://www.pnn.de/wissen/rss.xml'), + (u'Medien', u'http://www.pnn.de/medien/rss.xml'), + (u'Weltspiegel', u'http://www.pnn.de/weltspiegel/rss.xml'), + (u'Wissenschaft', u'http://www.pnn.de/campus/rss.xml'), + (u'Mobil', u'http://www.pnn.de/mobil/rss.xml'), + (u'Reise', u'http://www.pnn.de/reise/rss.xml'), + (u'Ratgeber', u'http://www.pnn.de/ratgeber/rss.xml'), + (u'Fragen des Tages', u'http://www.pnn.de/fragen-des-tages/rss.xml'), + # (u'Potsdam bin ich', u'http://www.pnn.de/potsdam-bin-ich/rss.xml'), + (u'Leserbriefe', u'http://www.pnn.de/leserbriefe/rss.xml')] + + def get_masthead_url(self): + return 'http://www.pnn.de/app/base/img/pnn_logo.png' + + def print_version(self, url): + return url.replace('.html', ',view,printVersion.html') + diff --git a/recipes/pravda_rs.recipe b/recipes/pravda_rs.recipe new file mode 100644 index 0000000000..742527ac2b --- /dev/null +++ b/recipes/pravda_rs.recipe @@ -0,0 +1,85 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' + +''' +www.pravda.rs +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Pravda_rs(BasicNewsRecipe): + title = 'Dnevne novine Pravda' + __author__ = 'Darko Miletic' + description = '24 sata portal vesti iz Srbije' + publisher = 'Dnevne novine Pravda' + category = 'news, politics, entertainment, Serbia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + 
language = 'sr' + publication_type = 'newspaper' + remove_empty_feeds = True + PREFIX = 'http://www.pravda.rs' + FEEDPR = PREFIX + '/category/' + LANGLAT = '?lng=lat' + FEEDSU = '/feed/' + LANGLAT + INDEX = PREFIX + LANGLAT + masthead_url = 'http://www.pravda.rs/wp-content/uploads/2012/09/logoof.png' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + body{font-family: Georgia,"Times New Roman",Times,serif1,serif;} + img{display: block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + keep_only_tags = [dict(name='div', attrs={'class':'post'})] + remove_tags = [dict(name='h3')] + remove_tags_after = dict(name='h3') + + feeds = [ + (u'Politika' , FEEDPR + 'politika/' + FEEDSU), + (u'Tema Dana', FEEDPR + 'tema-dana/' + FEEDSU), + (u'Hronika' , FEEDPR + 'hronika/' + FEEDSU), + (u'Društvo' , FEEDPR + 'drustvo/' + FEEDSU), + (u'Ekonomija', FEEDPR + 'ekonomija/' + FEEDSU), + (u'Srbija' , FEEDPR + 'srbija/' + FEEDSU), + (u'Beograd' , FEEDPR + 'beograd/' + FEEDSU), + (u'Kultura' , FEEDPR + 'kultura/' + FEEDSU), + (u'Zabava' , FEEDPR + 'zabava/' + FEEDSU), + (u'Sport' , FEEDPR + 'sport/' + FEEDSU), + (u'Svet' , FEEDPR + 'svet/' + FEEDSU), + (u'Porodica' , FEEDPR + 'porodica/' + FEEDSU), + (u'Vremeplov', FEEDPR + 'vremeplov/' + FEEDSU), + (u'IT' , FEEDPR + 'it/' + FEEDSU), + (u'Republika Srpska', FEEDPR + 'republika-srpska/' + FEEDSU), + (u'Crna Gora', FEEDPR + 'crna-gora/' + FEEDSU), + (u'EX YU' , FEEDPR + 'eks-ju/' + FEEDSU), + (u'Dijaspora', FEEDPR + 'dijaspora/' + FEEDSU), + (u'Kolumna' , FEEDPR + 'kolumna/' + FEEDSU), + (u'Afere' , FEEDPR + 'afere/' + FEEDSU), + (u'Feljton' , FEEDPR + 'feljton/' + FEEDSU), + (u'Intervju' , FEEDPR + 'intervju/' + FEEDSU), + (u'Reportaža', FEEDPR + 'reportaza/' + FEEDSU), + (u'Zanimljivosti', FEEDPR + 'zanimljivosti/' + 
FEEDSU), + (u'Sa trga' , FEEDPR + 'sa-trga/' + FEEDSU) + ] + + def print_version(self, url): + return url + self.LANGLAT + + def preprocess_raw_html(self, raw, url): + return 'title'+raw[raw.find(''):] + \ No newline at end of file diff --git a/recipes/revista_cromos.recipe b/recipes/revista_cromos.recipe new file mode 100644 index 0000000000..29515971dd --- /dev/null +++ b/recipes/revista_cromos.recipe @@ -0,0 +1,33 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.cromos.com.co/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'Revista Cromos' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.cromos.com.co/sites/cromos.com.co/themes/cromos_theme/images/logo_morado.gif' + description = 'Revista Cromos' + oldest_article = 7 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Cromos', u'http://www.cromos.com.co/rss.xml'), + (u'Moda', u'http://www.cromos.com.co/moda/feed'), + (u'Estilo de Vida', u'http://www.cromos.com.co/estilo-de-vida/feed'), + (u'Cuidado Personal', u'http://www.cromos.com.co/estilo-de-vida/cuidado-personal/feed'), + (u'Salud y Alimentación', u'http://www.cromos.com.co/estilo-de-vida/salud-y-alimentacion/feed'), + (u'Personajes', u'http://www.cromos.com.co/personajes/feed'), + (u'Actualidad', u'http://www.cromos.com.co/personajes/actualidad/feed'), + (u'Espectáculo', u'http://www.cromos.com.co/personajes/espectaculo/feed'), + (u'Reportajes', u'http://www.cromos.com.co/reportajes/feed'), + (u'Eventos', u'http://www.cromos.com.co/eventos/feed'), + (u'Modelos', u'http://www.cromos.com.co/modelos/feed'), + ] diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..b593d6b837 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -5,11 +5,14 @@ class 
SpidersWeb(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/ubuntu_pomoc_org.recipe b/recipes/ubuntu_pomoc_org.recipe new file mode 100644 index 0000000000..1a78649dfc --- /dev/null +++ b/recipes/ubuntu_pomoc_org.recipe @@ -0,0 +1,22 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class UbuntuPomoc(BasicNewsRecipe): + title = u'Ubuntu-pomoc.org' + __author__ = 'fenuks' + description = u'Strona poświęcona systemowi Ubuntu Linux. Znajdziesz tutaj przydatne i sprawdzone poradniki oraz sposoby rozwiązywania wielu popularnych problemów. Ten blog rozwiąże każdy Twój problem - jeśli nie teraz, to wkrótce! :)' + category = 'Linux, Ubuntu, open source' + language = 'pl' + cover_url = 'http://www.ubuntu-pomoc.org/grafika/ubuntupomoc.png' + preprocess_regexps = [(re.compile(r'
.+', re.IGNORECASE|re.DOTALL), lambda m: '')] + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'post'})] + remove_tags_after = dict(attrs={'class':'underEntry'}) + remove_tags = [dict(attrs={'class':['underPostTitle', 'yarpp-related', 'underEntry', 'social', 'tags', 'commentlist', 'youtube_sc']}), dict(id=['wp_rp_first', 'commentReply'])] + feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'), + (u'Gry', u'http://feeds.feedburner.com/GryUbuntu-pomoc')] diff --git a/recipes/unperiodico.recipe b/recipes/unperiodico.recipe new file mode 100644 index 0000000000..d4edb4e5dc --- /dev/null +++ b/recipes/unperiodico.recipe @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# https://github.com/iemejia/calibrecolombia + +''' +http://www.unperiodico.unal.edu.co/ +''' + +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class UNPeriodico(BasicNewsRecipe): + title = u'UN Periodico' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.unperiodico.unal.edu.co/fileadmin/templates/periodico/img/logoperiodico.png' + description = 'UN Periodico' + oldest_article = 30 + max_articles_per_feed = 100 + publication_type = 'newspaper' + feeds = [ + (u'UNPeriodico', u'http://www.unperiodico.unal.edu.co/rss/type/rss2/') + ] diff --git a/recipes/volksrant.recipe b/recipes/volksrant.recipe index e5499fed73..fa143c97ad 100644 --- a/recipes/volksrant.recipe +++ b/recipes/volksrant.recipe @@ -41,17 +41,9 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe): ####################################################################################################### temp_files = [] articles_are_obfuscated = True - use_javascript_to_login = True - - def javascript_login(self, br, username, password): - 'Volksrant wants the user to explicitly allow 
cookies' - if not br.visit('http://www.volkskrant.nl'): - raise Exception('Failed to connect to volksrant website') - br.click('#pop_cookie_text a[onclick]', wait_for_load=True, timeout=120) def get_obfuscated_article(self, url): br = self.browser.clone_browser() - print 'THE CURRENT URL IS: ', url br.open(url) year = date.today().year diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe index 2adac1e113..90dde251ca 100644 --- a/recipes/wprost.recipe +++ b/recipes/wprost.recipe @@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): - EDITION = 0 - FIND_LAST_FULL_ISSUE = True - EXCLUDE_LOCKED = True - ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + EDITION = 0 + FIND_LAST_FULL_ISSUE = True + EXCLUDE_LOCKED = True + ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png' + title = u'Wprost' + __author__ = 'matek09' + description = 'Weekly magazine' + encoding = 'ISO-8859-2' + no_stylesheets = True + language = 'pl' + remove_javascript = True + recursions = 0 + remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) + ''' + keep_only_tags =[] + keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'})) + ''' - title = u'Wprost' - __author__ = 'matek09' - description = 'Weekly magazine' - encoding = 'ISO-8859-2' - no_stylesheets = True - language = 'pl' - remove_javascript = True - recursions = 0 - - remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) - - '''keep_only_tags =[] - keep_only_tags.append(dict(name = 'table', attrs 
= {'id' : 'title-table'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' - - preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), + preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\