diff --git a/Changelog.yaml b/Changelog.yaml index 0bbc1f1e07..38d59e0770 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,212 @@ # new recipes: # - title: +- version: 0.9.20 + date: 2013-02-22 + + new features: + - title: "Book polishing: Add an option to smarten punctuation in the book when polishing" + + - title: "Book polishing: Add an option to delete all saved settings to the load saved settings button" + + - title: "Book polishing: Remember the last used settings" + + - title: "Book polishing: Add a checkbox to enable/disable the detailed polishing report" + + - title: "Add a separate tweak in Preferences-Tweaks for saving backups of files when polishing. That way you can have calibre save backups while converting EPUB->EPUB and not while polishing, if you so desire." + + - title: "Content server: Allow clicking on the book cover to download it. Useful on small screen devices where clicking the Get button may be difficult" + + - title: "Driver for Energy Systems C4 Touch." + tickets: [1127477] + + bug fixes: + - title: "E-book viewer: Fix a bug that could cause the back button in the viewer to skip a location" + + - title: "When tweaking/polishing an azw3 file that does not have an identified content ToC, do not auto-generate one." + tickets: [1130729] + + - title: "Book polishing: Use the actual cover image dimensions when creating the svg wrapper for the cover image." + tickets: [1127273] + + - title: "Book polishing: Do not error out on epub files containing an iTunesMetadata.plist file." + tickets: [1127308] + + - title: "Book polishing: Fix trying to polish more than 5 books at a time not working" + + - title: "Content server: Add workaround for bug in latest release of Google Chrome that causes it to not work with book lists containing some utf-8 characters" + tickets: [1130478] + + - title: "E-book viewer: When viewing EPUB files, do not parse html as xhtml even if it has svg tags embedded. 
This allows malformed XHTML files to still be viewed." + + - title: "Bulk metadata edit Search & replace: Update the sample values when changing the type of identifier to search on" + + - title: "Fix recipes with the / character in their names not useable from the command line" + tickets: [1127666] + + - title: "News download: Fix regression that broke downloading of images in gif format" + + - title: "EPUB/AZW3 Output: When splitting the output html on page breaks, handle page-break-after rules correctly, the pre split point html should contain the full element" + + - title: "Fix stdout/stderr redirection temp files not being deleted when restarting calibre from within calibre on windows" + + - title: "E-book viewer: When viewing epub files that have their cover marked as non-linear, show the cover at the start of the book instead of the end." + tickets: [1126030] + + - title: "EPUB Input: Fix handling of cover references with fragments in the urls" + + improved recipes: + - Fronda + - Various Polish news sources + + new recipes: + - title: Pravda + author: Darko Miletic + + - title: PNN + author: n.kucklaender + + - title: Various Polish news sources + author: fenuks + +- version: 0.9.19 + date: 2013-02-15 + + new features: + - title: "New tool: \"Polish books\" that allows you to perform various automated cleanup actions on EPUB and AZW3 files without doing a full conversion." + type: major + description: "Polishing books is all about putting the shine of perfection on your ebook files. You can use it to subset embedded fonts, update the metadata in the book files from the metadata in the calibre library, manipulate the book jacket, etc. More features will be added in the future. To use this tool, go to Preferences->Toolbar and add the Polish books tool to the main toolbar. Then simply select the books you want to be polished and click the Polish books button. 
Polishing, unlike conversion, does not change the internal structure/markup of your book, it performs only the minimal set of actions needed to achieve its goals. Note that polish books is a completely new codebase, so there may well be bugs, polishing a book backs up the original as ORIGINAL_EPUB or ORIGINAL_AZW3, unless you have turned off this feature in Preferences->Tweaks, in which case you should backup your files manually. You can also use this tool from the command line with ebook-polish.exe." + + - title: "Driver for the Trekstor Pyrus Mini." + tickets: [1124120] + + - title: "E-book viewer: Add an option to change the minimum font size." + tickets: [1122333] + + - title: "PDF Output: Add support for converting documents with math typesetting, as described here: http://manual.calibre-ebook.com/typesetting_math.html" + + - title: "Column coloring/icons: Add more conditions when using date based columns with reference to 'today'." + + bug fixes: + - title: "Transforming to titlecase - handle typographic hyphens in all caps phrases" + + - title: "Dont ignore file open events that occur before the GUI is initialized on OS X" + tickets: [1122713] + + - title: "News download: Handle feeds that have entries with empty ids" + + - title: "Fix a regression that broke using the template editor" + + - title: "Do not block startup while scanning the computer for available network interfaces. Speeds up startup time on some windows computers with lots of spurious network interfaces." + + improved recipes: + - New Yorker + - Kommersant + - Le Monde (Subscription version) + - NZ Herald + + new recipes: + - title: Navegalo + author: Douglas Delgado + + - title: El Guardian and More Intelligent Life + author: Darko Miletic + +- version: 0.9.18 + date: 2013-02-08 + + new features: + - title: "New metadata source: Edelweiss, a catalog of books that is updated directly by publishers. To enable it, go to Preferences->Metadata download and enable the Edelweiss plugin." 
+ tickets: [1091073] + + - title: "Add an option to add extra spacing between rows in the book list. (Preferences->Look & Feel)" + tickets: [1117907] + + - title: "Column coloring/icons: Add a 'days ago' condition, useable with columns that store dates to set colors/icons based on the number of days before today" + + - title: "E-book viewer: Add shortcuts Ctrl+= and Ctrl+- to increase/decrease text size." + tickets: [ 1117524 ] + + - title: "When showing possible duplicates after adding books, also show the file formats." + + - title: "Driver for Trekstor Ventos Tablet" + + bug fixes: + - title: "Conversion: When transliterating unicode characters, handle « and » correctly." + tickets: [1117270] + + - title: "Fix adding books from multiple directories with multiple books per directory treating opf files as an ebook" + + - title: "Fix download metadata window not resizable on smaller screens" + tickets: [1116849] + + - title: "Tweak Book: When rebuilding azw3 files handle tags that have name but not id attribute, these are apparently produced by kindlegen." + tickets: [ 1112934 ] + + - title: "Fix regression in advanced column color rules." + tickets: [1118678] + + improved recipes: + - El Mundo today + - fluter.de + - Birmingham Post + - Japan Times + - The Toronto Star + - Le Monde (subscription version) + - Globe and Mail + + new recipes: + - title: VICE Magazine Deutschland + author: Alex + + - title: Libertad Digital + author: Darko Miletic + +- version: 0.9.17 + date: 2013-02-01 + + new features: + - title: "Allow adding user specified icons to the main book list for books whose metadata matches specific criteria. Go to Preferences->Look & Feel->Column icons to setup these icons. They work in the same way as the column coloring rules." + type: major + + - title: "Allow choosing which page of a PDF to use as the cover." + description: "To access this functionality add the PDF to calibre then click the edit metadata button. 
In the top right area of the edit metadata dialog there is a button to get the cover from the ebook file, this will now allow you to choose which page (from the first ten pages) of the pdf to use as the cover." + tickets: [1110019] + + - title: "Add option to turn off reflections in the cover browser (Preferences->Look & Feel->Cover Browser)" + + - title: "PDF Output: Add an option to add page numbers to the bottom of every page in the generated PDF file (look in the PDF Output section of the conversion dialog)" + + - title: "Add the full item name to the tool tip of a leaf item displayed in the tag browser." + tickets: [1106231] + + bug fixes: + - title: "Fix out-of-bounds data causing errors in the Tag Browser" + tickets: [1108017] + + - title: "Conversion: Handle input documents that use multiple prefixes referring to the XHTML namespace correctly." + tickets: [1107220] + + - title: "PDF Output: Fix regression that caused some svg images to be rendered as black rectangles." + tickets: [1105294] + + - title: "Metadata download: Only normalize title case if the result has no language set or its language is English" + + improved recipes: + - Baltimore Sun + - Harvard Business Review + - Victoria Times + - South China Morning Post + - Volksrant + - Seattle Times + + new recipes: + - title: Dob NeviNosti + author: Darko Miletic + + - title: La Nacion (CR) + author: Douglas Delgado + - version: 0.9.16 date: 2013-01-25 diff --git a/imgsrc/polish.svg b/imgsrc/polish.svg new file mode 100644 index 0000000000..7affaaf4bd --- /dev/null +++ b/imgsrc/polish.svg @@ -0,0 +1,366 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + Ulisse Perusin + + + + uli.peru@gmail.com + edit-clear + + + + + + + + + + + + + + + + + + + + + diff --git a/manual/develop.rst b/manual/develop.rst index 719c876b33..823a31b5c2 100644 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -39,27 +39,27 @@ All the 
|app| python code is in the ``calibre`` package. This package contains t * devices - All the device drivers. Just look through some of the built-in drivers to get an idea for how they work. - * For details, see: devices.interface which defines the interface supported by device drivers and devices.usbms which + * For details, see: devices.interface which defines the interface supported by device drivers and ``devices.usbms`` which defines a generic driver that connects to a USBMS device. All USBMS based drivers in |app| inherit from it. * ebooks - All the ebook conversion/metadata code. A good starting point is ``calibre.ebooks.conversion.cli`` which is the - module powering the :command:`ebook-convert` command. The conversion process is controlled via conversion.plumber. - The format independent code is all in ebooks.oeb and the format dependent code is in ebooks.format_name. + module powering the :command:`ebook-convert` command. The conversion process is controlled via ``conversion.plumber``. + The format independent code is all in ``ebooks.oeb`` and the format dependent code is in ``ebooks.format_name``. - * Metadata reading, writing, and downloading is all in ebooks.metadata + * Metadata reading, writing, and downloading is all in ``ebooks.metadata`` * Conversion happens in a pipeline, for the structure of the pipeline, see :ref:`conversion-introduction`. The pipeline consists of an input plugin, various transforms and an output plugin. The that code constructs - and drives the pipeline is in plumber.py. The pipeline works on a + and drives the pipeline is in :file:`plumber.py`. The pipeline works on a representation of an ebook that is like an unzipped epub, with manifest, spine, toc, guide, html content, etc. The - class that manages this representation is OEBBook in oeb/base.py. The + class that manages this representation is OEBBook in ``ebooks.oeb.base``. 
The various transformations that are applied to the book during - conversions live in `oeb/transforms/*.py`. And the input and output - plugins live in `conversion/plugins/*.py`. + conversions live in :file:`oeb/transforms/*.py`. And the input and output + plugins live in :file:`conversion/plugins/*.py`. - * library - The database back-end and the content server. See library.database2 for the interface to the |app| library. library.server is the |app| Content Server. - * gui2 - The Graphical User Interface. GUI initialization happens in gui2.main and gui2.ui. The ebook-viewer is in gui2.viewer. + * library - The database back-end and the content server. See ``library.database2`` for the interface to the |app| library. ``library.server`` is the |app| Content Server. + * gui2 - The Graphical User Interface. GUI initialization happens in ``gui2.main`` and ``gui2.ui``. The ebook-viewer is in ``gui2.viewer``. If you need help understanding the code, post in the `development forum `_ and you will most likely get help from one of |app|'s many developers. diff --git a/manual/faq.rst b/manual/faq.rst index 215b71e860..2d2862e4e6 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -158,13 +158,23 @@ My device is not being detected by |app|? Follow these steps to find the problem: - * Make sure that you are connecting only a single device to your computer at a time. Do not have another |app| supported device like an iPhone/iPad etc. at the same time. - * If you are connecting an Apple iDevice (iPad, iPod Touch, iPhone), use the 'Connect to iTunes' method in the 'Getting started' instructions in `Calibre + Apple iDevices: Start here `_. - * Make sure you are running the latest version of |app|. The latest version can always be downloaded from `the calibre website `_. - * Ensure your operating system is seeing the device. That is, the device should show up in Windows Explorer (in Windows) or Finder (in OS X). 
+ * Make sure that you are connecting only a single device to your computer + at a time. Do not have another |app| supported device like an iPhone/iPad + etc. at the same time. + * If you are connecting an Apple iDevice (iPad, iPod Touch, iPhone), use + the 'Connect to iTunes' method in the 'Getting started' instructions in + `Calibre + Apple iDevices: Start here `_. + * Make sure you are running the latest version of |app|. The latest version + can always be downloaded from `the calibre website `_. + You can tell what version of |app| you are currently running by looking + at the bottom line of the main |app| window. + * Ensure your operating system is seeing the device. That is, the device + should show up in Windows Explorer (in Windows) or Finder (in OS X). * In |app|, go to Preferences->Ignored Devices and check that your device is not being ignored - * If all the above steps fail, go to Preferences->Miscellaneous and click debug device detection with your device attached and post the output as a ticket on `the calibre bug tracker `_. + * If all the above steps fail, go to Preferences->Miscellaneous and click + debug device detection with your device attached and post the output as a + ticket on `the calibre bug tracker `_. My device is non-standard or unusual. What can I do to connect to it? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -240,42 +250,71 @@ If you don't want to uninstall it altogether, there are a couple of tricks you c simplest is to simply re-name the executable file that launches the library program. More detail `in the forums `_. -How do I use |app| with my iPad/iPhone/iTouch? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use |app| with my iPad/iPhone/iPod touch? 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Over the air ^^^^^^^^^^^^^^ -The easiest way to browse your |app| collection on your Apple device (iPad/iPhone/iPod) is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| +The easiest way to browse your |app| collection on your Apple device +(iPad/iPhone/iPod) is by using the |app| content server, which makes your +collection available over the net. First perform the following steps in |app| - * Set the Preferred Output Format in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to iPad (this will work for iPhone/iPods as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button. - * Turn on the Content Server in |app|'s preferences and leave |app| running. + * Set the Preferred Output Format in |app| to EPUB (The output format can be + set under :guilabel:`Preferences->Interface->Behavior`) + * Set the output profile to iPad (this will work for iPhone/iPods as well), + under :guilabel:`Preferences->Conversion->Common Options->Page Setup` + * Convert the books you want to read on your iDevice to EPUB format by + selecting them and clicking the Convert button. + * Turn on the Content Server by clicking the :guilabel:`Connect/Share` button + and leave |app| running. You can also tell |app| to automatically start the + content server via :guilabel:`Preferences->Sharing over the net`. -Now on your iPad/iPhone you have two choices, use either iBooks (version 1.2 and later) or Stanza (version 3.0 and later). Both are available free from the app store. +There are many apps for your iDevice that can connect to |app|. Here we +describe using two of them, iBooks and Stanza. 
Using Stanza *************** -Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPad/iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. In the Add Book Source screen enter whatever name you like and in the URL field, enter the following:: +You should be able to access your books on your iPhone by opening Stanza. Go to +"Get Books" and then click the "Shared" tab. Under Shared you will see an entry +"Books in calibre". If you don't, make sure your iPad/iPhone is connected using +the WiFi network in your house, not 3G. If the |app| catalog is still not +detected in Stanza, you can add it manually in Stanza. To do this, click the +"Shared" tab, then click the "Edit" button and then click "Add book source" to +add a new book source. In the Add Book Source screen enter whatever name you +like and in the URL field, enter the following:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address you computer is assigned on your home network. 
A quick Google +search will tell you how to find out your local IP address. Now click "Save" +and you are done. -If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout. +If you get timeout errors while browsing the calibre catalog in Stanza, try +increasing the connection timeout value in the stanza settings. Go to +Info->Settings and increase the value of Download Timeout. Using iBooks ************** -Start the Safari browser and type in the IP address and port of the computer running the calibre server, like this:: +Start the Safari browser and type in the IP address and port of the computer +running the calibre server, like this:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. -You will see a list of books in Safari, just click on the epub link for whichever book you want to read, Safari will then prompt you to open it with iBooks. +You will see a list of books in Safari, just click on the epub link for +whichever book you want to read, Safari will then prompt you to open it with +iBooks. With the USB cable + iTunes @@ -540,9 +579,9 @@ Yes, you can. 
Follow the instructions in the answer above for adding custom colu How do I move my |app| library from one computer to another? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard wont run. In that case, click the calibre icon in the tooolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the calibre icon on the toolbar. +Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard won't run. In that case, right-click the |app| icon in the toolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the |app| icon on the toolbar. Transferring your library in this manner preserves all your metadata, tags, custom columns, etc. 
-Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also right-click the calibre icon on the tool bar, select Library Maintenance and run the Check Library action. It will warn you about any problems in your library, which you should fix by hand. +Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also right-click the |app| icon on the tool bar, select Library Maintenance and run the Check Library action. It will warn you about any problems in your library, which you should fix by hand. .. note:: A |app| library is just a folder which contains all the book files and their metadata. All the metadata is stored in a single file called metadata.db, in the top level folder. If this file gets corrupted, you may see an empty list of books in |app|. In this case you can ask |app| to restore your books by doing a right-click on the |app| icon in the toolbar and selecting Library Maintenance->Restore Library. @@ -577,7 +616,10 @@ or a Remote Desktop solution. If you must share the actual library, use a file syncing tool like DropBox or rsync or Microsoft SkyDrive instead of a networked drive. Even with these tools there is danger of data corruption/loss, so only do this if you are -willing to live with that risk. +willing to live with that risk. In particular, be aware that **Google Drive** +is incompatible with |app|, if you put your |app| library in Google Drive, you +*will* suffer data loss. See +`this thread `_ for details. Content From The Web --------------------- @@ -653,7 +695,7 @@ Post any output you see in a help message on the `Forum `. diff --git a/manual/gui.rst b/manual/gui.rst index a51ced54d3..98954ebabd 100755 --- a/manual/gui.rst +++ b/manual/gui.rst @@ -537,6 +537,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. 
Thes - Merge selected records, keeping originals * - :kbd:`O` - Open containing folder + * - :kbd:`P` + - Polish books * - :kbd:`S` - Save to Disk * - :kbd:`V` diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 2224937f3c..b02460695e 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -11,7 +11,7 @@ class Adventure_zone(BasicNewsRecipe): max_articles_per_feed = 100 cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' - use_embedded_content=False + use_embedded_content = False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), (re.compile(r''), lambda match: ''), (re.compile(r''), lambda match: '')] @@ -21,7 +21,7 @@ class Adventure_zone(BasicNewsRecipe): extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): + '''def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') tag=soup.find(name='channel') @@ -34,7 +34,7 @@ class Adventure_zone(BasicNewsRecipe): for feed in feeds: for article in feed.articles[:]: article.title=titles[feed.articles.index(article)] - return feeds + return feeds''' '''def get_cover_url(self): @@ -42,16 +42,25 @@ class Adventure_zone(BasicNewsRecipe): cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] return getattr(self, 'cover_url', self.cover_url)''' - + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + article.title = result.group(1) + else: + result = soup.body.find('strong') + if result: + article.title = result.string def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', 
attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer middle-border'}) diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe new file mode 100644 index 0000000000..01499f6369 --- /dev/null +++ b/recipes/badania_net.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class BadaniaNet(BasicNewsRecipe): + title = u'badania.net' + __author__ = 'fenuks' + description = u'chcesz wiedzieć więcej?' 
+ category = 'science' + language = 'pl' + cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] + remove_tags_after = dict(attrs={'class':'omc-single-tags'}) + keep_only_tags = [dict(id='omc-full-article')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/baltimore_sun.recipe b/recipes/baltimore_sun.recipe index 006a4c4ae6..3cd5c8edbc 100644 --- a/recipes/baltimore_sun.recipe +++ b/recipes/baltimore_sun.recipe @@ -19,6 +19,7 @@ class BaltimoreSun(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True remove_javascript = True + #auto_cleanup = True recursions = 1 ignore_duplicate_articles = {'title'} @@ -78,6 +79,7 @@ class BaltimoreSun(BasicNewsRecipe): #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'), #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'), + ## Entertainment ## (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'), (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'), @@ -142,12 +144,12 @@ class BaltimoreSun(BasicNewsRecipe): (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), -## Life Blogs ## - (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), 
- (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), - (u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), - (u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), - (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), +### Life Blogs ## + #(u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), + #(u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), + #(u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), + #(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), + #(u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'Game Cache', u'http://www.baltimoresun.com/entertainment/bthesite/game-cache/rss2.0.xml'), @@ -167,6 +169,7 @@ class BaltimoreSun(BasicNewsRecipe): ] + def get_article_url(self, article): ans = None try: diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..a04f267ca3 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', attrs={'class':'qid click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', @@ -44,6 +44,8 @@ class Bash_org_pl(BasicNewsRecipe): }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] diff --git a/recipes/birmingham_post.recipe b/recipes/birmingham_post.recipe index 49c86fe3b8..db2e29c821 100644 --- a/recipes/birmingham_post.recipe +++ 
b/recipes/birmingham_post.recipe @@ -14,7 +14,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): remove_empty_feeds = True remove_javascript = True no_stylesheets = True - #auto_cleanup = True + auto_cleanup = True language = 'en_GB' cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg' @@ -23,7 +23,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.birminghampost.net') # look for the block containing the sun button and url - cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Birmingham Post')}) + cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Post')}) print print '%%%%%%%%%%%%%%%',cov print @@ -43,20 +43,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): return cover_url - keep_only_tags = [ - dict(attrs={'id' : 'article-header'}), - #dict(name='h1',attrs={'id' : 'article-header'}), - dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}), - dict(name='div',attrs={'class' : 'article-image full'}), - dict(attrs={'clas' : 'art-o art-align-center otm-1 '}), - dict(name='div',attrs={'class' : 'article main'}), -#dict(name='p') - #dict(attrs={'id' : 'three-col'}) - ] - remove_tags = [ - # dict(name='div',attrs={'class' : 'span-33 last header-links'}) - ] feeds = [ #(u'News',u'http://www.birminghampost.net/news/rss.xml'), (u'West Mids. 
News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'), @@ -65,9 +52,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): (u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml') ] - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;text-align:center;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' diff --git a/recipes/democracy_journal.recipe b/recipes/democracy_journal.recipe new file mode 100644 index 0000000000..f02a3b70a8 --- /dev/null +++ b/recipes/democracy_journal.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class AdvancedUserRecipe1361743898(BasicNewsRecipe): + title = u'Democracy Journal' + description = '''A journal of ideas. Published quarterly.''' + __author__ = u'David Nye' + language = 'en' + oldest_article = 90 + max_articles_per_feed = 30 + no_stylesheets = True + auto_cleanup = True + + def parse_index(self): + articles = [] + feeds = [] + soup = self.index_to_soup("http://www.democracyjournal.org") + for x in soup.findAll(href=re.compile("http://www\.democracyjournal\.org/\d*/.*php$")): + url = x.get('href') + title = self.tag_to_string(x) + articles.append({'title':title, 'url':url, 'description':'', 'date':''}) + feeds.append(('Articles', articles)) + return feeds + + def print_version(self, url): + return url + '?page=all' + diff --git a/recipes/discover_magazine.recipe b/recipes/discover_magazine.recipe index 02cdb952b5..a7f080bb5f 100644 --- a/recipes/discover_magazine.recipe +++ b/recipes/discover_magazine.recipe @@ -33,6 +33,21 @@ class DiscoverMagazine(BasicNewsRecipe): remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})] + # Login stuff + needs_subscription = True + use_javascript_to_login = True + requires_version = (0, 9, 20) + + 
def javascript_login(self, br, username, password): + br.visit('http://discovermagazine.com', timeout=120) + f = br.select_form('div.login.section div.form') + f['username'] = username + f['password'] = password + br.submit('input[id="signInButton"]', timeout=120) + br.run_for_a_time(20) + # End login stuff + + def append_page(self, soup, appendtag, position): pager = soup.find('span',attrs={'class':'next'}) if pager: diff --git a/recipes/dobanevinosti.recipe b/recipes/dobanevinosti.recipe new file mode 100644 index 0000000000..9d148be8b4 --- /dev/null +++ b/recipes/dobanevinosti.recipe @@ -0,0 +1,46 @@ + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +dobanevinosti.blogspot.com +''' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class DobaNevinosti(BasicNewsRecipe): + title = 'Doba Nevinosti' + __author__ = 'Darko Miletic' + description = 'Filmski blog' + oldest_article = 15 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = True + publication_type = 'blog' + auto_cleanup = True + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} + img{margin-bottom: 0.8em; display:block;} + """ + + conversion_options = { + 'comment' : description + , 'tags' : 'film, blog, srbija, tv' + , 'publisher': 'Dimitrije Vojinov' + , 'language' : language + } + remove_attributes = ['lang', 'border'] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + feeds = [(u'Tekstovi', u'http://dobanevinosti.blogspot.com/feeds/posts/default')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup + diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index 2b0933b58d..21d3b607d2 100644 --- 
a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -15,7 +15,8 @@ class EkologiaPl(BasicNewsRecipe): no_stylesheets = True remove_empty_feeds = True use_embedded_content = False - remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj']})] + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/el_malpensante.recipe b/recipes/el_malpensante.recipe new file mode 100644 index 0000000000..7a014735b6 --- /dev/null +++ b/recipes/el_malpensante.recipe @@ -0,0 +1,27 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.elmalpensante.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'El Malpensante' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://elmalpensante.com/img/layout/logo.gif' + description = 'El Malpensante' + oldest_article = 30 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Artículos', u'http://www.elmalpensante.com/articulosRSS.php'), + (u'Malpensantías', u'http://www.elmalpensante.com/malpensantiasRSS.php'), + (u'Margaritas', u'http://www.elmalpensante.com/margaritasRSS.php'), +# This one is almost the same as articulos so we leave articles +# (u'Noticias', u'http://www.elmalpensante.com/noticiasRSS.php'), + ] diff --git a/recipes/el_mundo_today.recipe b/recipes/el_mundo_today.recipe index 7f558d10e7..010596b6e5 100644 --- a/recipes/el_mundo_today.recipe +++ b/recipes/el_mundo_today.recipe @@ -3,29 +3,34 @@ from calibre.web.feeds.news import BasicNewsRecipe class ElMundoTodayRecipe(BasicNewsRecipe): title = 'El 
Mundo Today' - __author__ = 'atordo' - description = u'La actualidad del mañana' + description = u'La actualidad del ma\u00f1ana' category = 'Noticias, humor' cover_url = 'http://www.elmundotoday.com/wp-content/themes/EarthlyTouch/images/logo.png' - oldest_article = 30 + oldest_article = 15 max_articles_per_feed = 60 auto_cleanup = False no_stylesheets = True remove_javascript = True language = 'es' use_embedded_content = False + publication_type = 'blog' preprocess_regexps = [ (re.compile(r'.*', re.DOTALL), - lambda match: ''), - #(re.compile(r'^\t{5}$'), lambda match: ''), - #(re.compile(r'\t{5}$'), lambda match: ''), - (re.compile(r'
', re.DOTALL), - lambda match: ''), + lambda match: ''), + (re.compile(r''), + lambda match: ''), + (re.compile(r'
.*', re.DOTALL), + lambda match: '') ] keep_only_tags = [ - dict(name='div', attrs={'class':'post-wrapper'}) + dict(name='div', attrs={'class':'post-wrapper '}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':'social4i'}), + dict(name='span', attrs={'class':'num-comentarios'}) ] remove_attributes = [ 'href', 'title', 'alt' ] @@ -36,8 +41,3 @@ class ElMundoTodayRecipe(BasicNewsRecipe): ''' feeds = [('El Mundo Today', 'http://www.elmundotoday.com/feed/')] - - def get_broser(self): - br = BasicNewsRecipe.get_browser(self) - br.set_handle_gzip(True) - return br diff --git a/recipes/elguardian.recipe b/recipes/elguardian.recipe new file mode 100644 index 0000000000..f5d035dd21 --- /dev/null +++ b/recipes/elguardian.recipe @@ -0,0 +1,93 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +elguardian.com.ar +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ElGuardian(BasicNewsRecipe): + title = 'El Guardian' + __author__ = 'Darko Miletic' + description = "Semanario con todas las tendencias de un pais" + publisher = 'Editorial Apache SA' + category = 'news,politics,Argentina' + oldest_article = 8 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'es_AR' + remove_empty_feeds = True + publication_type = 'magazine' + issn = '1666-7476' + masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png' + extra_css = """ + body{font-family: Arial,sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'series' : title + , 'isbn' : issn + } + + keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})] + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [ + (u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' 
) + ,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' ) + ,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' ) + ,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' ) + ,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' ) + ,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' ) + ,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' ) + ,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' ) + ,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' ) + ,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' ) + ,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml') + ,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' ) + ,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' ) + ] + + def get_cover_url(self): + soup = self.index_to_soup('http://elguardian.com.ar/') + udata = soup.find('div', attrs={'class':'datosNumero'}) + if udata: + sdata = udata.find('div') + if sdata: + stra = re.findall(r'\d+', self.tag_to_string(sdata)) + self.conversion_options.update({'series_index':int(stra[1])}) + unumero = soup.find('div', attrs={'class':'ultimoNumero'}) + if unumero: + img = unumero.find('img', src=True) + if img: + return img['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/eso_pl.recipe b/recipes/eso_pl.recipe new file mode 100644 index 0000000000..5ebb420396 --- /dev/null +++ b/recipes/eso_pl.recipe @@ -0,0 +1,23 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class ESO(BasicNewsRecipe): + title = u'ESO PL' + 
__author__ = 'fenuks' + description = u'ESO, Europejskie Obserwatorium Południowe, buduje i obsługuje najbardziej zaawansowane naziemne teleskopy astronomiczne na świecie' + category = 'astronomy' + language = 'pl' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1922519424/eso-twitter-logo.png' + keep_only_tags = [dict(attrs={'class':'subcl'})] + remove_tags = [dict(id='lang_row'), dict(attrs={'class':['pr_typeid', 'pr_news_feature_link', 'outreach_usage', 'hidden']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.eso.org/public/poland/news/feed/'), (u'Og\u0142oszenia', u'http://www.eso.org/public/poland/announcements/feed/'), (u'Zdj\u0119cie tygodnia', u'http://www.eso.org/public/poland/images/potw/feed/')] + + def preprocess_html(self, soup): + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://www.eso.org' + a['href'] + return soup diff --git a/recipes/financial_times_us.recipe b/recipes/financial_times_us.recipe new file mode 100644 index 0000000000..3821e5ea0e --- /dev/null +++ b/recipes/financial_times_us.recipe @@ -0,0 +1,182 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +http://www.ft.com/intl/us-edition +''' + +import datetime +from calibre.ptempfile import PersistentTemporaryFile +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class FinancialTimes(BasicNewsRecipe): + title = 'Financial Times (US) printed edition' + __author__ = 'Darko Miletic' + description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy." + publisher = 'The Financial Times Ltd.' 
+ category = 'news, finances, politics, UK, World' + oldest_article = 2 + language = 'en' + max_articles_per_feed = 250 + no_stylesheets = True + use_embedded_content = False + needs_subscription = True + encoding = 'utf8' + publication_type = 'newspaper' + articles_are_obfuscated = True + temp_files = [] + masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg' + LOGIN = 'https://registration.ft.com/registration/barrier/login' + LOGIN2 = 'http://media.ft.com/h/subs3.html' + INDEX = 'http://www.ft.com/intl/us-edition' + PREFIX = 'http://www.ft.com' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + br.open(self.INDEX) + if self.username is not None and self.password is not None: + br.open(self.LOGIN2) + br.select_form(name='loginForm') + br['username'] = self.username + br['password'] = self.password + br.submit() + return br + + keep_only_tags = [ + dict(name='div' , attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']}) + ,dict(name='div' , attrs={'class':'standfirst'}) + ,dict(name='div' , attrs={'id' :'storyContent'}) + ,dict(name='div' , attrs={'class':['ft-story-body','index-detail']}) + ,dict(name='h2' , attrs={'class':'entry-title'} ) + ,dict(name='span', attrs={'class':lambda x: x and 'posted-on' in x.split()} ) + ,dict(name='span', attrs={'class':'author_byline'} ) + ,dict(name='div' , attrs={'class':'entry-content'} ) + ] + remove_tags = [ + dict(name='div', attrs={'id':'floating-con'}) + ,dict(name=['meta','iframe','base','object','embed','link']) + ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']}) + ] + remove_attributes = ['width','height','lang'] + + extra_css = """ + body{font-family: Georgia,Times,"Times New Roman",serif} + h2{font-size:large} + .ft-story-header{font-size: x-small} + 
.container{font-size:x-small;} + h3{font-size:x-small;color:#003399;} + .copyright{font-size: x-small} + img{margin-top: 0.8em; display: block} + .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small} + .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif} + """ + + def get_artlinks(self, elem): + articles = [] + count = 0 + for item in elem.findAll('a',href=True): + count = count + 1 + if self.test and count > 2: + return articles + rawlink = item['href'] + url = rawlink + if not rawlink.startswith('http://'): + url = self.PREFIX + rawlink + try: + urlverified = self.browser.open_novisit(url).geturl() # resolve redirect. + except: + continue + title = self.tag_to_string(item) + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :urlverified + ,'description':'' + }) + return articles + + def parse_index(self): + feeds = [] + soup = self.index_to_soup(self.INDEX) + dates= self.tag_to_string(soup.find('div', attrs={'class':'btm-links'}).find('div')) + self.timefmt = ' [%s]'%dates + wide = soup.find('div',attrs={'class':'wide'}) + if not wide: + return feeds + allsections = wide.findAll(attrs={'class':lambda x: x and 'footwell' in x.split()}) + if not allsections: + return feeds + count = 0 + for item in allsections: + count = count + 1 + if self.test and count > 2: + return feeds + fitem = item.h3 + if not fitem: + fitem = item.h4 + ftitle = self.tag_to_string(fitem) + self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle)) + feedarts = self.get_artlinks(item.ul) + feeds.append((ftitle,feedarts)) + return feeds + + def preprocess_html(self, soup): + items = ['promo-box','promo-title', + 'promo-headline','promo-image', + 'promo-intro','promo-link','subhead'] + for item in items: + for it in soup.findAll(item): + it.name = 'div' + it.attrs = [] + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string 
is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup + + def get_cover_url(self): + cdate = datetime.date.today() + if cdate.isoweekday() == 7: + cdate -= datetime.timedelta(days=1) + return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_USA.pdf') + + def get_obfuscated_article(self, url): + count = 0 + while (count < 10): + try: + response = self.browser.open(url) + html = response.read() + count = 10 + except: + print "Retrying download..." + count += 1 + tfile = PersistentTemporaryFile('_fa.html') + tfile.write(html) + tfile.close() + self.temp_files.append(tfile) + return tfile.name + + def cleanup(self): + self.browser.open('https://registration.ft.com/registration/login/logout?location=') \ No newline at end of file diff --git a/recipes/fluter_de.recipe b/recipes/fluter_de.recipe index 1f8576cf81..18ea8e703e 100644 --- a/recipes/fluter_de.recipe +++ b/recipes/fluter_de.recipe @@ -14,26 +14,17 @@ class AdvancedUserRecipe1313693926(BasicNewsRecipe): language = 'de' encoding = 'UTF-8' - __author__ = 'Armin Geller' # 2011-08-19 + __author__ = 'Armin Geller' # 2013-02-05 V3 oldest_article = 7 max_articles_per_feed = 50 - - remove_tags = [ - dict(name='div', attrs={'id':["comments"]}), - dict(attrs={'class':['commentlink']}), - ] - - - keep_only_tags = [ - dict(name='div', attrs={'class':["grid_8 articleText"]}), - dict(name='div', attrs={'class':["articleTextInnerText"]}), - ] - feeds = [ (u'Inhalt:', u'http://www.fluter.de/de/?tpl=907'), ] extra_css = '.cs_img {margin-right: 10pt;}' + def print_version(self, url): + return url + '?tpl=1260' + diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index 6755770329..d0177b998e 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -23,7 +23,6 @@ class 
Fronda(BasicNewsRecipe): extra_css = ''' h1 {font-size:150%} .body {text-align:left;} - div.headline {font-weight:bold} ''' earliest_date = date.today() - timedelta(days=oldest_article) @@ -72,7 +71,7 @@ class Fronda(BasicNewsRecipe): feeds.append((genName, articles[genName])) return feeds - keep_only_tags = [ + keep_only_tags = [ dict(name='div', attrs={'class':'yui-g'}) ] @@ -84,5 +83,7 @@ class Fronda(BasicNewsRecipe): dict(name='ul', attrs={'class':'comment-list'}), dict(name='ul', attrs={'class':'category'}), dict(name='p', attrs={'id':'comments-disclaimer'}), + dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}), + dict(name='div', attrs={'style':'text-align: left; margin-top: 15px;'}), dict(name='div', attrs={'id':'comment-form'}) ] diff --git a/recipes/geopolityka.recipe b/recipes/geopolityka.recipe new file mode 100644 index 0000000000..9749007479 --- /dev/null +++ b/recipes/geopolityka.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class BasicUserRecipe1361379046(BasicNewsRecipe): + title = u'Geopolityka.org' + language = 'pl' + __author__ = 'chemik111' + oldest_article = 15 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Rss', u'http://geopolityka.org/index.php?format=feed&type=rss')] + diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe index a7c78887c5..9ee3c6bd81 100644 --- a/recipes/globe_and_mail.recipe +++ b/recipes/globe_and_mail.recipe @@ -21,6 +21,10 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe): encoding = 'utf8' publisher = 'Globe & Mail' language = 'en_CA' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}' feeds = [ @@ -44,12 +48,12 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe): (re.compile(r'', re.DOTALL), lambda m: ''), ] - remove_tags_before = dict(name='h1') - remove_tags = [ - dict(name='div', 
attrs={'id':['ShareArticles', 'topStories']}), - dict(href=lambda x: x and 'tracking=' in x), - {'class':['articleTools', 'pagination', 'Ads', 'topad', - 'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] + #remove_tags_before = dict(name='h1') + #remove_tags = [ + #dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), + #dict(href=lambda x: x and 'tracking=' in x), + #{'class':['articleTools', 'pagination', 'Ads', 'topad', + #'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index b80b0bace7..d996cf2200 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -11,11 +11,11 @@ class HBR(BasicNewsRecipe): timefmt = ' [%B %Y]' language = 'en' no_stylesheets = True - recipe_disabled = ('hbr.org has started requiring the use of javascript' - ' to log into their website. This is unsupported in calibre, so' - ' this recipe has been disabled. If you would like to see ' - ' HBR supported in calibre, contact hbr.org and ask them' - ' to provide a javascript free login method.') + # recipe_disabled = ('hbr.org has started requiring the use of javascript' + # ' to log into their website. This is unsupported in calibre, so' + # ' this recipe has been disabled. 
If you would like to see ' + # ' HBR supported in calibre, contact hbr.org and ask them' + # ' to provide a javascript free login method.') LOGIN_URL = 'https://hbr.org/login?request_url=/' LOGOUT_URL = 'https://hbr.org/logout?request_url=/' @@ -38,46 +38,38 @@ class HBR(BasicNewsRecipe): #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} ''' + use_javascript_to_login = True - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - self.logout_url = None - - #''' - br.open(self.LOGIN_URL) - br.select_form(name='signin-form') - br['signin-form:username'] = self.username - br['signin-form:password'] = self.password - raw = br.submit().read() - if '>Sign out<' not in raw: - raise Exception('Failed to login, are you sure your username and password are correct?') + def javascript_login(self, br, username, password): + from calibre.web.jsbrowser.browser import Timeout try: - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url - except: - self.logout_url = self.LOGOUT_URL - #''' - return br - - def cleanup(self): - if self.logout_url is not None: - self.browser.open(self.logout_url) + br.visit('https://hbr.org/login?request_url=/', timeout=20) + except Timeout: + pass + br.click('#accordion div[tabindex="0"]', wait_for_load=False) + f = br.select_form('#signin-form') + f['signin-form:username'] = username + f['signin-form:password'] = password + br.submit(wait_for_load=False) + br.run_for_a_time(30) def map_url(self, url): if url.endswith('/ar/1'): return url[:-1]+'pr' - def hbr_get_toc(self): - #return self.index_to_soup(open('/t/hbr.html').read()) + # return self.index_to_soup(open('/t/toc.html').read()) today = date.today() future = today + timedelta(days=30) - for x in [x.strftime('%y%m') for x in (future, today)]: + past = today - timedelta(days=30) + for 
x in [x.strftime('%y%m') for x in (future, today, past)]: url = self.INDEX + x soup = self.index_to_soup(url) - if not soup.find(text='Issue Not Found'): + if (not soup.find(text='Issue Not Found') and not soup.find( + text="We're Sorry. There was an error processing your request") + and 'Exception: java.io.FileNotFoundException' not in + unicode(soup)): return soup raise Exception('Could not find current issue') @@ -85,8 +77,9 @@ class HBR(BasicNewsRecipe): feeds = [] current_section = None articles = [] - for x in soup.find(id='archiveToc').findAll(['h3', 'h4']): - if x.name == 'h3': + for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']): + if x.name == 'h4': + if x.get('class', None) == 'basic':continue if current_section is not None and articles: feeds.append((current_section, articles)) current_section = self.tag_to_string(x).capitalize() @@ -102,7 +95,7 @@ class HBR(BasicNewsRecipe): if url.startswith('/'): url = 'http://hbr.org' + url url = self.map_url(url) - p = x.parent.find('p') + p = x.find('p', attrs={'class':'author'}) desc = '' if p is not None: desc = self.tag_to_string(p) @@ -114,10 +107,9 @@ class HBR(BasicNewsRecipe): 'date':''}) return feeds - def parse_index(self): soup = self.hbr_get_toc() - #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) + # open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) feeds = self.hbr_parse_toc(soup) return feeds diff --git a/recipes/hnonline.recipe b/recipes/hnonline.recipe new file mode 100644 index 0000000000..d9faafd0f1 --- /dev/null +++ b/recipes/hnonline.recipe @@ -0,0 +1,68 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class HNonlineRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'lacike' + language = 'sk' + version = 1 + + title = u'HNonline' + publisher = u'HNonline' + category = u'News, Newspaper' + description = u'News from Slovakia' + cover_url = u'http://hnonline.sk/img/sk/_relaunch/logo2.png' + + oldest_article = 1 + 
max_articles_per_feed = 100 + use_embedded_content = False + remove_empty_feeds = True + + no_stylesheets = True + remove_javascript = True + + # Feeds from: http://rss.hnonline.sk, for listing see http://rss.hnonline.sk/prehlad + feeds = [] + feeds.append((u'HNonline|Ekonomika a firmy', u'http://rss.hnonline.sk/?p=kC1000')) + feeds.append((u'HNonline|Slovensko', u'http://rss.hnonline.sk/?p=kC2000')) + feeds.append((u'HNonline|Svet', u'http://rss.hnonline.sk/?p=kC3000')) + feeds.append((u'HNonline|\u0160port', u'http://rss.hnonline.sk/?p=kC4000')) + feeds.append((u'HNonline|Online rozhovor', u'http://rss.hnonline.sk/?p=kCR000')) + + feeds.append((u'FinWeb|Spr\u00E1vy zo sveta financi\u00ED', u'http://rss.finweb.hnonline.sk/spravodajstvo')) + feeds.append((u'FinWeb|Koment\u00E1re a anal\u00FDzy', u'http://rss.finweb.hnonline.sk/?p=kPC200')) + feeds.append((u'FinWeb|Invest\u00EDcie', u'http://rss.finweb.hnonline.sk/?p=kPC300')) + feeds.append((u'FinWeb|Svet akci\u00ED', u'http://rss.finweb.hnonline.sk/?p=kPC400')) + feeds.append((u'FinWeb|Rozhovory', u'http://rss.finweb.hnonline.sk/?p=kPC500')) + feeds.append((u'FinWeb|T\u00E9ma t\u00FD\u017Ed\u0148a', u'http://rss.finweb.hnonline.sk/?p=kPC600')) + feeds.append((u'FinWeb|Rebr\u00ED\u010Dky', u'http://rss.finweb.hnonline.sk/?p=kPC700')) + + feeds.append((u'HNstyle|Kult\u00FAra', u'http://style.hnonline.sk/?p=kTC100')) + feeds.append((u'HNstyle|Auto-moto', u'http://style.hnonline.sk/?p=kTC200')) + feeds.append((u'HNstyle|Digit\u00E1l', u'http://style.hnonline.sk/?p=kTC300')) + feeds.append((u'HNstyle|Veda', u'http://style.hnonline.sk/?p=kTCV00')) + feeds.append((u'HNstyle|Dizajn', u'http://style.hnonline.sk/?p=kTC400')) + feeds.append((u'HNstyle|Cestovanie', u'http://style.hnonline.sk/?p=kTCc00')) + feeds.append((u'HNstyle|V\u00EDkend', u'http://style.hnonline.sk/?p=kTC800')) + feeds.append((u'HNstyle|Gastro', u'http://style.hnonline.sk/?p=kTC600')) + feeds.append((u'HNstyle|M\u00F3da', 
u'http://style.hnonline.sk/?p=kTC700')) + feeds.append((u'HNstyle|Modern\u00E1 \u017Eena', u'http://style.hnonline.sk/?p=kTCA00')) + feeds.append((u'HNstyle|Pre\u010Do nie?!', u'http://style.hnonline.sk/?p=k7C000')) + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'h1', attrs = {'class': 'detail-titulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-podtitulek'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-perex'})) + keep_only_tags.append(dict(name = 'div', attrs = {'class': 'detail-text'})) + + remove_tags = [] + #remove_tags.append(dict(name = 'div', attrs = {'id': re.compile('smeplayer.*')})) + + remove_tags_after = [] + #remove_tags_after = [dict(name = 'p', attrs = {'class': 'autor_line'})] + + extra_css = ''' + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/LiberationSans.ttf)} + body {font-family: sans1, serif1;} + ''' \ No newline at end of file diff --git a/recipes/icons/badania_net.png b/recipes/icons/badania_net.png new file mode 100644 index 0000000000..de915de8d1 Binary files /dev/null and b/recipes/icons/badania_net.png differ diff --git a/recipes/icons/elguardian.png b/recipes/icons/elguardian.png new file mode 100644 index 0000000000..a54b067ee4 Binary files /dev/null and b/recipes/icons/elguardian.png differ diff --git a/recipes/icons/eso_pl.png b/recipes/icons/eso_pl.png new file mode 100644 index 0000000000..4f3319fece Binary files /dev/null and b/recipes/icons/eso_pl.png differ diff --git a/recipes/icons/financial_times_us.png b/recipes/icons/financial_times_us.png new file mode 100644 index 0000000000..2a769d9dbb Binary files /dev/null and b/recipes/icons/financial_times_us.png differ diff --git a/recipes/icons/hnonline.png b/recipes/icons/hnonline.png new file mode 100644 index 0000000000..1e073839ad Binary files /dev/null and b/recipes/icons/hnonline.png differ diff --git 
a/recipes/icons/kurier_galicyjski.png b/recipes/icons/kurier_galicyjski.png new file mode 100644 index 0000000000..4d66a15122 Binary files /dev/null and b/recipes/icons/kurier_galicyjski.png differ diff --git a/recipes/icons/libertad_digital.png b/recipes/icons/libertad_digital.png new file mode 100644 index 0000000000..83ed5a6dda Binary files /dev/null and b/recipes/icons/libertad_digital.png differ diff --git a/recipes/icons/more_intelligent_life.png b/recipes/icons/more_intelligent_life.png new file mode 100644 index 0000000000..4fcf66e9a1 Binary files /dev/null and b/recipes/icons/more_intelligent_life.png differ diff --git a/recipes/icons/nauka_w_polsce.png b/recipes/icons/nauka_w_polsce.png new file mode 100644 index 0000000000..0d872ce682 Binary files /dev/null and b/recipes/icons/nauka_w_polsce.png differ diff --git a/recipes/icons/nezavisne_novine.png b/recipes/icons/nezavisne_novine.png new file mode 100644 index 0000000000..29da3de24f Binary files /dev/null and b/recipes/icons/nezavisne_novine.png differ diff --git a/recipes/icons/osworld_pl.png b/recipes/icons/osworld_pl.png new file mode 100644 index 0000000000..97a7d0dd55 Binary files /dev/null and b/recipes/icons/osworld_pl.png differ diff --git a/recipes/icons/pravda_rs.png b/recipes/icons/pravda_rs.png new file mode 100644 index 0000000000..8c4533a79d Binary files /dev/null and b/recipes/icons/pravda_rs.png differ diff --git a/recipes/icons/ubuntu_pomoc_org.png b/recipes/icons/ubuntu_pomoc_org.png new file mode 100644 index 0000000000..a143846630 Binary files /dev/null and b/recipes/icons/ubuntu_pomoc_org.png differ diff --git a/recipes/icons/wprost_rss.png b/recipes/icons/wprost_rss.png new file mode 100644 index 0000000000..5ce1b5563d Binary files /dev/null and b/recipes/icons/wprost_rss.png differ diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ 
from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'

Zobacz:.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

')) + img.insert(len(img.contents), bs('

')) + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://kuriergalicyjski.com' + a['href'] + return soup diff --git a/recipes/la_nacion_cr.recipe b/recipes/la_nacion_cr.recipe new file mode 100644 index 0000000000..ae320064d6 --- /dev/null +++ b/recipes/la_nacion_cr.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class crnews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'La Nacion' + publisher = 'GRUPO NACION GN, S. A.' + description = 'Diario de circulacion nacional de Costa Rica. Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre por Kovid Goyal' + category = 'Spanish, Entertainment' + masthead_url = 'http://www.nacion.com/App_Themes/nacioncom/Images/logo_nacioncom.png' + + oldest_article = 7 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Portada', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=portada'), (u'Ultima Hora', u'http://www.nacion.com/Generales/RSS/UltimaHoraRss.aspx'), (u'Nacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=elpais'), (u'Entretenimiento', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=entretenimiento'), (u'Sucesos', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=sucesos'), (u'Deportes', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=deportes'), (u'Internacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=mundo'), (u'Economia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=economia'), (u'Aldea Global', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=aldeaglobal'), (u'Tecnologia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=tecnologia'), (u'Opinion', 
u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=opinion')] + + def get_cover_url(self): + index = 'http://kiosko.net/cr/np/cr_nacion.html' + soup = self.index_to_soup(index) + for image in soup.findAll('img',src=True): + if image['src'].endswith('cr_nacion.750.jpg'): + return image['src'] + return None + + def get_article_url(self, article): + url = article.get('guid', None) + return url + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 8da4a97627..dc9fa9d36f 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,132 +1,94 @@ -#!/usr/bin/env python - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +__author__ = 'Sylvain Durand ' __license__ = 'GPL v3' -__copyright__ = '2012, Rémi Vanicat ' -''' -Lemonde.fr: Version abonnée -''' - -import os, zipfile, re, time +import time from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile +from urllib2 import HTTPError -class LeMondeAbonne(BasicNewsRecipe): +class LeMonde(BasicNewsRecipe): - title = u'Le Monde: Édition abonnés' - __author__ = u'Rémi Vanicat' - description = u'Actualités' - category = u'Actualités, France, Monde' - language = 'fr' - needs_subscription = True + title = u'Le Monde: Édition abonnés' + __author__ = 'Sylvain Durand' + description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.' 
+ language = 'fr' + encoding = 'utf8' - no_stylesheets = True + needs_subscription = True - extra_css = u''' - h1{font-size:130%;} - .ariane{font-size:xx-small;} - .source{font-size:xx-small;} - .href{font-size:xx-small;} - .LM_caption{color:#666666; font-size:x-small;} - .main-article-info{font-family:Arial,Helvetica,sans-serif;} - #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - ''' + date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' + login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' + masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png' + couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' - zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' - coverurl_format = '/img/%y%m%d01.jpg' - path_format = "%y%m%d" - login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + extra_css = ''' + img{max-width:100%} + h1{font-size:1.2em !important; line-height:1.2em !important; } + h2{font-size:1em !important; line-height:1em !important; } + h3{font-size:1em !important; text-transform:uppercase !important; color:#666;} + #photo{text-align:center !important; margin:10px 0 -8px;} + #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} ''' - keep_only_tags = [ dict(name="div", attrs={ 'class': 'po-prti' }), dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] + keep_only_tags 
= [dict(name=['h1','h2','h3','div','txt'])] - article_id_pattern = re.compile("[0-9]+\\.html") - article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' + def __init__(self, options, log, progress_reporter): + BasicNewsRecipe.__init__(self, options, log, progress_reporter) + br = BasicNewsRecipe.get_browser(self) + second = time.time() + 24*60*60 + for i in range(7): + self.date = time.gmtime(second) + try: + br.open(time.strftime(self.date_url,self.date)) + break + except HTTPError: + second -= 24*60*60 + self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ') def get_browser(self): br = BasicNewsRecipe.get_browser(self) - if self.username is not None and self.password is not None: - br.open(self.login_url) - br.select_form(nr=0) - br['login'] = self.username - br['password'] = self.password - br.submit() + br.open(self.login_url) + br.select_form(nr=0) + br['login'] = self.username + br['password'] = self.password + br.submit() return br - decalage = 24 * 60 * 60 # today Monde has tomorow date - def get_cover_url(self): - url = time.strftime(self.coverurl_format, self.ltime) - return self.articles_path + url + url = time.strftime(self.couverture_url,self.date) + return url def parse_index(self): - browser = self.get_browser() - - second = time.time() - second += self.decalage - ltime = self.ltime = time.gmtime(second) - url = time.strftime(self.zipurl_format, ltime) - - self.timefmt=strftime(" %A %d %B %Y", ltime) - - response = browser.open(url) - - tmp = PersistentTemporaryFile(suffix='.zip') - self.report_progress(0.1,_('downloading zip file')) - tmp.write(response.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0.1,_('extracting zip file')) - - zfile.extractall(self.output_dir) - zfile.close() - - path = os.path.join(self.output_dir, time.strftime(self.path_format, ltime), "data") - - self.articles_path = path - - files = os.listdir(path) - - nb_index_files = len([ 
name for name in files if re.match("frame_gauche_[0-9]+.html", name) ]) - - flux = [] - - article_url = time.strftime(self.article_url_format, ltime) - - for i in range(nb_index_files): - filename = os.path.join(path, "selection_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup=BeautifulSoup(tmp) - title=soup.find('span').contents[0] - tmp.close() - - filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup = BeautifulSoup(tmp) + url = time.strftime(self.journal_url,self.date) + soup = self.index_to_soup(url).sommaire + sections = [] + for sec in soup.findAll("section"): articles = [] - for link in soup.findAll("a"): - article_file = link['href'] - article_id=self.article_id_pattern.search(article_file).group() - article = { - 'title': link.contents[0], - 'url': article_url + article_id, - 'descripion': '', - 'content': '' - } - articles.append(article) - tmp.close() + if sec['cahier'] != "Le Monde": + for col in sec.findAll("fnts"): + col.extract() + if sec['cahier']=="Le Monde Magazine": + continue + for art in sec.findAll("art"): + if art.txt.string and art.ttr.string: + if art.find(['url']): + art.insert(6,'

') + if art.find(['lgd']) and art.find(['lgd']).string: + art.insert(7,'
'+art.find(['lgd']).string+'
') + article = ""+unicode(art)+"" + article = article.replace('','').replace(' oC ','°C ') + article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>') + f = PersistentTemporaryFile() + f.write(article) + articles.append({'title':art.ttr.string,'url':"file:///"+f.name}) + sections.append((sec['nom'], articles)) + return sections - flux.append((title, articles)) - - return flux - - - -# Local Variables: -# mode: python -# End: + def preprocess_html(self, soup): + for lgd in soup.findAll(id="lgd"): + lgd.contents[-1].extract() + return soup diff --git a/recipes/libertad_digital.recipe b/recipes/libertad_digital.recipe new file mode 100644 index 0000000000..1a35e6995a --- /dev/null +++ b/recipes/libertad_digital.recipe @@ -0,0 +1,65 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +www.libertaddigital.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LibertadDigital(BasicNewsRecipe): + title = 'Libertad Digital' + __author__ = 'Darko Miletic' + description = 'En Libertad Digital encontraras noticias y opinion sobre: España, el Mundo, Internet, sociedad, economia y deportes' + publisher = 'Libertad Digital S.A.' 
+ category = 'noticias, ultima hora, españa, internet, mundo, economia, sociedad, Libertad Digital' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'es' + remove_empty_feeds = True + publication_type = 'website' + masthead_url = 'http://s.libertaddigital.com/images/logo.gif' + extra_css = """ + body{font-family: Verdana,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link','iframe','embed','object']) + ,dict(name='p', attrs={'class':'copyright'}) + ] + remove_attributes=['lang'] + + + feeds = [ + (u'Portada' , u'http://feeds2.feedburner.com/libertaddigital/deportes' ) + ,(u'Opinion' , u'http://feeds2.feedburner.com/libertaddigital/opinion' ) + ,(u'España' , u'http://feeds2.feedburner.com/libertaddigital/nacional' ) + ,(u'Internacional', u'http://feeds2.feedburner.com/libertaddigital/internacional') + ,(u'Libre Mercado', u'http://feeds2.feedburner.com/libertaddigital/economia' ) + ,(u'Chic' , u'http://feeds2.feedburner.com/libertaddigital/el-candelabro') + ,(u'Internet' , u'http://feeds2.feedburner.com/libertaddigital/internet' ) + ,(u'Deportes' , u'http://feeds2.feedburner.com/libertaddigital/deportes' ) + ] + + def get_article_url(self, article): + return article.get('guid', None) + + def print_version(self, url): + art, sep, rest = url.rpartition('/') + aart, asep, artid = art.rpartition('-') + return 'http://www.libertaddigital.com/c.php?op=imprimir&id=' + artid + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index 741397d08a..1eaa08d23a 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -1,5 +1,5 @@ # 
vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): title = u'Młody technik' @@ -9,7 +9,19 @@ class Mlody_technik(BasicNewsRecipe): language = 'pl' cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + preprocess_regexps = [(re.compile(r"

Podobne

", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] diff --git a/recipes/more_intelligent_life.recipe b/recipes/more_intelligent_life.recipe new file mode 100644 index 0000000000..e90f883080 --- /dev/null +++ b/recipes/more_intelligent_life.recipe @@ -0,0 +1,67 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +moreintelligentlife.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class MoreIntelligentLife(BasicNewsRecipe): + title = 'More Intelligent Life' + __author__ = 'Darko Miletic' + description = "More Intelligent Life (moreintelligentlife.com) is the online version of Intelligent Life, a lifestyle and culture magazine from The Economist. The website offers not only content from the print edition, trickled out over the course of its shelf-life, but also the Editors' Blog, which carries daily posts from the editorial team-quickfire observations and opinions that allow readers to eavesdrop on the conversation in the office." 
+ publisher = 'The Economist Newspaper ltd' + category = 'arts,lifestyle,intelligent life,the economist,ideas,style,culture' + oldest_article = 60 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'website' + extra_css = """ + body{font-family: Arial,"Helvetica neue","Bitstream Vera Sans",sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [dict(attrs={'class':'node'})] + remove_tags_after = dict(attrs={'class':'tags'}) + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [(u'Articles', u'http://feeds.feedburner.com/MoreintelligentlifeTotal')] + + def get_cover_url(self): + soup = self.index_to_soup('http://moreintelligentlife.com/') + for image in soup.findAll('img', src=True): + if image['src'].startswith('http://moreintelligentlife.com/files/covers/current_issue_'): + return image['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe new file mode 100644 index 0000000000..c524c18b26 --- /dev/null +++ b/recipes/nauka_w_polsce.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class NaukawPolsce(BasicNewsRecipe): + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = u'Serwis Nauka w 
Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' + category = 'science' + language = 'pl' + cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + index = 'http://www.naukawpolsce.pl' + keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] + remove_tags = [dict(name='div', attrs={'class':'tagi'})] + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + for i in soup.findAll(name='div', attrs={'class':'aktualnosci-margines lista-depesz information-content'}): + title = i.h1.a.string + url = self.index + i.h1.a['href'] + date = '' #i.span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Historia i kultura", self.find_articles('http://www.naukawpolsce.pl/historia-i-kultura/'))) + feeds.append((u"Kosmos", self.find_articles('http://www.naukawpolsce.pl/kosmos/'))) + feeds.append((u"Przyroda", self.find_articles('http://www.naukawpolsce.pl/przyroda/'))) + feeds.append((u"Społeczeństwo", self.find_articles('http://www.naukawpolsce.pl/spoleczenstwo/'))) + feeds.append((u"Technologie", self.find_articles('http://www.naukawpolsce.pl/technologie/'))) + feeds.append((u"Uczelnie", self.find_articles('http://www.naukawpolsce.pl/uczelnie/'))) + feeds.append((u"Nauki medyczne", self.find_articles('http://www.naukawpolsce.pl/zdrowie/'))) + + return feeds + + def preprocess_html(self, soup): + for p in soup.findAll(name='p', text=re.compile(' ')): + p.extract() + return soup diff --git a/recipes/navegalo.recipe 
b/recipes/navegalo.recipe new file mode 100644 index 0000000000..89f6cde45d --- /dev/null +++ b/recipes/navegalo.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1360354988(BasicNewsRecipe): + title = u'Navegalo.com' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + +from calibre.web.feeds.news import BasicNewsRecipe + +class navegalonews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'Navegalo.com' + publisher = 'Navegalo.com' + description = 'Noticias actualizadas por Navegalo.com. Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre' + category = 'Spanish, Entertainment' + masthead_url = 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQZhML5lwsdss6FFF7CFR0Sf-Ln052Zmhs1TlIOcAL8JWN8a-dPlA' + + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Nacionales', u'http://www.navegalo.com/es/noticias/noticias/noticias-nacionales?format=feed&type=rss'), (u'Internacionales', u'http://direct.navegalo.com/es/noticias/noticias/noticias-internacionales?format=feed&type=rss'), (u'Deportes', u'http://direct.navegalo.com/es/noticias/noticias/deportes-nacionales?format=feed&type=rss'), (u'Solo futbol', u'http://www.navegalo.com/es/noticias/noticias/solo-futbol?format=feed&type=rss'), (u'Entretenimiento', u'http://www.navegalo.com/es/noticias/noticias/entretenimiento?format=feed&type=rss'), (u'Solo para ellas', u'http://www.navegalo.com/es/noticias/noticias/solo-para-ellas?format=feed&type=rss'), (u'Infiltrados', u'http://direct.navegalo.com/es/noticias/noticias/infiltrados?format=feed&type=rss'), (u'Mano a mano', u'http://direct.navegalo.com/es/noticias/noticias/mano-a-mano?format=feed&type=rss')] + + + + + extra_css = ''' + 
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' + diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index c5f1b0aff2..2730b45d6d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = '2008-2013, Darko Miletic ' ''' newyorker.com ''' @@ -44,20 +44,18 @@ class NewYorker(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [ - dict(name='div', attrs={'class':'headers'}) - ,dict(name='div', attrs={'id':['articleheads','items-container','articleRail','articletext','photocredits']}) - ] + keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})] remove_tags = [ dict(name=['meta','iframe','base','link','embed','object']) - ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] }) + ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] }) ,dict(attrs={'id':['show-header','show-footer'] }) ] + remove_tags_after = dict(attrs={'class':'entry-content'}) remove_attributes = ['lang'] feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')] def print_version(self, url): - return url + '?printable=true' + return url + '?printable=true¤tPage=all' def image_url_processor(self, baseurl, url): return url.strip() diff --git a/recipes/nezavisne_novine.recipe b/recipes/nezavisne_novine.recipe new file mode 100644 index 0000000000..357c478ce1 --- /dev/null +++ b/recipes/nezavisne_novine.recipe @@ -0,0 +1,59 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +www.nezavisne.com +''' +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class NezavisneNovine(BasicNewsRecipe): + title = 'Nezavisne novine' + __author__ = 
'Darko Miletic' + description = 'Nezavisne novine - Najnovije vijesti iz BiH, Srbije, Hrvatske, Crne Gore i svijeta' + publisher = 'NIGP "DNN"' + category = 'news, politics, Bosnia, Balcans' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'sr' + remove_empty_feeds = True + publication_type = 'newspaper' + cover_url = strftime('http://pdf.nezavisne.com/slika/novina/nezavisne_novine.jpg?v=%Y%m%d') + masthead_url = 'http://www.nezavisne.com/slika/osnova/nezavisne-novine-logo.gif' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + keep_only_tags = [dict(name='div', attrs={'class':'vijest'})] + remove_tags_after = dict(name='div', attrs={'id':'wrap'}) + remove_tags = [ + dict(name=['meta','link','iframe','object']) + ,dict(name='div', attrs={'id':'wrap'}) + ] + remove_attributes=['lang','xmlns:fb','xmlns:og'] + + + feeds = [ + (u'Novosti' , u'http://feeds.feedburner.com/Novosti-NezavisneNovine' ) + ,(u'Posao' , u'http://feeds.feedburner.com/Posao-NezavisneNovine' ) + ,(u'Sport' , u'http://feeds.feedburner.com/Sport-NezavisneNovine' ) + ,(u'Komentar' , u'http://feeds.feedburner.com/Komentari-NezavisneNovine' ) + ,(u'Umjetnost i zabava' , u'http://feeds.feedburner.com/UmjetnostIZabava-NezavisneNovine' ) + ,(u'Život i stil' , u'http://feeds.feedburner.com/ZivotIStil-NezavisneNovine' ) + ,(u'Auto' , u'http://feeds.feedburner.com/Auto-NezavisneNovine' ) + ,(u'Nauka i tehnologija', u'http://feeds.feedburner.com/NaukaITehnologija-NezavisneNovine') + ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/nzherald.recipe b/recipes/nzherald.recipe index b73fd8366e..46242b630a 100644 --- a/recipes/nzherald.recipe +++ 
b/recipes/nzherald.recipe @@ -25,7 +25,7 @@ class NewZealandHerald(BasicNewsRecipe): 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'), ('Life & Style', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'), - ('Technology' + ('Technology', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'), ('Sport', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'), diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe new file mode 100644 index 0000000000..7784a271e0 --- /dev/null +++ b/recipes/osworld_pl.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OSWorld(BasicNewsRecipe): + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' 
+ category = 'OS, IT, open source, Linux' + language = 'pl' + cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id=['dzial', 'posts'])] + remove_tags = [dict(attrs={'class':'post-comments'})] + remove_tags_after = dict(attrs={'class':'entry clr'}) + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + + def append_page(self, soup, appendtag): + tag = appendtag.find(attrs={'id':'paginacja'}) + if tag: + for nexturl in tag.findAll('a'): + soup2 = self.index_to_soup(nexturl['href']) + pagetext = soup2.find(attrs={'class':'entry clr'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'id':'paginacja'}): + r.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['chapters']}) - ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 
'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 
'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/pnn.recipe b/recipes/pnn.recipe new file mode 100644 index 0000000000..cb36afe88b --- /dev/null +++ b/recipes/pnn.recipe @@ -0,0 +1,55 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the PNN to an ebook.''' + +class SportsIllustratedRecipe(BasicNewsRecipe) : + __author__ = 'n.kucklaender' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + language = 'de' + description = 'PNN RSS' + version = 1 + title = u'PNN' + timefmt = ' [%d.%m.%Y]' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_empty_feeds = True + remove_tags = [dict(attrs={'class':['um-weather um-header-weather','um-has-sub um-mainnav','um-box','ts-products','um-meta-nav','um-box um-last','um-footer','um-footer-links','share hidden','um-buttons']}),dict(id=['dinsContainer'])] + # remove_tags_before = [dict(name='div', attrs={'class':'um-first'})] + # remove_tags_after = [dict(name='div', attrs={'class':'um-metabar'})] + + feeds = [(u'Titelseite', u'http://www.pnn.de/rss.xml'), + (u'Dritte Seite', u'http://www.pnn.de/dritte-seite/rss.xml'), + (u'Politik', u'http://www.pnn.de/politik/rss.xml'), + (u'Meinung', u'http://www.pnn.de/meinung/rss.xml'), + (u'Potsdam', u'http://www.pnn.de/potsdam/rss.xml'), + (u'Havel-Spree', u'http://www.pnn.de/havel-spree/rss.xml'), + (u'Potsdam-Mittelmark', u'http://www.pnn.de/pm/rss.xml'), + (u'Berlin-Brandenburg', u'http://www.pnn.de/brandenburg-berlin/rss.xml'), + (u'Wirtschaft', 
u'http://www.pnn.de/wirtschaft/rss.xml'), + (u'Sport', u'http://www.pnn.de/sport/rss.xml'), + (u'Regionalsport', u'http://www.pnn.de/regionalsport/rss.xml'), + (u'Kultur', u'http://www.pnn.de/kultur/rss.xml'), + (u'Potsdam-Kultur', u'http://www.pnn.de/potsdam-kultur/rss.xml'), + (u'Wissen', u'http://www.pnn.de/wissen/rss.xml'), + (u'Medien', u'http://www.pnn.de/medien/rss.xml'), + (u'Weltspiegel', u'http://www.pnn.de/weltspiegel/rss.xml'), + (u'Wissenschaft', u'http://www.pnn.de/campus/rss.xml'), + (u'Mobil', u'http://www.pnn.de/mobil/rss.xml'), + (u'Reise', u'http://www.pnn.de/reise/rss.xml'), + (u'Ratgeber', u'http://www.pnn.de/ratgeber/rss.xml'), + (u'Fragen des Tages', u'http://www.pnn.de/fragen-des-tages/rss.xml'), + # (u'Potsdam bin ich', u'http://www.pnn.de/potsdam-bin-ich/rss.xml'), + (u'Leserbriefe', u'http://www.pnn.de/leserbriefe/rss.xml')] + + def get_masthead_url(self): + return 'http://www.pnn.de/app/base/img/pnn_logo.png' + + def print_version(self, url): + return url.replace('.html', ',view,printVersion.html') + diff --git a/recipes/pravda_rs.recipe b/recipes/pravda_rs.recipe new file mode 100644 index 0000000000..742527ac2b --- /dev/null +++ b/recipes/pravda_rs.recipe @@ -0,0 +1,85 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' + +''' +www.pravda.rs +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Pravda_rs(BasicNewsRecipe): + title = 'Dnevne novine Pravda' + __author__ = 'Darko Miletic' + description = '24 sata portal vesti iz Srbije' + publisher = 'Dnevne novine Pravda' + category = 'news, politics, entertainment, Serbia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'sr' + publication_type = 'newspaper' + remove_empty_feeds = True + PREFIX = 'http://www.pravda.rs' + FEEDPR = PREFIX + '/category/' + LANGLAT = '?lng=lat' + FEEDSU = '/feed/' + LANGLAT 
+ INDEX = PREFIX + LANGLAT + masthead_url = 'http://www.pravda.rs/wp-content/uploads/2012/09/logoof.png' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + body{font-family: Georgia,"Times New Roman",Times,serif1,serif;} + img{display: block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + keep_only_tags = [dict(name='div', attrs={'class':'post'})] + remove_tags = [dict(name='h3')] + remove_tags_after = dict(name='h3') + + feeds = [ + (u'Politika' , FEEDPR + 'politika/' + FEEDSU), + (u'Tema Dana', FEEDPR + 'tema-dana/' + FEEDSU), + (u'Hronika' , FEEDPR + 'hronika/' + FEEDSU), + (u'Društvo' , FEEDPR + 'drustvo/' + FEEDSU), + (u'Ekonomija', FEEDPR + 'ekonomija/' + FEEDSU), + (u'Srbija' , FEEDPR + 'srbija/' + FEEDSU), + (u'Beograd' , FEEDPR + 'beograd/' + FEEDSU), + (u'Kultura' , FEEDPR + 'kultura/' + FEEDSU), + (u'Zabava' , FEEDPR + 'zabava/' + FEEDSU), + (u'Sport' , FEEDPR + 'sport/' + FEEDSU), + (u'Svet' , FEEDPR + 'svet/' + FEEDSU), + (u'Porodica' , FEEDPR + 'porodica/' + FEEDSU), + (u'Vremeplov', FEEDPR + 'vremeplov/' + FEEDSU), + (u'IT' , FEEDPR + 'it/' + FEEDSU), + (u'Republika Srpska', FEEDPR + 'republika-srpska/' + FEEDSU), + (u'Crna Gora', FEEDPR + 'crna-gora/' + FEEDSU), + (u'EX YU' , FEEDPR + 'eks-ju/' + FEEDSU), + (u'Dijaspora', FEEDPR + 'dijaspora/' + FEEDSU), + (u'Kolumna' , FEEDPR + 'kolumna/' + FEEDSU), + (u'Afere' , FEEDPR + 'afere/' + FEEDSU), + (u'Feljton' , FEEDPR + 'feljton/' + FEEDSU), + (u'Intervju' , FEEDPR + 'intervju/' + FEEDSU), + (u'Reportaža', FEEDPR + 'reportaza/' + FEEDSU), + (u'Zanimljivosti', FEEDPR + 'zanimljivosti/' + FEEDSU), + (u'Sa trga' , FEEDPR + 'sa-trga/' + FEEDSU) + ] + + def print_version(self, url): + return url + self.LANGLAT + + def preprocess_raw_html(self, raw, url): + return 
'title'+raw[raw.find(''):] + \ No newline at end of file diff --git a/recipes/revista_cromos.recipe b/recipes/revista_cromos.recipe new file mode 100644 index 0000000000..29515971dd --- /dev/null +++ b/recipes/revista_cromos.recipe @@ -0,0 +1,33 @@ +# coding=utf-8 +# https://github.com/iemejia/calibrecolombia + +''' +http://www.cromos.com.co/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ElMalpensante(BasicNewsRecipe): + title = u'Revista Cromos' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.cromos.com.co/sites/cromos.com.co/themes/cromos_theme/images/logo_morado.gif' + description = 'Revista Cromos' + oldest_article = 7 + simultaneous_downloads = 20 + #tags = 'news, sport, blog' + use_embedded_content = True + remove_empty_feeds = True + max_articles_per_feed = 100 + feeds = [(u'Cromos', u'http://www.cromos.com.co/rss.xml'), + (u'Moda', u'http://www.cromos.com.co/moda/feed'), + (u'Estilo de Vida', u'http://www.cromos.com.co/estilo-de-vida/feed'), + (u'Cuidado Personal', u'http://www.cromos.com.co/estilo-de-vida/cuidado-personal/feed'), + (u'Salud y Alimentación', u'http://www.cromos.com.co/estilo-de-vida/salud-y-alimentacion/feed'), + (u'Personajes', u'http://www.cromos.com.co/personajes/feed'), + (u'Actualidad', u'http://www.cromos.com.co/personajes/actualidad/feed'), + (u'Espectáculo', u'http://www.cromos.com.co/personajes/espectaculo/feed'), + (u'Reportajes', u'http://www.cromos.com.co/reportajes/feed'), + (u'Eventos', u'http://www.cromos.com.co/eventos/feed'), + (u'Modelos', u'http://www.cromos.com.co/modelos/feed'), + ] diff --git a/recipes/science_news.recipe b/recipes/science_news.recipe index fa24bbadcf..53b451030a 100644 --- a/recipes/science_news.recipe +++ b/recipes/science_news.recipe @@ -1,24 +1,38 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' ''' sciencenews.org ''' from calibre.web.feeds.news import BasicNewsRecipe -class Sciencenews(BasicNewsRecipe): - 
title = u'ScienceNews' - __author__ = u'Darko Miletic and Sujata Raman' - description = u"Science News is an award-winning weekly newsmagazine covering the most important research in all fields of science. Its 16 pages each week are packed with short, accurate articles that appeal to both general readers and scientists. Published since 1922, the magazine now reaches about 150,000 subscribers and more than 1 million readers. These are the latest News Items from Science News." +class ScienceNewsIssue(BasicNewsRecipe): + title = u'Science News Recent Issues' + __author__ = u'Darko Miletic, Sujata Raman and Starson17' + description = u'''Science News is an award-winning weekly + newsmagazine covering the most important research in all fields of science. + Its 16 pages each week are packed with short, accurate articles that appeal + to both general readers and scientists. Published since 1922, the magazine + now reaches about 150,000 subscribers and more than 1 million readers. + These are the latest News Items from Science News. 
This recipe downloads + the last 30 days worth of articles.''' + category = u'Science, Technology, News' + publisher = u'Society for Science & the Public' oldest_article = 30 language = 'en' - max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - auto_cleanup = True timefmt = ' [%A, %d %B, %Y]' + recursions = 1 + remove_attributes = ['style'] + + conversion_options = {'linearize_tables' : True + , 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } extra_css = ''' .content_description{font-family:georgia ;font-size:x-large; color:#646464 ; font-weight:bold;} @@ -27,36 +41,33 @@ class Sciencenews(BasicNewsRecipe): .content_edition{font-family:helvetica,arial ;font-size: xx-small ;} .exclusive{color:#FF0000 ;} .anonymous{color:#14487E ;} - .content_content{font-family:helvetica,arial ;font-size: x-small ; color:#000000;} - .description{color:#585858;font-family:helvetica,arial ;font-size: xx-small ;} + .content_content{font-family:helvetica,arial ;font-size: medium ; color:#000000;} + .description{color:#585858;font-family:helvetica,arial ;font-size: large ;} .credit{color:#A6A6A6;font-family:helvetica,arial ;font-size: xx-small ;} ''' - #keep_only_tags = [ dict(name='div', attrs={'id':'column_action'}) ] - #remove_tags_after = dict(name='ul', attrs={'id':'content_functions_bottom'}) - #remove_tags = [ - #dict(name='ul', attrs={'id':'content_functions_bottom'}) - #,dict(name='div', attrs={'id':['content_functions_top','breadcrumb_content']}) - #,dict(name='img', attrs={'class':'icon'}) - #,dict(name='div', attrs={'class': 'embiggen'}) - #] + keep_only_tags = [ dict(name='div', attrs={'class':'content_content'}), + dict(name='ul', attrs={'id':'toc'}) + ] - feeds = [(u"Science News / News Items", u'http://sciencenews.org/index.php/feed/type/news/name/news.rss/view/feed/name/all.rss')] + feeds = [(u"Science News Current Issues", 
u'http://www.sciencenews.org/view/feed/type/edition/name/issues.rss')] + + match_regexps = [ + r'www.sciencenews.org/view/feature/id/', + r'www.sciencenews.org/view/generic/id' + ] def get_cover_url(self): cover_url = None index = 'http://www.sciencenews.org/view/home' soup = self.index_to_soup(index) link_item = soup.find(name = 'img',alt = "issue") - print link_item if link_item: cover_url = 'http://www.sciencenews.org' + link_item['src'] + '.jpg' return cover_url - #def preprocess_html(self, soup): - - #for tag in soup.findAll(name=['span']): - #tag.name = 'div' - - #return soup + def preprocess_html(self, soup): + for tag in soup.findAll(name=['span']): + tag.name = 'div' + return soup diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe index f2981ca667..6c66231da7 100644 --- a/recipes/scmp.recipe +++ b/recipes/scmp.recipe @@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic ' scmp.com ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class SCMP(BasicNewsRecipe): @@ -18,10 +17,11 @@ class SCMP(BasicNewsRecipe): max_articles_per_feed = 200 no_stylesheets = True encoding = 'utf-8' + auto_cleanup = True use_embedded_content = False language = 'en_CN' remove_empty_feeds = True - needs_subscription = True + needs_subscription = 'optional' publication_type = 'newspaper' masthead_url = 'http://www.scmp.com/images/logo_scmp_home.gif' extra_css = ' body{font-family: Arial,Helvetica,sans-serif } ' @@ -46,17 +46,17 @@ class SCMP(BasicNewsRecipe): br.submit() return br - remove_attributes=['width','height','border'] + #remove_attributes=['width','height','border'] - keep_only_tags = [ - dict(attrs={'id':['ART','photoBox']}) - ,dict(attrs={'class':['article_label','article_byline','article_body']}) - ] + #keep_only_tags = [ + #dict(attrs={'id':['ART','photoBox']}) + #,dict(attrs={'class':['article_label','article_byline','article_body']}) + #] - preprocess_regexps = [ - (re.compile(r'

).)*', re.DOTALL|re.IGNORECASE), - lambda match: ''), - ] + #preprocess_regexps = [ + #(re.compile(r'

).)*', re.DOTALL|re.IGNORECASE), + #lambda match: ''), + #] feeds = [ (u'Business' , u'http://www.scmp.com/rss/business.xml' ) @@ -68,13 +68,13 @@ class SCMP(BasicNewsRecipe): ,(u'Sport' , u'http://www.scmp.com/rss/sport.xml' ) ] - def print_version(self, url): - rpart, sep, rest = url.rpartition('&') - return rpart #+ sep + urllib.quote_plus(rest) + #def print_version(self, url): + #rpart, sep, rest = url.rpartition('&') + #return rpart #+ sep + urllib.quote_plus(rest) - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - items = soup.findAll(src="/images/label_icon.gif") - [item.extract() for item in items] - return self.adeify_images(soup) + #def preprocess_html(self, soup): + #for item in soup.findAll(style=True): + #del item['style'] + #items = soup.findAll(src="/images/label_icon.gif") + #[item.extract() for item in items] + #return self.adeify_images(soup) diff --git a/recipes/seattle_times.recipe b/recipes/seattle_times.recipe index 631dfa58f1..9ae40d1f20 100644 --- a/recipes/seattle_times.recipe +++ b/recipes/seattle_times.recipe @@ -23,6 +23,7 @@ class SeattleTimes(BasicNewsRecipe): language = 'en' auto_cleanup = True auto_cleanup_keep = '//div[@id="PhotoContainer"]' + cover_url = 'http://seattletimes.com/PDF/frontpage.pdf' feeds = [ (u'Top Stories', diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..b593d6b837 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', 
attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/thestar.recipe b/recipes/thestar.recipe index f667b86472..59c3b43c6b 100644 --- a/recipes/thestar.recipe +++ b/recipes/thestar.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2013, Darko Miletic ' ''' www.thestar.com ''' @@ -11,18 +9,17 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheTorontoStar(BasicNewsRecipe): title = 'The Toronto Star' __author__ = 'Darko Miletic' - description = "Canada's largest daily newspaper" + description = "Thestar.com is Canada's largest online news site. Stay current with our sports, business entertainment news and more from the Toronto Star and thestar.com" oldest_article = 2 language = 'en_CA' max_articles_per_feed = 100 no_stylesheets = True - #auto_cleanup = True - #auto_cleanup_keep = '//div[@class="topsContent topsContentActive"]' use_embedded_content = False delay = 2 publisher = 'The Toronto Star' category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson" encoding = 'utf-8' + masthead_url = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png' conversion_options = { 'comments' : description @@ -30,23 +27,18 @@ class TheTorontoStar(BasicNewsRecipe): ,'publisher' : publisher } - #keep_only_tags = [dict(name='div', attrs={'class':'ts-article'})] - #remove_tags_before = dict(name='div',attrs={'id':'ts-article_header'}) + 
remove_tags_before = dict(name='div',attrs={'class':'article-headline'}) feeds = [ - (u'News' , u'http://www.thestar.com/rss/?categories=293' ) - ,(u'Opinion' , u'http://www.thestar.com/rss/?categories=303' ) - ,(u'Business' , u'http://www.thestar.com/rss/?categories=294' ) - ,(u'Sports' , u'http://www.thestar.com/rss/?categories=295' ) - ,(u'Entertainment', u'http://www.toronto.com/rss?categories=6298' ) - ,(u'Living' , u'http://www.thestar.com/rss/?categories=297' ) - ,(u'Travel' , u'http://www.thestar.com/rss/list/1042246?' ) - ,(u'Science' , u'http://www.thestar.com/rss?categories=6481') + (u'News' , u'http://www.thestar.com/feeds.articles.news.rss' ) + ,(u'Opinion' , u'http://www.thestar.com/feeds.articles.opinion.rss' ) + ,(u'Business' , u'http://www.thestar.com/feeds.articles.business.rss' ) + ,(u'Sports' , u'http://www.thestar.com/feeds.articles.sports.rss' ) + ,(u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss' ) + ,(u'Living' , u'http://www.thestar.com/feeds.articles.life.rss' ) + ,(u'Travel' , u'http://www.thestar.com/feeds.articles.life.travel.rss' ) + ,(u'Technology' , u'http://www.thestar.com/feeds.articles.life.technology.rss') ] def print_version(self, url): - artl = url.rpartition('--')[0] - artid = artl.rpartition('/')[2] - return 'http://www.thestar.com/printarticle/' + artid - - + return url.replace('.html', '.print.html') diff --git a/recipes/ubuntu_pomoc_org.recipe b/recipes/ubuntu_pomoc_org.recipe new file mode 100644 index 0000000000..1a78649dfc --- /dev/null +++ b/recipes/ubuntu_pomoc_org.recipe @@ -0,0 +1,22 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class UbuntuPomoc(BasicNewsRecipe): + title = u'Ubuntu-pomoc.org' + __author__ = 'fenuks' + description = u'Strona poświęcona systemowi Ubuntu Linux. Znajdziesz tutaj przydatne i sprawdzone poradniki oraz sposoby rozwiązywania wielu popularnych problemów. Ten blog rozwiąże każdy Twój problem - jeśli nie teraz, to wkrótce! 
:)' + category = 'Linux, Ubuntu, open source' + language = 'pl' + cover_url = 'http://www.ubuntu-pomoc.org/grafika/ubuntupomoc.png' + preprocess_regexps = [(re.compile(r'

.+', re.IGNORECASE|re.DOTALL), lambda m: '')] + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'post'})] + remove_tags_after = dict(attrs={'class':'underEntry'}) + remove_tags = [dict(attrs={'class':['underPostTitle', 'yarpp-related', 'underEntry', 'social', 'tags', 'commentlist', 'youtube_sc']}), dict(id=['wp_rp_first', 'commentReply'])] + feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'), + (u'Gry', u'http://feeds.feedburner.com/GryUbuntu-pomoc')] diff --git a/recipes/unperiodico.recipe b/recipes/unperiodico.recipe new file mode 100644 index 0000000000..d4edb4e5dc --- /dev/null +++ b/recipes/unperiodico.recipe @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# https://github.com/iemejia/calibrecolombia + +''' +http://www.unperiodico.unal.edu.co/ +''' + +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class UNPeriodico(BasicNewsRecipe): + title = u'UN Periodico' + language = 'es_CO' + __author__ = 'Ismael Mejia ' + cover_url = 'http://www.unperiodico.unal.edu.co/fileadmin/templates/periodico/img/logoperiodico.png' + description = 'UN Periodico' + oldest_article = 30 + max_articles_per_feed = 100 + publication_type = 'newspaper' + feeds = [ + (u'UNPeriodico', u'http://www.unperiodico.unal.edu.co/rss/type/rss2/') + ] diff --git a/recipes/vic_times.recipe b/recipes/vic_times.recipe index 391cf5eff4..48fb9038aa 100644 --- a/recipes/vic_times.recipe +++ b/recipes/vic_times.recipe @@ -1,105 +1,46 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup 
-class CanWestPaper(BasicNewsRecipe): +class TimesColonist(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist title = u'Victoria Times Colonist' url_prefix = 'http://www.timescolonist.com' description = u'News from Victoria, BC' fp_tag = 'CAN_TC' - # un-comment the following four lines for the Vancouver Province -## title = u'Vancouver Province' -## url_prefix = 'http://www.theprovince.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' - - # un-comment the following four lines for the Vancouver Sun -## title = u'Vancouver Sun' -## url_prefix = 'http://www.vancouversun.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VS' - - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald -## title = u'Calgary Herald' -## url_prefix = 'http://www.calgaryherald.com' -## description = u'News from Calgary, AB' -## fp_tag = 'CAN_CH' - - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' - - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen -## title = u'Ottawa Citizen' -## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' -## fp_tag = 'CAN_OC' - - # un-comment the following four 
lines for the Montreal Gazette -## title = u'Montreal Gazette' -## url_prefix = 'http://www.montrealgazette.com' -## description = u'News from Montreal, QC' -## fp_tag = 'CAN_MG' - - + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' - .timestamp { font-size:xx-small; display: block; } - #storyheader { font-size: medium; } - #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } - .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + .byline { font-size:xx-small; font-weight: bold;} + h3 { margin-bottom: 6px; } + .caption { font-size: xx-small; font-style: italic; font-weight: normal; } + ''' + keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})] remove_tags = [{'class':'comments'}, - dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), - dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), - dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), - dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), - dict(name='div', attrs={'class':'rule_grey_solid'}), - dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + {'id':'photocredit'}, + dict(name='div', attrs={'class':re.compile('top.controls')}), + dict(name='div', attrs={'class':re.compile('social')}), + dict(name='div', attrs={'class':re.compile('tools')}), + dict(name='div', attrs={'class':re.compile('bottom.tools')}), + dict(name='div', attrs={'class':re.compile('window')}), + dict(name='div', 
attrs={'class':re.compile('related.news.element')})] + def get_cover_url(self): from datetime import timedelta, date - if self.fp_tag=='': - return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser(self) daysback=1 @@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +119,107 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + def preprocess_html(self,soup): + byline = soup.find('p',attrs={'class':re.compile('ancillary')}) + if byline is not None: + byline.find('a') + authstr = self.tag_to_string(byline,False) + authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE) + authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE) + newdiv = Tag(soup,'div') + newdiv.insert(0,authstr) + newdiv['class']='byline' + byline.replaceWith(newdiv) + for caption in soup.findAll('p',attrs={'class':re.compile('caption')}): + capstr = self.tag_to_string(caption,False) + capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE) + newdiv = Tag(soup,'div') + newdiv.insert(0,capstr) + newdiv['class']='caption' + caption.replaceWith(newdiv) + for ptag in soup.findAll('p'): + ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True) + ptext = re.sub(r'\s+','', ptext) + if (ptext=='') or (ptext==' '): + ptag.extract() return self.strip_anchors(soup) + raeside = False + def 
handle_articles(self,htag,article_list,sectitle): + atag = htag.a + if atag is not None: + url = atag['href'] + #print("Checking "+url) + if atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + if 'RAESIDE' in title.upper(): + if self.raeside: + return + self.raeside = True + dtag = htag.findNext('p') + description='' + if dtag is not None: + description = self.tag_to_string(dtag,False) + article_list.append(dict(title=title,url=url,date='',description=description,author='',content='')) + #print(sectitle+title+": description = "+description+" URL="+url) + def add_section_index(self,ans,securl,sectitle): + print("Add section url="+self.url_prefix+'/'+securl) + try: + soup = self.index_to_soup(self.url_prefix+'/'+securl) + except: + return ans + mainsoup = soup.find('div',attrs={'class':re.compile('main.content')}) + article_list = [] + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}): + for htag in wdiv.findAll('h3'): + self.handle_articles(htag,article_list,sectitle) + for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}): + for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}): + for htag in wdiv.findAll('h2'): + self.handle_articles(htag,article_list,sectitle) + ans.append((sectitle,article_list)) + return ans def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') - - articles = {} - key = 'News' - ans = ['News'] - - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - 
if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + ans = [] + ans = self.add_section_index(ans,'','Web Front Page') + ans = self.add_section_index(ans,'news/','News Headlines') + ans = self.add_section_index(ans,'news/b-c/','BC News') + ans = self.add_section_index(ans,'news/national/','Natioanl News') + ans = self.add_section_index(ans,'news/world/','World News') + ans = self.add_section_index(ans,'opinion/','Opinion') + ans = self.add_section_index(ans,'opinion/letters/','Letters') + ans = self.add_section_index(ans,'business/','Business') + ans = self.add_section_index(ans,'business/money/','Money') + ans = self.add_section_index(ans,'business/technology/','Technology') + ans = self.add_section_index(ans,'business/working/','Working') + ans = self.add_section_index(ans,'sports/','Sports') + ans = self.add_section_index(ans,'sports/hockey/','Hockey') + ans = self.add_section_index(ans,'sports/football/','Football') + ans = self.add_section_index(ans,'sports/basketball/','Basketball') + ans = 
self.add_section_index(ans,'sports/golf/','Golf') + ans = self.add_section_index(ans,'entertainment/','entertainment') + ans = self.add_section_index(ans,'entertainment/go/','Go!') + ans = self.add_section_index(ans,'entertainment/music/','Music') + ans = self.add_section_index(ans,'entertainment/books/','Books') + ans = self.add_section_index(ans,'entertainment/Movies/','movies') + ans = self.add_section_index(ans,'entertainment/television/','Television') + ans = self.add_section_index(ans,'life/','Life') + ans = self.add_section_index(ans,'life/health/','Health') + ans = self.add_section_index(ans,'life/travel/','Travel') + ans = self.add_section_index(ans,'life/driving/','Driving') + ans = self.add_section_index(ans,'life/homes/','Homes') + ans = self.add_section_index(ans,'life/food-drink/','Food & Drink') return ans + diff --git a/recipes/vice_magazine_de.recipe b/recipes/vice_magazine_de.recipe new file mode 100644 index 0000000000..c3e1aa8f7d --- /dev/null +++ b/recipes/vice_magazine_de.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ViceDERecipe(BasicNewsRecipe): + title = u'Vice Magazin Deutschland' + __author__ = 'atordo;alex' + description = u'Die offizielle Website des Vice Magazins Deutschland' + category = u'Nachrichten, Fotografie, Blogs, Mode, Kunst, Film, Musik, Literatur, Technik' + cover_url = 'http://www.seeklogo.com/images/V/Vice-logo-668578AC94-seeklogo.com.gif' + oldest_article = 14 + max_articles_per_feed = 100 + auto_cleanup = False + no_stylesheets = True + language = 'de' + use_embedded_content = False + remove_javascript = True + publication_type = 'magazine' + + recursions=10 + match_regexps = [r'/read/.*\?Contentpage=[2-9]$'] + + keep_only_tags = [ + dict(attrs={'class':['article_title','article_content','next']}) + ] + remove_tags = [ + dict(attrs={'class':['social_buttons','search','tweet','like','inline_socials' + 
,'stumblebadge','plusone']}) + ] + + extra_css = ''' + .author{font-size:small} + img{margin-bottom: 0.4em; display:block; margin-left:auto; margin-right: auto} + ''' + + preprocess_regexps = [ + (re.compile(r'\\<\/table\>'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\'), lambda match: ''), (re.compile(r'\
'), lambda match: ''), - (re.compile(r'\