diff --git a/Changelog.yaml b/Changelog.yaml index 129af0afd5..ae7802da1b 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -5,7 +5,7 @@ # Also, each release can have new and improved recipes. # - version: ?.?.? -# date: 2012-??-?? +# date: 2013-??-?? # # new features: # - title: @@ -19,6 +19,596 @@ # new recipes: # - title: +- version: 0.9.19 + date: 2013-02-15 + + new features: + - title: "New tool: \"Polish books\" that allows you to perform various automated cleanup actions on EPUB and AZW3 files without doing a full conversion." + type: major + description: "Polishing books is all about putting the shine of perfection on your ebook files. You can use it to subset embedded fonts, update the metadata in the book files from the metadata in the calibre library, manipulate the book jacket, etc. More features will be added in the future. To use this tool, go to Preferences->Toolbar and add the Polish books tool to the main toolbar. Then simply select the books you want to be polished and click the Polish books button. Polishing, unlike conversion, does not change the internal structure/markup of your book, it performs only the minimal set of actions needed to achieve its goals. Note that polish books is a completely new codebase, so there may well be bugs, polishing a book backs up the original as ORIGINAL_EPUB or ORIGINAL_AZW3, unless you have turned off this feature in Preferences->Tweaks, in which case you should backup your files manually. You can also use this tool from the command line with ebook-polish.exe." + + - title: "Driver for the Trekstor Pyrus Mini." + tickets: [1124120] + + - title: "E-book viewer: Add an option to change the minimum font size." + tickets: [1122333] + + - title: "PDF Output: Add support for converting documents with math typesetting, as described here: http://manual.calibre-ebook.com/typesetting_math.html" + + - title: "Column coloring/icons: Add more conditions when using date based columns with reference to 'today'." + + bug fixes: + - title: "Transforming to titlecase - handle typographic hyphens in all caps phrases" + + - title: "Dont ignore file open events that occur before the GUI is initialized on OS X" + tickets: [1122713] + + - title: "News download: Handle feeds that have entries with empty ids" + + - title: "Fix a regression that broke using the template editor" + + - title: "Do not block startup while scanning the computer for available network interfaces. Speeds up startup time on some windows computers with lots of spurious network interfaces." + + improved recipes: + - New Yorker + - Kommersant + - Le Monde (Subscription version) + - NZ Herald + + new recipes: + - title: Navegalo + author: Douglas Delgado + + - title: El Guardian and More Intelligent Life + author: Darko Miletic + +- version: 0.9.18 + date: 2013-02-08 + + new features: + - title: "New metadata source: Edelweiss, a catalog of books that is updated directly by publishers. To enable it, go to Preferences->Metadata download and enable the Edelweiss plugin." + tickets: [1091073] + + - title: "Add an option to add extra spacing between rows in the book list. (Preferences->Look & Feel)" + tickets: [1117907] + + - title: "Column coloring/icons: Add a 'days ago' condition, useable with columns that store dates to set colors/icons based on the number of days before today" + + - title: "E-book viewer: Add shortcuts Ctrl+= and Ctrl+- to increase/decrease text size." + tickets: [ 1117524 ] + + - title: "When showing possible duplicates after adding books, also show the file formats." 
+ + - title: "Driver for Trekstor Ventos Tablet" + + bug fixes: + - title: "Conversion: When transliterating unicode characters, handle « and » correctly." + tickets: [1117270] + + - title: "Fix adding books from multiple directories with multiple books per directory treating opf files as an ebook" + + - title: "Fix download metadata window not resizable on smaller screens" + tickets: [1116849] + + - title: "Tweak Book: When rebuilding azw3 files handle tags that have name but not id attribute, these are apparently produced by kindlegen." + tickets: [ 1112934 ] + + - title: "Fix regression in advanced column color rules." + tickets: [1118678] + + improved recipes: + - El Mundo today + - fluter.de + - Birmingham Post + - Japan Times + - The Toronto Star + - Le Monde (subscription version) + - Globe and Mail + + new recipes: + - title: VICE Magazine Deutschland + author: Alex + + - title: Libertad Digital + author: Darko Miletic + +- version: 0.9.17 + date: 2013-02-01 + + new features: + - title: "Allow adding user specified icons to the main book list for books whose metadata matches specific criteria. Go to Preferences->Look & Feel->Column icons to setup these icons. They work in the same way as the column coloring rules." + type: major + + - title: "Allow choosing which page of a PDF to use as the cover." + description: "To access this functionality add the PDF to calibre then click the edit metadata button. In the top right area of the edit metadata dialog there is a button to get the cover from the ebook file, this will now allow you to choose which page (from the first ten pages) of the pdf to use as the cover." + tickets: [1110019] + + - title: "Add option to turn off reflections in the cover browser (Preferences->Look & Feel->Cover Browser)" + + - title: "PDF Output: Add an option to add page numbers to the bottom of every page in the generated PDF file (look in the PDF Output section of the conversion dialog)" + + - title: "Add the full item name to the tool tip of a leaf item displayed in the tag browser." + tickets: [1106231] + + bug fixes: + - title: "Fix out-of-bounds data causing errors in the Tag Browser" + tickets: [1108017] + + - title: "Conversion: Handle input documents that use multiple prefixes referring to the XHTML namespace correctly." + tickets: [1107220] + + - title: "PDF Output: Fix regression that caused some svg images to be rendered as black rectangles." + tickets: [1105294] + + - title: "Metadata download: Only normalize title case if the result has no language set or its language is English" + + improved recipes: + - Baltimore Sun + - Harvard Business Review + - Victoria Times + - South China Morning Post + - Volksrant + - Seattle Times + + new recipes: + - title: Dob NeviNosti + author: Darko Miletic + + - title: La Nacion (CR) + author: Douglas Delgado + +- version: 0.9.16 + date: 2013-01-25 + + new features: + - title: "News download: Add support for logging in to sites that require javascript for their logins." + tickets: [1101809] + + - title: "News download: Do not convert all downloaded images to JPG format. This fixes the problem of PNG images with transparent backgrounds being rendered with black backgrounds" + + - title: "CHM Input: Support hierarchical table of contents. Do not generate an inline table of contents when a metadata table of contents is present. 
Also correctly decode the text in the table of contents" + + - title: "Get Books: Add the beam-ebooks.de store" + + - title: "Make custom yes/no columns using icons put text values under the icons." + + - title: "Driver for LG E400 and SayCoolA710" + tickets: [1103741,1104528] + + - title: "Speed up device connection when there are lots of books on the device by not generating cover thumbnails unless they are actually needed." + + - title: "Have the metadata download dialog remember its last used size." + tickets: [1101150] + + bug fixes: + - title: "Fix deleting a custom column that was used in a column coloring rule makes the column coloring preferences panel unusable" + tickets: [1103504] + + - title: "Store caches outside the config directory for non-portable calibre installs" + + - title: "PDF Output: Dont crash if the user has a font on his system that is missing the OS/2 table" + tickets: [1102403] + + - title: "Conversion: Do not error out because of an error in user supplied search replace rules." + tickets: [1102647] + + - title: "Conversion: Replace all non-ascii characters in CSS class names, as they cause problems with some broken EPUB renderers." + tickets: [1102587] + + - title: "Do not choke when reading metadata from MOBI files with incorrectly encoded metadata fields" + + - title: "Conversion: Preserve ToC entries that point nowhere instead of causing them to point to a non-existent file" + + - title: "E-book viewer: Allow entries in the Table of Contents that do not point anywhere, instead of just ignoring them." + + - title: "Content server: Fix the 'Previous' link in the mobile version of the content server webpage skipping an entry" + tickets: [1101124] + + improved recipes: + - TSN + - St. Louis Post Dispatch + - Metro UK + - Michelle Malkin + - Barrons + + new recipes: + - title: Contemporary Argentine Writers + author: Darko Miletic + +- version: 0.9.15 + date: 2013-01-18 + + new features: + - title: "Linux MTP driver: Detect devices that have MTP interfaces even if their USB ids are not known" + + - title: "Content server: Allow picking a random book by clicking the 'Random book' link on the start page. You can also refresh the random book page to get a new random book" + + - title: "E-book viewer: Add an option to hide the toolbars in the viewer window (Preferences->Miscellaneous->Show controls in the viewr preferences). You can unhide them by right clicking in the viewer window." + + - title: "Kobo driver: Speedup initial connect by avoiding unnecessary update of series metadata in some situations." + tickets: [1099190] + + - title: "Get Books: Allow the store plugins to be dynamically loaded so that future website changes of a store dont require a calibre update to fix Get Books." + + - title: "Wireless driver: Always replace file when resending a previously sent book to the device, even if the title/author have changed." + + - title: "Add PocketBook Pro 912 driver." + tickets: [1099571] + + - title: "When creating/exporting epub and mobi files, add the calibre book identifier as a special field in the book's metadata. This allows third party tools to identify the book record in calibre to which the file belongs." 
+ + - title: "Wireless driver: Add support for using the book uuid as the filename" + + - title: "Remove the experimental tag from the subset fonts feature, since there has been only one reported problem (now fixed) with it in the two months since it was released" + + bug fixes: + - title: "Get Books: Update the amazon, waterstones and libri.de plugins to account for website changes" + + - title: "MOBI Input: Do not choke on MOBI files with incorrectly encoded titles." + tickets: [1100601] + + - title: "Font subsetting: Fix a bug in the parsing of the GSUB table that could cause some ligatures to not be included in the subset font" + + - title: "E-book-viewer: Fix TOC links without anchors not scrolling to the top of the current flow" + + - title: "LIT Input: Handle lit files that set an incorrect XML mimetype for their text." + tickets: [1099621] + + - title: "Catalogs: Fix 'X' being droppen from isbns on export" + tickets: [1098325] + + - title: "Fix an error when editing date in the main book list and all visible dates are blank." + tickets: [1098675] + + - title: "Fix calibre-smtp using incorrect escaping for non-ascii attachment filenames" + tickets: [1098478] + + - title: "Conversion: When subsetting fonts, handle multiple @font-face rules referring to the same physical font" + + - title: "Content server: Update metadata when serving azw3 files" + + - title: "CHM Input: Handle chm files that contain files with url unsafe filenames." + tickets: [1100610] + + - title: "Content server: Fix custom icons for top level categories incorrect." + tickets: [1095016] + + - title: "Kobo driver: When resending a file to the device, update the filesize in the Kobo db to prevent the device from deleting the file." + tickets: [1100607] + + improved recipes: + - The Chronicle of Higher Education + - Smithsonian Magazine + - Philosophy Now + - The Economist + - Business Week Magazine + + new recipes: + - title: Asco de Vida + author: Krittika Goyal + + - title: Schattenblick + author: ThB + +- version: 0.9.14 + date: 2013-01-11 + + new features: + - title: "When adding multiple books and duplicates are found, allow the user to select which of the duplicate books will be added anyway." + tickets: [1095256] + + - title: "Device drivers for Kobo Arc on linux, Polaroid Android tablet" + tickets: [1098049] + + - title: "When sorting by series, use the language of the book to decide what leading articles to remove, just as is done for sorting by title" + + bug fixes: + - title: "PDF Output: Do not error out when the input document contains links with anchors not present in the document." + tickets: [1096428] + + - title: "Add support for upgraded db on newest Kobo firmware" + tickets: [1095617] + + - title: "PDF Output: Fix typo that broke use of custom paper sizes." + tickets: [1097563] + + - title: "PDF Output: Handle empty anchors present at the end of a page" + + - title: "PDF Output: Fix side margins of last page in a flow being incorrect when large side margins are used." + tickets: [1096290] + + - title: "Edit metadata dialog: Allow setting the series number for custom series type columns to zero" + + - title: "When bulk editing custom series-type columns and not provding a series number use 1 as the default, instead of None" + + - title: "Catalogs: Fix issue with catalog generation using Hungarian UI and author_sort beginning with multiple letter groups." + tickets: [1091581] + + - title: "PDF Output: Dont error out on files that have invalid font-family declarations." 
+ tickets: [1096279] + + - title: "Do not load QRawFont at global level, to allow calibre installation on systems with missing dependencies" + tickets: [1096170] + + - title: "PDF Output: Fix cover not present in generated PDF files" + tickets: [1096098] + + improved recipes: + - Sueddeutsche Zeitung mobil + - Boerse Online + - TidBits + - New York Review of Books + - Fleshbot + - Il Messaggero + - Libero + + new recipes: + - title: Spectator Magazine, Oxford Mail and Outside Magazine + author: Krittika Goyal + + - title: Libartes + author: Darko Miletic + + - title: El Diplo + author: Tomas De Domenico + +- version: 0.9.13 + date: 2013-01-04 + + new features: + - title: "Complete rewrite of the PDF Output engine, to support links and fix various bugs" + type: major + description: "calibre now has a new PDF output engine that supports links in the text. It also fixes various bugs, detailed below. In order to implement support for links and fix these bugs, the engine had to be completely rewritten, so there may be some regressions." + + - title: "Show disabled device plugins in Preferences->Ignored Devices" + + - title: "Get Books: Fix Smashwords, Google books and B&N stores. Add Nook UK store" + + - title: "Allow series numbers lower than -100 for custom series columns." + tickets: [1094475] + + - title: "Add mass storage driver for rockhip based android smart phones" + tickets: [1087809] + + - title: "Add a clear ratings button to the edit metadata dialog" + + bug fixes: + - title: "PDF Output: Fix custom page sizes not working on OS X" + + - title: "PDF Output: Fix embedding of many fonts not supported (note that embedding of OpenType fonts with Postscript outlines is still not supported on windows, though it is supported on other operating systems)" + + - title: "PDF Output: Fix crashes converting some books to PDF on OS X" + tickets: [1087688] + + - title: "HTML Input: Handle entities inside href attributes when following the links in an HTML file." + tickets: [1094203] + + - title: "Content server: Fix custom icons not used for sub categories" + tickets: [1095016] + + - title: "Force use of non-unicode constants in compiled templates. Fixes a problem with regular expression character classes and probably other things." + + - title: "Kobo driver: Do not error out if there are invalid dates in the device database" + tickets: [1094597] + + - title: "Content server: Fix for non-unicode hostnames when using mDNS" + tickets: [1094063] + + improved recipes: + - Today's Zaman + - The Economist + - Foreign Affairs + - New York Times + - Alternet + - Harper's Magazine + - La Stampa + +- version: 0.9.12 + date: 2012-12-28 + + new features: + - title: "Drivers for Kibano e-reader and Slick ER-700-2" + tickets: [1093570, 1093732] + + - title: "Add support for downloading metadata from Amazon Brazil." + tickets: [1092594] + + - title: "Copy to library: Allow specifying the destination library by path." + tickets: [1093231] + + - title: "When adding empty books, allow setting of the series for the new books. Also select the newly added book records after adding." + + - title: "PDF Output: Add a checkbox to override the page size defined by the output profile. This allows you to specify a custom page size even if the output profile is not set to default." 
+ + - title: "Add usb ids for newer kindle fire to the linux mtp driver" + + bug fixes: + - title: "Linux: Temporarily redirect stdout to get rid of the annoying and pointless message about mtpz during libmtp initialization" + + - title: "Fix multiple 'All column' coloring rules not being applied" + tickets: [1093574] + + - title: "Use custom icons in the content server as well." + tickets: [1092098] + + improved recipes: + - La Voce + - Harpers Magazine (printed edition) + - Pajamas Media + - NSFW corp + - The Hindu + - Nikkei News + + new recipes: + - title: Various Ukranian news sources + author: rpalyvoda + +- version: 0.9.11 + date: 2012-12-21 + + new features: + - title: "Merry Christmas and Happy Holidays to all ☺" + + - title: "When connecting to MTP devices such as the Kindle Fire HD or the Nook HD, speed up the process by ignoring some folders." + description: "calibre will now ignore folders for music, video, pictures, etc. when scanning the device. This can substantially speed up the connection process if you have thousands of non-ebook files on the device. The list of folders to be ignored can be customized by right clicking on the device icon in calibre and selecting 'Configure this device'." + + - title: "Allow changing the icons for categories in the Tag Browser. Right click on a category and choose 'Change category icon'." + tickets: [1092098] + + - title: "Allow setting the color of all columns with a single rule in Preferences->Look & Feel->Column Coloring" + + - title: "MOBI: When reading metadata from mobi files, put the contents of the ASIN field into an identifier named mobi-asin. Note that this value is not used when downloading metadata as it is not possible to know which (country specific) amazon website the ASIN comes from." + tickets: [1090394] + + bug fixes: + - title: "Windows build: Fix a regression in 0.9.9 that caused calibre to not start on some windows system that were missing the VC.90 dlls (some older XP systems)" + + - title: "Kobo driver: Workaround for invalid shelves created by bugs in the Kobo server" + tickets: [1091932] + + - title: "Metadata download: Fix cover downloading from non-US amazon sites broken by a website change." + tickets: [1090765] + + improved recipes: + - Le Devoir + - Nin online + - countryfile + - Birmingham Post + - The Independent + - Various Polish news sources + + new recipes: + - title: MobileBulgaria + author: Martin Tsanchev + + - title: Various Polish news sources + author: fenuks + +- version: 0.9.10 + date: 2012-12-14 + + new features: + - title: "Drivers for Nextbook Premium 8 se, HTC Desire X and Emerson EM 543" + tickets: [1088149, 1088112, 1087978] + + bug fixes: + - title: "Fix rich text delegate not working with Qt compiled in debug mode." 
+ tickets: [1089011] + + - title: "When deleting all books in the library, blank the book details panel" + + - title: "Conversion: Fix malformed values in the bgcolor attribute causing conversion to abort" + + - title: "Conversion: Fix heuristics applying incorrect style in some circumstances" + tickets: [1066507] + + - title: "Possible fix for 64bit calibre not starting up on some Windows systems" + tickets: [1087816] + + improved recipes: + - Sivil Dusunce + - Anchorage Daily News + - Le Monde + - Harpers + + new recipes: + - title: Titanic + author: Krittika Goyal + +- version: 0.9.9 + date: 2012-12-07 + + new features: + - title: "64 bit build for windows" + type: major + description: "calibre now has a 64 bit version for windows, available at: http://calibre-ebook.com/download_windows64 The 64bit build is not limited to using only 3GB of RAM when converting large/complex documents. It may also be slightly faster for some tasks. You can have both the 32 bit and the 64 bit build installed at the same time, they will use the same libraries, plugins and settings." + + - title: "Content server: Make the identifiers in each books metadata clickable." + tickets: [1085726] + + bug fixes: + - title: "EPUB Input: Fix an infinite loop while trying to recover a damaged EPUB file." + tickets: [1086917] + + - title: "KF8 Input: Fix handling of links in files that link to the obsolete tags instead of tags with an id attribute." + tickets: [1086705] + + - title: "Conversion: Fix a bug in removal of invalid entries from the spine, where not all invalid entries were removed, causing conversion to fail." + tickets: [1086054] + + - title: "KF8 Input: Ignore invalid flow references in the KF8 document instead of erroring out on them." + tickets: [1085306] + + - title: "Fix command line output on linux systems with incorrect LANG/LC_TYPE env vars." + tickets: [1085103] + + - title: "KF8 Input: Fix page breaks specified using the data-AmznPageBreak attribute being ignored by calibre." + + - title: "PDF Output: Fix custom size field not accepting fractional numbers as sizes" + + - title: "Get Books: Update libre.de and publio for website changes" + + - title: "Wireless driver: Increase timeout interval, and when allocating a random port try 9090 first" + + improved recipes: + - New York Times + - Weblogs SL + - Zaman Gazetesi + - Aksiyon Dergisi + - Endgadget + - Metro UK + - Heise Online + +- version: 0.9.8 + date: 2012-11-30 + + new features: + - title: "Add an option to show the cover size in the book details panel on the right. Option is in Preferences->Look & Feel->Book Details" + + - title: "Kobo driver: Add support for firmware 2.2. Also add an option to send series information to the device." + description: "The newest Kobo firmware can display series information. Unfortunately, the Kobo does not read this information from the ebook file itself. It has to be sent separately after the Kobo has finished processing the new files. So you might have to connect - send books - disconnect and then re-connect for the series infor to show up. 
Fixes #1084388 (Add support for series on Kobo devices)" + + - title: "Catalogs: Allow using custom columns as the source for Genres when generating catalogs" + + - title: "When the user asks calibre to convert a book, show a small animation to highlight that the convert job has been queued to run in the background" + + - title: "Add support for the notification center in OS X 10.8" + + - title: "calibredb: Add an option to specify the cover to use when adding books with calibredb add." + tickets: [1083932] + + - title: "EPUB Input: Add support for EPUB files with broken central directory records *and* data descriptors" + + - title: "Comic metadata: Support reading metadata from cbr files. Also read the comments and published date info from the metadata." + tickets: [1082340] + + - title: "Speed up processing of RAR and CBR files by avoiding an extra file copy" + + - title: "Add driver for Nexus 10 on linux." + tickets: [1082563] + + bug fixes: + - title: "KF8 Input: Handle invalid KF8 files with links pointing to non-existent locations and incorrect values in the div table." + tickets: [1082669] + + - title: "Viewer: Fix handling of empty self closing tags." + tickets: [1083278] + + - title: "Fix use of {formats} in save to disk templates. Fix some formatter functions causing plugboards to not validate." + + - title: "Fix calibre quitting when minimized to system tray and an update available message is shown and then closed." + tickets: [1082630] + + - title: "Viewer: Fix vertical margin at the top of the first page of a chapter incorrect in a certain rare circumstance (first child of body being an empty paragraph)." + tickets: [1082640] + + - title: "E-book viewer: Fix bug that caused the default language for hyphenation to be ignored for books that do not specify a language" + + improved recipes: + - Pro Physik + - Aachener Nachrichten + - Science News + - version: 0.9.7 date: 2012-11-23 diff --git a/README b/README index 2ffab4e2f6..a1e3081988 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ -calibre is an e-book library manager. It can view, convert and catalog e-books \ -in most of the major e-book formats. It can also talk to e-book reader \ -devices. It can go out to the internet and fetch metadata for your books. \ -It can download newspapers and convert them into e-books for convenient \ +calibre is an e-book library manager. It can view, convert and catalog e-books +in most of the major e-book formats. It can also talk to e-book reader +devices. It can go out to the internet and fetch metadata for your books. +It can download newspapers and convert them into e-books for convenient reading. It is cross platform, running on Linux, Windows and OS X. 
For screenshots: https://calibre-ebook.com/demo @@ -15,5 +15,5 @@ bzr branch lp:calibre To update your copy of the source code: bzr merge -Tarballs of the source code for each release are now available \ +Tarballs of the source code for each release are now available at http://code.google.com/p/calibre-ebook diff --git a/imgsrc/polish.svg b/imgsrc/polish.svg new file mode 100644 index 0000000000..7affaaf4bd --- /dev/null +++ b/imgsrc/polish.svg @@ -0,0 +1,366 @@ +[366 lines of SVG markup for the new Polish books icon; the embedded metadata credits Ulisse Perusin <uli.peru@gmail.com> and names the icon edit-clear] diff --git a/manual/develop.rst b/manual/develop.rst index b9fba195d3..823a31b5c2 100644 --- a/manual/develop.rst +++ b/manual/develop.rst @@ -39,27 +39,27 @@ All the |app| python code is in the ``calibre`` package. This package contains t * devices - All the device drivers. Just look through some of the built-in drivers to get an idea for how they work. - * For details, see: devices.interface which defines the interface supported by device drivers and devices.usbms which + * For details, see: ``devices.interface`` which defines the interface supported by device drivers and ``devices.usbms`` which defines a generic driver that connects to a USBMS device. All USBMS based drivers in |app| inherit from it. * ebooks - All the ebook conversion/metadata code. A good starting point is ``calibre.ebooks.conversion.cli`` which is the - module powering the :command:`ebook-convert` command. The conversion process is controlled via conversion.plumber. - The format independent code is all in ebooks.oeb and the format dependent code is in ebooks.format_name. + module powering the :command:`ebook-convert` command. The conversion process is controlled via ``conversion.plumber``. + The format independent code is all in ``ebooks.oeb`` and the format dependent code is in ``ebooks.format_name``. - * Metadata reading, writing, and downloading is all in ebooks.metadata + * Metadata reading, writing, and downloading is all in ``ebooks.metadata`` * Conversion happens in a pipeline, for the structure of the pipeline, see :ref:`conversion-introduction`. The pipeline consists of an input - plugin, various transforms and an output plugin. The code constructs - and drives the pipeline is in plumber.py. The pipeline works on a + plugin, various transforms and an output plugin. The code that constructs + and drives the pipeline is in :file:`plumber.py`. The pipeline works on a representation of an ebook that is like an unzipped epub, with manifest, spine, toc, guide, html content, etc. The - class that manages this representation is OEBBook in oeb/base.py. The + class that manages this representation is OEBBook in ``ebooks.oeb.base``. The various transformations that are applied to the book during - conversions live in `oeb/transforms/*.py`. And the input and output - plugins live in `conversion/plugins/*.py`. + conversions live in :file:`oeb/transforms/*.py`. And the input and output + plugins live in :file:`conversion/plugins/*.py`. - * library - The database back-end and the content server. See library.database2 for the interface to the |app| library. library.server is the |app| Content Server. - * gui2 - The Graphical User Interface. GUI initialization happens in gui2.main and gui2.ui. The ebook-viewer is in gui2.viewer. + * library - The database back-end and the content server.
See ``library.database2`` for the interface to the |app| library. ``library.server`` is the |app| Content Server. + * gui2 - The Graphical User Interface. GUI initialization happens in ``gui2.main`` and ``gui2.ui``. The ebook-viewer is in ``gui2.viewer``. If you need help understanding the code, post in the `development forum `_ and you will most likely get help from one of |app|'s many developers. @@ -74,10 +74,6 @@ After installing Bazaar, you can get the |app| source code with the command:: On Windows you will need the complete path name, that will be something like :file:`C:\\Program Files\\Bazaar\\bzr.exe`. -To update a branch to the latest code, use the command:: - - bzr merge - |app| is a very large project with a very long source control history, so the above can take a while (10mins to an hour depending on your internet speed). @@ -88,6 +84,11 @@ using:: bzr branch --stacked lp:calibre + +To update a branch to the latest code, use the command:: + + bzr merge + Submitting your changes to be included ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/manual/faq.rst b/manual/faq.rst index 109aff440d..9b1c862436 100644 --- a/manual/faq.rst +++ b/manual/faq.rst @@ -158,12 +158,23 @@ My device is not being detected by |app|? Follow these steps to find the problem: - * Make sure that you are connecting only a single device to your computer at a time. Do not have another |app| supported device like an iPhone/iPad etc. at the same time. - * If you are connecting an Apple iDevice (iPad, iPod Touch, iPhone), use the 'Connect to iTunes' method in the 'Getting started' instructions in `Calibre + Apple iDevices: Start here `_. - * Make sure you are running the latest version of |app|. The latest version can always be downloaded from `the calibre website `_. - * Ensure your operating system is seeing the device. That is, the device should show up in Windows Explorer (in Windows) or Finder (in OS X). - * In |app|, go to Preferences->Plugins->Device Interface plugin and make sure the plugin for your device is enabled, the plugin icon next to it should be green when it is enabled. - * If all the above steps fail, go to Preferences->Miscellaneous and click debug device detection with your device attached and post the output as a ticket on `the calibre bug tracker `_. + * Make sure that you are connecting only a single device to your computer + at a time. Do not have another |app| supported device like an iPhone/iPad + etc. at the same time. + * If you are connecting an Apple iDevice (iPad, iPod Touch, iPhone), use + the 'Connect to iTunes' method in the 'Getting started' instructions in + `Calibre + Apple iDevices: Start here `_. + * Make sure you are running the latest version of |app|. The latest version + can always be downloaded from `the calibre website `_. + You can tell what version of |app| you are currently running by looking + at the bottom line of the main |app| window. + * Ensure your operating system is seeing the device. That is, the device + should show up in Windows Explorer (in Windows) or Finder (in OS X). + * In |app|, go to Preferences->Ignored Devices and check that your device + is not being ignored + * If all the above steps fail, go to Preferences->Miscellaneous and click + debug device detection with your device attached and post the output as a + ticket on `the calibre bug tracker `_. My device is non-standard or unusual. What can I do to connect to it? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -239,42 +250,71 @@ If you don't want to uninstall it altogether, there are a couple of tricks you c simplest is to simply re-name the executable file that launches the library program. More detail `in the forums `_. -How do I use |app| with my iPad/iPhone/iTouch? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +How do I use |app| with my iPad/iPhone/iPod touch? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Over the air ^^^^^^^^^^^^^^ -The easiest way to browse your |app| collection on your Apple device (iPad/iPhone/iPod) is by using the calibre content server, which makes your collection available over the net. First perform the following steps in |app| +The easiest way to browse your |app| collection on your Apple device +(iPad/iPhone/iPod) is by using the |app| content server, which makes your +collection available over the net. First perform the following steps in |app| - * Set the Preferred Output Format in |app| to EPUB (The output format can be set under :guilabel:`Preferences->Interface->Behavior`) - * Set the output profile to iPad (this will work for iPhone/iPods as well), under :guilabel:`Preferences->Conversion->Common Options->Page Setup` - * Convert the books you want to read on your iPhone to EPUB format by selecting them and clicking the Convert button. - * Turn on the Content Server in |app|'s preferences and leave |app| running. + * Set the Preferred Output Format in |app| to EPUB (The output format can be + set under :guilabel:`Preferences->Interface->Behavior`) + * Set the output profile to iPad (this will work for iPhone/iPods as well), + under :guilabel:`Preferences->Conversion->Common Options->Page Setup` + * Convert the books you want to read on your iDevice to EPUB format by + selecting them and clicking the Convert button. + * Turn on the Content Server by clicking the :guilabel:`Connect/Share` button + and leave |app| running. You can also tell |app| to automatically start the + content server via :guilabel:`Preferences->Sharing over the net`. -Now on your iPad/iPhone you have two choices, use either iBooks (version 1.2 and later) or Stanza (version 3.0 and later). Both are available free from the app store. +There are many apps for your iDevice that can connect to |app|. Here we +describe using two of them, iBooks and Stanza. Using Stanza *************** -Now you should be able to access your books on your iPhone by opening Stanza. Go to "Get Books" and then click the "Shared" tab. Under Shared you will see an entry "Books in calibre". If you don't, make sure your iPad/iPhone is connected using the WiFi network in your house, not 3G. If the |app| catalog is still not detected in Stanza, you can add it manually in Stanza. To do this, click the "Shared" tab, then click the "Edit" button and then click "Add book source" to add a new book source. In the Add Book Source screen enter whatever name you like and in the URL field, enter the following:: +You should be able to access your books on your iPhone by opening Stanza. Go to +"Get Books" and then click the "Shared" tab. Under Shared you will see an entry +"Books in calibre". If you don't, make sure your iPad/iPhone is connected using +the WiFi network in your house, not 3G. If the |app| catalog is still not +detected in Stanza, you can add it manually in Stanza. To do this, click the +"Shared" tab, then click the "Edit" button and then click "Add book source" to +add a new book source. 
In the Add Book Source screen enter whatever name you +like and in the URL field, enter the following:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. Now click "Save" and you are done. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. Now click "Save" +and you are done. -If you get timeout errors while browsing the calibre catalog in Stanza, try increasing the connection timeout value in the stanza settings. Go to Info->Settings and increase the value of Download Timeout. +If you get timeout errors while browsing the calibre catalog in Stanza, try +increasing the connection timeout value in the Stanza settings. Go to +Info->Settings and increase the value of Download Timeout. Using iBooks ************** -Start the Safari browser and type in the IP address and port of the computer running the calibre server, like this:: +Start the Safari browser and type in the IP address and port of the computer +running the calibre server, like this:: http://192.168.1.2:8080/ -Replace ``192.168.1.2`` with the local IP address of the computer running |app|. If you have changed the port the |app| content server is running on, you will have to change ``8080`` as well to the new port. The local IP address is the IP address you computer is assigned on your home network. A quick Google search will tell you how to find out your local IP address. +Replace ``192.168.1.2`` with the local IP address of the computer running +|app|. If you have changed the port the |app| content server is running on, you +will have to change ``8080`` as well to the new port. The local IP address is +the IP address your computer is assigned on your home network. A quick Google +search will tell you how to find out your local IP address. -You will see a list of books in Safari, just click on the epub link for whichever book you want to read, Safari will then prompt you to open it with iBooks. +You will see a list of books in Safari. Just click on the epub link for +whichever book you want to read; Safari will then prompt you to open it with +iBooks. With the USB cable + iTunes @@ -436,10 +476,10 @@ that allows you to create collections on your Kindle from the |app| metadata. It .. note:: Amazon have removed the ability to manipulate collections completely in their newer models, like the Kindle Touch and Kindle Fire, making even the above plugin useless. If you really want the ability to manage collections on your Kindle via a USB connection, we encourage you to complain to Amazon about it, or get a reader where this is supported, like the SONY or Kobo Readers. -I am getting an error when I try to use |app| with my Kobo Touch? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +I am getting an error when I try to use |app| with my Kobo Touch/Glo/etc.?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Kobo Touch has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your touch with |app| try the following, each of which has solved the problem for *some* |app| users. +The Kobo has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your Kobo with |app|, try the following, each of which has solved the problem for *some* |app| users. * Connect the Kobo directly to your computer, not via USB Hub * Try a different USB cable and a different USB port on your computer @@ -539,9 +579,9 @@ Yes, you can. Follow the instructions in the answer above for adding custom colu How do I move my |app| library from one computer to another? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard wont run. In that case, click the calibre icon in the tooolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the calibre icon on the toolbar. +Simply copy the |app| library folder from the old to the new computer. You can find out what the library folder is by clicking the calibre icon in the toolbar. The very first item is the path to the library folder. Now on the new computer, start |app| for the first time. It will run the Welcome Wizard asking you for the location of the |app| library. Point it to the previously copied folder. If the computer you are transferring to already has a calibre installation, then the Welcome wizard won't run. In that case, right-click the |app| icon in the toolbar and point it to the newly copied directory. You will now have two calibre libraries on your computer and you can switch between them by clicking the |app| icon on the toolbar. Transferring your library in this manner preserves all your metadata, tags, custom columns, etc. -Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also right-click the calibre icon on the tool bar, select Library Maintenance and run the Check Library action. It will warn you about any problems in your library, which you should fix by hand. +Note that if you are transferring between different types of computers (for example Windows to OS X) then after doing the above you should also right-click the |app| icon on the tool bar, select Library Maintenance and run the Check Library action. It will warn you about any problems in your library, which you should fix by hand. .. note:: A |app| library is just a folder which contains all the book files and their metadata. All the metadata is stored in a single file called metadata.db, in the top level folder.
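Since metadata.db is an ordinary SQLite database, you can also peek inside it if you are curious about what |app| stores there. The sketch below is illustrative only and rests on two assumptions: the default schema (a ``books`` table with ``title``, ``author_sort`` and ``path`` columns) and a placeholder library path that you would replace with your own::

    import os, sqlite3

    library = '/path/to/Calibre Library'  # replace with your library folder
    db = sqlite3.connect(os.path.join(library, 'metadata.db'))
    # List every book record along with the folder it is stored in
    for title, author_sort, path in db.execute(
            'SELECT title, author_sort, path FROM books ORDER BY title'):
        print('%s | %s | %s' % (title, author_sort, path))
    db.close()

Treat this as read-only exploration; |app| itself should be the only thing that writes to metadata.db.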
If this file gets corrupted, you may see an empty list of books in |app|. In this case you can ask |app| to restore your books by doing a right-click on the |app| icon in the toolbar and selecting Library Maintenance->Restore Library. @@ -652,7 +692,7 @@ Post any output you see in a help message on the `Forum `. diff --git a/manual/gui.rst b/manual/gui.rst index a51ced54d3..98954ebabd 100755 --- a/manual/gui.rst +++ b/manual/gui.rst @@ -537,6 +537,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes - Merge selected records, keeping originals * - :kbd:`O` - Open containing folder + * - :kbd:`P` + - Polish books * - :kbd:`S` - Save to Disk * - :kbd:`V` diff --git a/recipes/aachener_nachrichten.recipe b/recipes/aachener_nachrichten.recipe index a2294fc472..cdcb6895bc 100644 --- a/recipes/aachener_nachrichten.recipe +++ b/recipes/aachener_nachrichten.recipe @@ -2,41 +2,70 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class AdvancedUserRecipe(BasicNewsRecipe): title = u'Aachener Nachrichten' - __author__ = 'schuster' - oldest_article = 1 + __author__ = 'schuster' #AGE update 2012-11-28 + oldest_article = 1 max_articles_per_feed = 100 - use_embedded_content = False - language = 'de' - remove_javascript = True - cover_url = 'http://www.an-online.de/einwaage/images/an_logo.png' - masthead_url = 'http://www.an-online.de/einwaage/images/an_logo.png' - extra_css = ''' - .fliesstext_detail:{margin-bottom:10%;} - .headline_1:{margin-bottom:25%;} - b{font-family:Arial,Helvetica,sans-serif; font-weight:200;font-size:large;} - a{font-family:Arial,Helvetica,sans-serif; font-weight:400;font-size:large;} - ll{font-family:Arial,Helvetica,sans-serif; font-weight:100;font-size:large;} - h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - img {min-width:300px; max-width:600px; min-height:300px; max-height:800px} - dd{font-family:Arial,Helvetica,sans-serif;font-size:large;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' - + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + language = 'de' +# cover_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png' + masthead_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png' keep_only_tags = [ - dict(name='span', attrs={'class':['fliesstext_detail', 'headline_1', 'autor_detail']}), - dict(id=['header-logo']) - ] + dict(name='article', attrs={'class':['single']}) + ] - feeds = [(u'Euregio', u'http://www.an-online.de/an/rss/Euregio.xml'), - (u'Aachen', u'http://www.an-online.de/an/rss/Aachen.xml'), - (u'Nordkreis', u'http://www.an-online.de/an/rss/Nordkreis.xml'), - (u'Düren', u'http://www.an-online.de/an/rss/Dueren.xml'), - (u'Eiffel', u'http://www.an-online.de/an/rss/Eifel.xml'), - (u'Eschweiler', u'http://www.an-online.de/an/rss/Eschweiler.xml'), - (u'Geilenkirchen', u'http://www.an-online.de/an/rss/Geilenkirchen.xml'), - (u'Heinsberg', u'http://www.an-online.de/an/rss/Heinsberg.xml'), - (u'Jülich', u'http://www.an-online.de/an/rss/Juelich.xml'), - (u'Stolberg', u'http://www.an-online.de/an/rss/Stolberg.xml'), - (u'Ratgebenr', u'http://www.an-online.de/an/rss/Ratgeber.xml')] + remove_tags = [ + dict(name='div', attrs={'class':["clearfix navi-wrapper"]}), + dict(name='div', attrs={'id':["article_actions"]}), + dict(name='style', attrs={'type':["text/css"]}), + dict(name='aside'), + dict(name='a', attrs={'class':["btn btn-action"]}) + ] + + feeds = [ + (u'Lokales - Euregio', 
u'http://www.aachener-nachrichten.de/cmlink/euregio-rss-1.357285'), + (u'Lokales - Aachen', u'http://www.aachener-nachrichten.de/cmlink/aachen-rss-1.357286'), + (u'Lokales - Nordkreis', u'http://www.aachener-nachrichten.de/cmlink/nordkreis-rss-1.358150'), + (u'Lokales - Düren', u'http://www.aachener-nachrichten.de/cmlink/dueren-rss-1.358626'), + (u'Lokales - Eiffel', u'http://www.aachener-nachrichten.de/cmlink/eifel-rss-1.358978'), + (u'Lokales - Eschweiler', u'http://www.aachener-nachrichten.de/cmlink/eschweiler-rss-1.359332'), + (u'Lokales - Geilenkirchen', u'http://www.aachener-nachrichten.de/cmlink/geilenkirchen-rss-1.359643'), + (u'Lokales - Heinsberg', u'http://www.aachener-nachrichten.de/cmlink/heinsberg-rss-1.359724'), + (u'Lokales - Jülich', u'http://www.aachener-nachrichten.de/cmlink/juelich-rss-1.359725'), + (u'Lokales - Stolberg', u'http://www.aachener-nachrichten.de/cmlink/stolberg-rss-1.359726'), + (u'News - Politik', u'http://www.aachener-nachrichten.de/cmlink/politik-rss-1.359727'), + (u'News - Aus aller Welt', u'http://www.aachener-nachrichten.de/cmlink/ausallerwelt-rss-1.453282'), + (u'News - Wirtschaft', u'http://www.aachener-nachrichten.de/cmlink/wirtschaft-rss-1.359872'), + (u'News - Kultur', u'http://www.aachener-nachrichten.de/cmlink/kultur-rss-1.365018'), + (u'News - Kino', u'http://www.aachener-nachrichten.de/cmlink/kino-rss-1.365019'), + (u'News - Digital', u'http://www.aachener-nachrichten.de/cmlink/digital-rss-1.365020'), + (u'News - Wissenschaft', u'http://www.aachener-nachrichten.de/cmlink/wissenschaft-rss-1.365021'), + (u'News - Hochschule', u'http://www.aachener-nachrichten.de/cmlink/hochschule-rss-1.365022'), + (u'News - Auto', u'http://www.aachener-nachrichten.de/cmlink/auto-rss-1.365023'), + (u'News - Kurioses', u'http://www.aachener-nachrichten.de/cmlink/kurioses-rss-1.365067'), + (u'News - Musik', u'http://www.aachener-nachrichten.de/cmlink/musik-rss-1.365305'), + (u'News - Tagesthema', u'http://www.aachener-nachrichten.de/cmlink/tagesthema-rss-1.365519'), + (u'News - Newsticker', u'http://www.aachener-nachrichten.de/cmlink/newsticker-rss-1.451948'), + (u'Sport - Aktuell', u'http://www.aachener-nachrichten.de/cmlink/aktuell-rss-1.366716'), + (u'Sport - Fußball', u'http://www.aachener-nachrichten.de/cmlink/fussball-rss-1.367060'), + (u'Sport - Bundesliga', u'http://www.aachener-nachrichten.de/cmlink/bundesliga-rss-1.453367'), + (u'Sport - Alemannia Aachen', u'http://www.aachener-nachrichten.de/cmlink/alemanniaaachen-rss-1.366057'), + (u'Sport - Volleyball', u'http://www.aachener-nachrichten.de/cmlink/volleyball-rss-1.453370'), + (u'Sport - Chio', u'http://www.aachener-nachrichten.de/cmlink/chio-rss-1.453371'), + (u'Dossier - Kinderuni', u'http://www.aachener-nachrichten.de/cmlink/kinderuni-rss-1.453375'), + (u'Dossier - Karlspreis', u'http://www.aachener-nachrichten.de/cmlink/karlspreis-rss-1.453376'), + (u'Dossier - Ritterorden', u'http://www.aachener-nachrichten.de/cmlink/ritterorden-rss-1.453377'), + (u'Dossier - ZAB-Aachen', u'http://www.aachener-nachrichten.de/cmlink/zabaachen-rss-1.453380'), + (u'Dossier - Karneval', u'http://www.aachener-nachrichten.de/cmlink/karneval-rss-1.453384'), + (u'Ratgeber - Geld', u'http://www.aachener-nachrichten.de/cmlink/geld-rss-1.453385'), + (u'Ratgeber - Recht', u'http://www.aachener-nachrichten.de/cmlink/recht-rss-1.453386'), + (u'Ratgeber - Gesundheit', u'http://www.aachener-nachrichten.de/cmlink/gesundheit-rss-1.453387'), + (u'Ratgeber - Familie', 
u'http://www.aachener-nachrichten.de/cmlink/familie-rss-1.453388'), + (u'Ratgeber - Livestyle', u'http://www.aachener-nachrichten.de/cmlink/lifestyle-rss-1.453389'), + (u'Ratgeber - Reisen', u'http://www.aachener-nachrichten.de/cmlink/reisen-rss-1.453390'), + (u'Ratgeber - Bauen und Wohnen', u'http://www.aachener-nachrichten.de/cmlink/bauen-rss-1.453398'), + (u'Ratgeber - Bildung und Beruf', u'http://www.aachener-nachrichten.de/cmlink/bildung-rss-1.453400'), + ] diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 485a2e0c5b..b02460695e 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -9,18 +9,19 @@ class Adventure_zone(BasicNewsRecipe): no_stylesheets = True oldest_article = 20 max_articles_per_feed = 100 + cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' index='http://www.adventure-zone.info/fusion/' - use_embedded_content=False + use_embedded_content = False preprocess_regexps = [(re.compile(r"Komentarze", re.IGNORECASE), lambda m: ''), - (re.compile(r'\'), lambda match: ''), - (re.compile(r'\'), lambda match: '')] + (re.compile(r''), lambda match: ''), + (re.compile(r''), lambda match: '')] remove_tags_before= dict(name='td', attrs={'class':'main-bg'}) remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})] remove_tags_after= dict(id='comments') extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }' feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')] - def parse_feeds (self): + '''def parse_feeds (self): feeds = BasicNewsRecipe.parse_feeds(self) soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php') tag=soup.find(name='channel') @@ -33,24 +34,33 @@ class Adventure_zone(BasicNewsRecipe): for feed in feeds: for article in feed.articles[:]: article.title=titles[feed.articles.index(article)] - return feeds + return feeds''' - def get_cover_url(self): + '''def get_cover_url(self): soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php') cover=soup.find(id='box_OstatninumerAZ') self.cover_url='http://www.adventure-zone.info/fusion/'+ cover.center.a.img['src'] - return getattr(self, 'cover_url', self.cover_url) - + return getattr(self, 'cover_url', self.cover_url)''' + def populate_article_metadata(self, article, soup, first): + result = re.search('(.+) - Adventure Zone', soup.title.string) + if result: + article.title = result.group(1) + else: + result = soup.body.find('strong') + if result: + article.title = result.string def skip_ad_pages(self, soup): skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'}) skip_tag = skip_tag.findAll(name='a') - for r in skip_tag: - if r.strong: - word=r.strong.string.lower() - if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) + title = soup.title.string.lower() + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + for r in skip_tag: + if r.strong and r.strong.string: + word=r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True) def preprocess_html(self, soup): footer=soup.find(attrs={'class':'news-footer 
middle-border'}) diff --git a/recipes/aksiyon_derigisi.recipe b/recipes/aksiyon_derigisi.recipe index bc15b39095..d7be418413 100644 --- a/recipes/aksiyon_derigisi.recipe +++ b/recipes/aksiyon_derigisi.recipe @@ -20,6 +20,7 @@ class Aksiyon (BasicNewsRecipe): auto_cleanup = True cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg' masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg' + ignore_duplicate_articles = { 'title', 'url' } remove_empty_feeds= True feeds = [ ( u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'), diff --git a/recipes/alternet.recipe b/recipes/alternet.recipe index e58376cc42..0bd608e0e7 100644 --- a/recipes/alternet.recipe +++ b/recipes/alternet.recipe @@ -10,14 +10,12 @@ class Alternet(BasicNewsRecipe): category = 'News, Magazine' description = 'News magazine and online community' feeds = [ - (u'Front Page', u'http://feeds.feedblitz.com/alternet'), - (u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'), - (u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'), - (u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage') + (u'Front Page', u'http://feeds.feedblitz.com/alternet') ] + remove_attributes = ['width', 'align','cellspacing'] remove_javascript = True - use_embedded_content = False + use_embedded_content = True no_stylesheets = True language = 'en' encoding = 'UTF-8' diff --git a/recipes/ambito_financiero.recipe b/recipes/ambito_financiero.recipe index 08c056e8ee..c45fa8fbce 100644 --- a/recipes/ambito_financiero.recipe +++ b/recipes/ambito_financiero.recipe @@ -42,7 +42,7 @@ class Ambito_Financiero(BasicNewsRecipe): remove_attributes = ['align'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) if self.username is not None and self.password is not None: br.open(self.LOGIN) diff --git a/recipes/anchorage_daily.recipe b/recipes/anchorage_daily.recipe index 4ce2f13a14..7bda0f5bcd 100644 --- a/recipes/anchorage_daily.recipe +++ b/recipes/anchorage_daily.recipe @@ -5,14 +5,16 @@ class AdvancedUserRecipe1278347258(BasicNewsRecipe): __author__ = 'rty' oldest_article = 7 max_articles_per_feed = 100 + auto_cleanup = True + feeds = [(u'Alaska News', u'http://www.adn.com/news/alaska/index.xml'), - (u'Business', u'http://www.adn.com/money/index.xml'), - (u'Sports', u'http://www.adn.com/sports/index.xml'), - (u'Politics', u'http://www.adn.com/politics/index.xml'), - (u'Lifestyles', u'http://www.adn.com/life/index.xml'), - (u'Iditarod', u'http://www.adn.com/iditarod/index.xml') - ] + (u'Business', u'http://www.adn.com/money/index.xml'), + (u'Sports', u'http://www.adn.com/sports/index.xml'), + (u'Politics', u'http://www.adn.com/politics/index.xml'), + (u'Lifestyles', u'http://www.adn.com/life/index.xml'), + (u'Iditarod', u'http://www.adn.com/iditarod/index.xml') + ] description = ''''Alaska's Newspaper''' publisher = 'http://www.adn.com' category = 'news, Alaska, Anchorage' @@ -28,13 +30,13 @@ class AdvancedUserRecipe1278347258(BasicNewsRecipe): conversion_options = {'linearize_tables':True} masthead_url = 'http://media.adn.com/includes/assets/images/adn_logo.2.gif' - keep_only_tags = [ - dict(name='div', attrs={'class':'left_col story_mainbar'}), - ] - remove_tags = [ - dict(name='div', attrs={'class':'story_tools'}), - dict(name='p', attrs={'class':'ad_label'}), - ] - remove_tags_after = [ - dict(name='div', attrs={'class':'advertisement'}), - ] 
+ #keep_only_tags = [ + #dict(name='div', attrs={'class':'left_col story_mainbar'}), + #] + #remove_tags = [ + #dict(name='div', attrs={'class':'story_tools'}), + #dict(name='p', attrs={'class':'ad_label'}), + #] + #remove_tags_after = [ + #dict(name='div', attrs={'class':'advertisement'}), + #] diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe index c7a4a97d3c..a4a387d414 100644 --- a/recipes/android_com_pl.recipe +++ b/recipes/android_com_pl.recipe @@ -3,11 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe class Android_com_pl(BasicNewsRecipe): title = u'Android.com.pl' __author__ = 'fenuks' - description = 'Android.com.pl - biggest polish Android site' + description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.' category = 'Android, mobile' language = 'pl' use_embedded_content=True - cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png' + cover_url =u'http://android.com.pl/wp-content/themes/android/images/logo.png' oldest_article = 8 max_articles_per_feed = 100 - feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')] + feeds = [(u'Android', u'http://android.com.pl/feed/')] diff --git a/recipes/apple_daily.recipe b/recipes/apple_daily.recipe index 1e9953af43..763136c9b0 100644 --- a/recipes/apple_daily.recipe +++ b/recipes/apple_daily.recipe @@ -37,7 +37,7 @@ class AppleDaily(BasicNewsRecipe): #def get_browser(self): - #br = BasicNewsRecipe.get_browser() + #br = BasicNewsRecipe.get_browser(self) #if self.username is not None and self.password is not None: # br.open('http://www.nytimes.com/auth/login') # br.select_form(name='login') diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index e121ba4d42..6bcc9bef6c 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -22,7 +22,7 @@ class Archeowiesci(BasicNewsRecipe): return feeds def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://archeowiesci.pl/wp-login.php') br.select_form(name='loginform') diff --git a/recipes/asco_de_vida.recipe b/recipes/asco_de_vida.recipe new file mode 100644 index 0000000000..fa1944f95d --- /dev/null +++ b/recipes/asco_de_vida.recipe @@ -0,0 +1,20 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class HindustanTimes(BasicNewsRecipe): + title = u'Asco de vida' + language = 'es' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + #encoding = 'cp1252' + use_embedded_content = False + + no_stylesheets = True + keep_only_tags = dict(name='div', attrs={'class':'box story'}) + + + feeds = [ +('News', + 'http://feeds2.feedburner.com/AscoDeVida'), +] + diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe new file mode 100644 index 0000000000..0b92fdfa29 --- /dev/null +++ b/recipes/astroflesz.recipe @@ -0,0 +1,19 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class Astroflesz(BasicNewsRecipe): + title = u'Astroflesz' + oldest_article = 7 + __author__ = 'fenuks' + description = u'astroflesz.pl - to portal poświęcony astronomii. 
Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' + category = 'astronomy' + language = 'pl' + cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png' + ignore_duplicate_articles = {'title', 'url'} + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + keep_only_tags = [dict(id="k2Container")] + remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) + remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] + feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')] diff --git a/recipes/azstarnet.recipe b/recipes/azstarnet.recipe index 45339ae208..3ab10a9b6f 100644 --- a/recipes/azstarnet.recipe +++ b/recipes/azstarnet.recipe @@ -31,7 +31,7 @@ class Azstarnet(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://azstarnet.com/') if self.username is not None and self.password is not None: data = urllib.urlencode({ 'm':'login' diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe new file mode 100644 index 0000000000..01499f6369 --- /dev/null +++ b/recipes/badania_net.recipe @@ -0,0 +1,17 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class BadaniaNet(BasicNewsRecipe): + title = u'badania.net' + __author__ = 'fenuks' + description = u'chcesz wiedzieć więcej?' + category = 'science' + language = 'pl' + cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] + remove_tags_after = dict(attrs={'class':'omc-single-tags'}) + keep_only_tags = [dict(id='omc-full-article')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/baltimore_sun.recipe b/recipes/baltimore_sun.recipe index 006a4c4ae6..3cd5c8edbc 100644 --- a/recipes/baltimore_sun.recipe +++ b/recipes/baltimore_sun.recipe @@ -19,6 +19,7 @@ class BaltimoreSun(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True remove_javascript = True + #auto_cleanup = True recursions = 1 ignore_duplicate_articles = {'title'} @@ -78,6 +79,7 @@ class BaltimoreSun(BasicNewsRecipe): #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'), #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'), + ## Entertainment ## (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'), (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'), @@ -142,12 +144,12 @@ class BaltimoreSun(BasicNewsRecipe): (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), -## Life Blogs ## - (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), - (u'Baltimore 
Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), - (u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), - (u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), - (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), +### Life Blogs ## + #(u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), + #(u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), + #(u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), + #(u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), + #(u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'Game Cache', u'http://www.baltimoresun.com/entertainment/bthesite/game-cache/rss2.0.xml'), @@ -167,6 +169,7 @@ class BaltimoreSun(BasicNewsRecipe): ] + def get_article_url(self, article): ans = None try: diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 9d79aed728..58c62e20e9 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -28,6 +28,8 @@ class Barrons(BasicNewsRecipe): ## Don't grab articles more than 7 days old oldest_article = 7 + use_javascript_to_login = True + requires_version = (0, 9, 16) extra_css = ''' .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;} @@ -40,7 +42,7 @@ class Barrons(BasicNewsRecipe): .insettipUnit{font-size: x-small;} ''' remove_tags = [ - dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), + dict(name ='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), dict(name = 'a', attrs ={'class':'insetClose'}) ] @@ -60,21 +62,17 @@ class Barrons(BasicNewsRecipe): ] ] - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('http://commerce.barrons.com/auth/login') - br.select_form(name='login_form') - br['user'] = self.username - br['password'] = self.password - br.submit() - return br + def javascript_login(self, br, username, password): + br.visit('http://commerce.barrons.com/auth/login') + f = br.select_form(nr=0) + f['username'] = username + f['password'] = password + br.submit(timeout=120) ## Use the print version of a page when available. 
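The Barron's hunk above swaps the mechanize-based get_browser() login for calibre's javascript login hook, with requires_version guarding against builds that predate it. A compact sketch of the same hook on a hypothetical site (URL, form index and field names are placeholders, not Barron's actual markup):

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleJavascriptLoginRecipe(BasicNewsRecipe):
    # Hypothetical recipe illustrating the use_javascript_to_login hook.
    title = 'Example (javascript login)'
    needs_subscription = True
    use_javascript_to_login = True
    requires_version = (0, 9, 16)   # hook is unavailable in older releases

    def javascript_login(self, br, username, password):
        # br is calibre's javascript-capable browser here, not mechanize.
        br.visit('http://example.com/auth/login')   # placeholder URL
        f = br.select_form(nr=0)                    # placeholder form index
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)
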
- def print_version(self, url): main, sep, rest = url.rpartition('?') - return main + '#printmode' + return main + '#text.print' def postprocess_html(self, soup, first): diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index 4ed59614e7..a04f267ca3 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -35,8 +35,8 @@ class Bash_org_pl(BasicNewsRecipe): soup=self.index_to_soup(u'http://bash.org.pl/random/') #date=soup.find('div', attrs={'class':'right'}).string url=soup.find('a', attrs={'class':'qid click'}) - title=url.string - url='http://bash.org.pl' +url['href'] + title='' + url='http://bash.org.pl/random/' articles.append({'title' : title, 'url' : url, 'date' : '', @@ -44,6 +44,8 @@ class Bash_org_pl(BasicNewsRecipe): }) return articles + def populate_article_metadata(self, article, soup, first): + article.title = soup.find(attrs={'class':'qid click'}).string def parse_index(self): feeds = [] diff --git a/recipes/big_oven.recipe b/recipes/big_oven.recipe index ba3a5dec38..a1e9a5c042 100644 --- a/recipes/big_oven.recipe +++ b/recipes/big_oven.recipe @@ -25,7 +25,7 @@ class BigOven(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.bigoven.com/account/login?ReturnUrl=/') br.select_form(nr=1) diff --git a/recipes/birmingham_post.recipe b/recipes/birmingham_post.recipe index b9b3c3fc57..db2e29c821 100644 --- a/recipes/birmingham_post.recipe +++ b/recipes/birmingham_post.recipe @@ -1,9 +1,11 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re +import mechanize + class AdvancedUserRecipe1306097511(BasicNewsRecipe): title = u'Birmingham post' description = 'Author D.Asbury. 
News for Birmingham UK' #timefmt = '' - # last update 8/9/12 __author__ = 'Dave Asbury' cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg' oldest_article = 2 @@ -12,27 +14,36 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): remove_empty_feeds = True remove_javascript = True no_stylesheets = True - #auto_cleanup = True + auto_cleanup = True language = 'en_GB' + cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg' - masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg' + masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif' + def get_cover_url(self): + soup = self.index_to_soup('http://www.birminghampost.net') + # look for the block containing the sun button and url + cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Post')}) + print + print '%%%%%%%%%%%%%%%',cov + print + cov2 = str(cov['src']) + # cov2=cov2[7:] + print '88888888 ',cov2,' 888888888888' + + #cover_url=cov2 + #return cover_url + br = mechanize.Browser() + br.set_handle_redirect(False) + try: + br.open_novisit(cov2) + cover_url = cov2 + except: + cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg' + return cover_url - keep_only_tags = [ - dict(attrs={'id' : 'article-header'}), - #dict(name='h1',attrs={'id' : 'article-header'}), - dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}), - dict(name='div',attrs={'class' : 'article-image full'}), - dict(attrs={'clas' : 'art-o art-align-center otm-1 '}), - dict(name='div',attrs={'class' : 'article main'}), -#dict(name='p') - #dict(attrs={'id' : 'three-col'}) - ] - remove_tags = [ - # dict(name='div',attrs={'class' : 'span-33 last header-links'}) - ] feeds = [ #(u'News',u'http://www.birminghampost.net/news/rss.xml'), (u'West Mids. 
News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'), @@ -41,9 +52,3 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): (u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml') ] - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;text-align:center;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' diff --git a/recipes/borse_online.recipe b/recipes/borse_online.recipe index c192ce2b8d..ddd9ac456b 100644 --- a/recipes/borse_online.recipe +++ b/recipes/borse_online.recipe @@ -1,33 +1,36 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class AdvancedUserRecipe1303841067(BasicNewsRecipe): - title = u'Börse-online' - __author__ = 'schuster' - oldest_article = 1 + title = u'Börse-online' + __author__ = 'schuster, Armin Geller' + oldest_article = 1 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'de' - remove_javascript = True - cover_url = 'http://www.dpv.de/images/1995/source.gif' - masthead_url = 'http://www.zeitschriften-cover.de/cover/boerse-online-cover-januar-2010-x1387.jpg' - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} - h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - img {min-width:300px; max-width:600px; min-height:300px; max-height:800px} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' - remove_tags_bevor = [dict(name='h3')] - remove_tags_after = [dict(name='div', attrs={'class':'artikelfuss'})] - remove_tags = [dict(attrs={'class':['moduleTopNav', 'moduleHeaderNav', 'text', 'blau', 'poll1150']}), - dict(id=['newsletterlayer', 'newsletterlayerClose', 'newsletterlayer_body', 'newsletterarray_error', 'newsletterlayer_emailadress', 'newsletterlayer_submit', 'kommentar']), - dict(name=['h2', 'Gesamtranking', 'h3',''])] + no_stylesheets = True + use_embedded_content = False + language = 'de' + remove_javascript = True + encoding = 'iso-8859-1' + timefmt = ' [%a, %d %b %Y]' + + + cover_url = 'http://www.wirtschaftsmedien-shop.de/s/media/coverimages/7576_2013107.jpg' + masthead_url = 'http://upload.wikimedia.org/wikipedia/de/5/56/B%C3%B6rse_Online_Logo.svg' + remove_tags_after = [dict(name='div', attrs={'class':['artikelfuss', 'rahmen600']})] + + remove_tags = [ + dict(name='div', attrs={'id':['breadcrumb', 'rightCol', 'clearall']}), + dict(name='div', attrs={'class':['footer', 'artikelfuss']}), + ] + + keep_only_tags = [ + dict(name='div', attrs={'id':['contentWrapper']}) + ] + + feeds = [(u'Börsennachrichten', u'http://www.boerse-online.de/rss/')] + def print_version(self, url): return url.replace('.html#nv=rss', '.html?mode=print') - feeds = [(u'Börsennachrichten', u'http://www.boerse-online.de/rss/')] - diff --git a/recipes/brecha.recipe b/recipes/brecha.recipe index da58710dd5..3eef48379f 100644 --- a/recipes/brecha.recipe +++ b/recipes/brecha.recipe @@ -40,7 +40,7 @@ class Brecha(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.brecha.com.uy/index.php/acceder-miembros') if self.username is not None and self.password is not None: data = urllib.urlencode({ 'task':'login' diff --git a/recipes/bwmagazine2.recipe 
b/recipes/bwmagazine2.recipe index 77143bbefc..cba255afa8 100644 --- a/recipes/bwmagazine2.recipe +++ b/recipes/bwmagazine2.recipe @@ -11,16 +11,15 @@ class BusinessWeekMagazine(BasicNewsRecipe): category = 'news' encoding = 'UTF-8' keep_only_tags = [ - dict(name='div', attrs={'id':'article_body_container'}), - ] - remove_tags = [dict(name='ui'),dict(name='li')] + dict(name='div', attrs={'id':'article_body_container'}), + ] + remove_tags = [dict(name='ui'),dict(name='li'),dict(name='div', attrs={'id':['share-email']})] no_javascript = True no_stylesheets = True cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg' def parse_index(self): - #Go to the issue soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm') @@ -39,7 +38,7 @@ class BusinessWeekMagazine(BasicNewsRecipe): title=self.tag_to_string(div.a).strip() url=div.a['href'] soup0 = self.index_to_soup(url) - urlprint=soup0.find('li', attrs={'class':'print'}).a['href'] + urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href'] articles.append({'title':title, 'url':urlprint, 'description':'', 'date':''}) @@ -47,7 +46,6 @@ class BusinessWeekMagazine(BasicNewsRecipe): if section_title not in feeds: feeds[section_title] = [] feeds[section_title] += articles - div1 = soup.find ('div', attrs={'class':'column center'}) section_title = '' for div in div1.findAll('h5'): @@ -57,7 +55,7 @@ class BusinessWeekMagazine(BasicNewsRecipe): title=self.tag_to_string(div.a).strip() url=div.a['href'] soup0 = self.index_to_soup(url) - urlprint=soup0.find('li', attrs={'class':'print'}).a['href'] + urlprint=soup0.find('li', attrs={'class':'print tracked'}).a['href'] articles.append({'title':title, 'url':urlprint, 'description':desc, 'date':''}) if articles: diff --git a/recipes/cacm.recipe b/recipes/cacm.recipe index e4af9d2024..a7b1c602a3 100644 --- a/recipes/cacm.recipe +++ b/recipes/cacm.recipe @@ -17,7 +17,7 @@ class AdvancedUserRecipe1286242553(BasicNewsRecipe): cover_url_pattern = 'http://cacm.acm.org/magazines/%d/%d' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://cacm.acm.org/login') br.select_form(nr=1) diff --git a/recipes/caijing.recipe b/recipes/caijing.recipe index 34e6c1e8a9..05bc9314b3 100644 --- a/recipes/caijing.recipe +++ b/recipes/caijing.recipe @@ -34,7 +34,7 @@ class Caijing(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://service.caijing.com.cn/usermanage/login') br.select_form(name='mainLoginForm') diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index d1b28de9de..f7b1ee71d5 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git 
a/recipes/chronicle_higher_ed.recipe b/recipes/chronicle_higher_ed.recipe index 15b284cd7a..cac0cf77da 100644 --- a/recipes/chronicle_higher_ed.recipe +++ b/recipes/chronicle_higher_ed.recipe @@ -12,10 +12,10 @@ class Chronicle(BasicNewsRecipe): category = 'news' encoding = 'UTF-8' keep_only_tags = [ - dict(name='div', attrs={'class':'article'}), + dict(name='div', attrs={'class':['article','blog-mod']}), ] - remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}), - dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup']}), + remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle','entry-utility','object-meta']}), + dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup','confirm-popup']}), dict(name='a', attrs={'class':'show-enlarge enlarge'})] no_javascript = True no_stylesheets = True @@ -23,7 +23,7 @@ class Chronicle(BasicNewsRecipe): needs_subscription = True def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://chronicle.com/myaccount/login') br.select_form(nr=1) diff --git a/recipes/cnn.recipe b/recipes/cnn.recipe index 6043f8b401..3cb66d2967 100644 --- a/recipes/cnn.recipe +++ b/recipes/cnn.recipe @@ -73,7 +73,7 @@ class CNN(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/conowego_pl.recipe b/recipes/conowego_pl.recipe old mode 100755 new mode 100644 diff --git a/recipes/contemporary_argentine_writers.recipe b/recipes/contemporary_argentine_writers.recipe new file mode 100644 index 0000000000..bd69967f3d --- /dev/null +++ b/recipes/contemporary_argentine_writers.recipe @@ -0,0 +1,35 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +contemporaryargentinewriters.wordpress.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class contemporaryargentinewriters(BasicNewsRecipe): + title = 'Contemporary Argentine Writers' + __author__ = 'Darko Miletic' + description = 'Short stories by Argentine writers (and others) translated into English' + publisher = 'Dario Bard' + category = 'fiction, literature, Argentina, english' + oldest_article = 25 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = True + language = 'en_AR' + remove_empty_feeds = True + publication_type = 'blog' + extra_css = """ + body{font-family: Arial,Helvetica,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + feeds = [(u'Posts', u'http://contemporaryargentinewriters.wordpress.com/feed/')] diff --git a/recipes/corriere_della_sera_it.recipe b/recipes/corriere_della_sera_it.recipe index b3bcebf505..01d8dbe720 100644 --- a/recipes/corriere_della_sera_it.recipe +++ b/recipes/corriere_della_sera_it.recipe @@ -62,7 +62,7 @@ class ilCorriere(BasicNewsRecipe): day = "%.2d" % st.tm_mday #http://images.corriere.it/primapagina/storico/2010_05_17/images/prima_pagina_grande.png cover='http://images.corriere.it/primapagina/storico/'+ year + '_' + month +'_' + day +'/images/prima_pagina_grande.png' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: 
br.open(cover) except: diff --git a/recipes/countryfile.recipe b/recipes/countryfile.recipe index 4f2e8cd95f..86769b78cd 100644 --- a/recipes/countryfile.recipe +++ b/recipes/countryfile.recipe @@ -7,25 +7,30 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg' __author__ = 'Dave Asbury' description = 'The official website of Countryfile Magazine' - # last updated 7/10/12 + # last updated 8/12/12 language = 'en_GB' oldest_article = 30 max_articles_per_feed = 25 remove_empty_feeds = True no_stylesheets = True auto_cleanup = True + ignore_duplicate_articles = {'title', 'url'} #articles_are_obfuscated = True - ignore_duplicate_articles = {'title'} + #article_already_exists = False + #feed_hash = '' def get_cover_url(self): - soup = self.index_to_soup('http://www.countryfile.com/') + soup = self.index_to_soup('http://www.countryfile.com/magazine') + cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px_wide')})#'width' : '160', + print '&&&&&&&& ',cov,' ***' + cov=str(cov) + #cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) + cov2 = re.findall('/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov) + + cov2 = str(cov2) + cov2= "http://www.countryfile.com"+cov2[2:len(cov2)-8] - cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')}) - print '******** ',cov,' ***' - cov2 = str(cov) - cov2=cov2[10:101] print '******** ',cov2,' ***' - #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg' - # try to get cover - if can't get known cover + # try to get cover - if can't get known cover br = browser() br.set_handle_redirect(False) @@ -45,5 +50,3 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): (u'Countryside', u'http://www.countryfile.com/rss/countryside'), ] - - diff --git a/recipes/czas_gentlemanow.recipe b/recipes/czas_gentlemanow.recipe new file mode 100644 index 0000000000..6df677f25f --- /dev/null +++ b/recipes/czas_gentlemanow.recipe @@ -0,0 +1,20 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from calibre.web.feeds.news import BasicNewsRecipe + +class CzasGentlemanow(BasicNewsRecipe): + title = u'Czas Gentlemanów' + __author__ = 'fenuks' + description = u'Historia mężczyzn z dala od wielkiej polityki' + category = 'blog' + language = 'pl' + cover_url = 'http://czasgentlemanow.pl/wp-content/uploads/2012/10/logo-Czas-Gentlemanow1.jpg' + ignore_duplicate_articles = {'title', 'url'} + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(name='div', attrs={'class':'content'})] + remove_tags = [dict(attrs={'class':'meta_comments'})] + remove_tags_after = dict(name='div', attrs={'class':'fblikebutton_button'}) + feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')] diff --git a/recipes/dani.recipe b/recipes/dani.recipe index d11eecfeb9..740d5c2381 100644 --- a/recipes/dani.recipe +++ 
b/recipes/dani.recipe @@ -40,7 +40,7 @@ class BHDani(BasicNewsRecipe): remove_attributes = ['height','width','align'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.INDEX) br.select_form(name='form') diff --git a/recipes/der_spiegel.recipe b/recipes/der_spiegel.recipe index 3a12378405..9ea4be6201 100644 --- a/recipes/der_spiegel.recipe +++ b/recipes/der_spiegel.recipe @@ -42,7 +42,7 @@ class DerSpiegel(BasicNewsRecipe): else: return True - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.PREFIX + '/meinspiegel/login.html') br.select_form(predicate=has_login_name) diff --git a/recipes/discover_magazine.recipe b/recipes/discover_magazine.recipe index 02cdb952b5..a7f080bb5f 100644 --- a/recipes/discover_magazine.recipe +++ b/recipes/discover_magazine.recipe @@ -33,6 +33,21 @@ class DiscoverMagazine(BasicNewsRecipe): remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})] + # Login stuff + needs_subscription = True + use_javascript_to_login = True + requires_version = (0, 9, 20) + + def javascript_login(self, br, username, password): + br.visit('http://discovermagazine.com', timeout=120) + f = br.select_form('div.login.section div.form') + f['username'] = username + f['password'] = password + br.submit('input[id="signInButton"]', timeout=120) + br.run_for_a_time(20) + # End login stuff + + def append_page(self, soup, appendtag, position): pager = soup.find('span',attrs={'class':'next'}) if pager: diff --git a/recipes/dobanevinosti.recipe b/recipes/dobanevinosti.recipe new file mode 100644 index 0000000000..9d148be8b4 --- /dev/null +++ b/recipes/dobanevinosti.recipe @@ -0,0 +1,46 @@ + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +dobanevinosti.blogspot.com +''' +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class DobaNevinosti(BasicNewsRecipe): + title = 'Doba Nevinosti' + __author__ = 'Darko Miletic' + description = 'Filmski blog' + oldest_article = 15 + max_articles_per_feed = 100 + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = True + publication_type = 'blog' + auto_cleanup = True + extra_css = """ + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} + img{margin-bottom: 0.8em; display:block;} + """ + + conversion_options = { + 'comment' : description + , 'tags' : 'film, blog, srbija, tv' + , 'publisher': 'Dimitrije Vojinov' + , 'language' : language + } + remove_attributes = ['lang', 'border'] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + feeds = [(u'Tekstovi', u'http://dobanevinosti.blogspot.com/feeds/posts/default')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup + diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe index 0aafa5d2f4..603591e9f0 100644 --- a/recipes/dzieje_pl.recipe +++ b/recipes/dzieje_pl.recipe @@ -7,18 +7,64 @@ class Dzieje(BasicNewsRecipe): cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png' category = 'history' language = 'pl' - index='http://dzieje.pl' + ignore_duplicate_articles = {'title', 'url'} + index = 'http://dzieje.pl' 
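ignore_duplicate_articles, added to dzieje_pl above and to several other recipes in this patch (aksiyon, astroflesz, countryfile, czas_gentlemanow, ekologia_pl), drops duplicate feed entries by comparing titles and/or URLs already collected. A minimal sketch with a hypothetical recipe name and placeholder feed:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleDedupRecipe(BasicNewsRecipe):
    # Hypothetical recipe; only the de-duplication attribute matters here.
    title = 'Example (deduplicated feeds)'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    # skip articles whose title or url matches one already seen
    ignore_duplicate_articles = {'title', 'url'}
    feeds = [('All articles', 'http://example.com/feed/')]   # placeholder feed
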
oldest_article = 8 max_articles_per_feed = 100 remove_javascript=True no_stylesheets= True keep_only_tags = [dict(name='h1', attrs={'class':'title'}), dict(id='content-area')] remove_tags = [dict(attrs={'class':'field field-type-computed field-field-tagi'}), dict(id='dogory')] - feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] + #feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')] + def append_page(self, soup, appendtag): + tag = appendtag.find('li', attrs={'class':'pager-next'}) + if tag: + while tag: + url = tag.a['href'] + if not url.startswith('http'): + url = 'http://dzieje.pl'+tag.a['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(id='content-area').find(attrs={'class':'content'}) + for r in pagetext.findAll(attrs={'class':['fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult', 'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka', 'fieldgroup group-zdjeciedu', 'field field-type-filefield field-field-zdjecieglownawyd']}): + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + tag = soup2.find('li', attrs={'class':'pager-next'}) + for r in appendtag.findAll(attrs={'class':['item-list', 'field field-type-computed field-field-tagi', ]}): + r.extract() + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + tag=soup.find(id='content-area').div.div + for i in tag.findAll('div', recursive=False): + temp = i.find(attrs={'class':'views-field-title'}).span.a + title = temp.string + url = self.index + temp['href'] + date = '' #i.find(attrs={'class':'views-field-created'}).span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Wiadomości", self.find_articles('http://dzieje.pl/wiadomosci'))) + feeds.append((u"Kultura i sztuka", self.find_articles('http://dzieje.pl/kulturaisztuka'))) + feeds.append((u"Film", self.find_articles('http://dzieje.pl/kino'))) + feeds.append((u"Rozmaitości historyczne", self.find_articles('http://dzieje.pl/rozmaitości'))) + feeds.append((u"Książka", self.find_articles('http://dzieje.pl/ksiazka'))) + feeds.append((u"Wystawa", self.find_articles('http://dzieje.pl/wystawa'))) + feeds.append((u"Edukacja", self.find_articles('http://dzieje.pl/edukacja'))) + feeds.append((u"Dzieje się", self.find_articles('http://dzieje.pl/wydarzenia'))) + return feeds def preprocess_html(self, soup): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] + self.append_page(soup, soup.body) return soup \ No newline at end of file diff --git a/recipes/dziennik_polski.recipe b/recipes/dziennik_polski.recipe index 83b9d06ecd..253bda2ebe 100644 --- a/recipes/dziennik_polski.recipe +++ b/recipes/dziennik_polski.recipe @@ -116,7 +116,7 @@ class DziennikPolski24(BasicNewsRecipe): loop=True def get_browser(self): - br=BasicNewsRecipe.get_browser() + br=BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.dziennikpolski24.pl/pl/moje-konto/950606-loguj.html') br.select_form(nr = 1) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index 25e46892f8..d7af32a243 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -41,10 +41,11 @@ class Economist(BasicNewsRecipe): remove_tags = [ dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), dict(attrs={'class':['dblClkTrk', 'ec-article-info', - 
'share_inline_header', 'related-items']}), + 'share_inline_header', 'related-items', + 'main-content-container']}), {'class': lambda x: x and 'share-links-header' in x}, ] - keep_only_tags = [dict(id='ec-article-body')] + keep_only_tags = [dict(name='article')] no_stylesheets = True preprocess_regexps = [(re.compile('.*', re.DOTALL), lambda x:'')] @@ -56,7 +57,7 @@ class Economist(BasicNewsRecipe): needs_subscription = False ''' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username and self.password: br.open('http://www.economist.com/user/login') br.select_form(nr=1) @@ -70,18 +71,6 @@ class Economist(BasicNewsRecipe): return br ''' - def get_cover_url(self): - soup = self.index_to_soup('http://www.economist.com/printedition/covers') - div = soup.find('div', attrs={'class':lambda x: x and - 'print-cover-links' in x}) - a = div.find('a', href=True) - url = a.get('href') - if url.startswith('/'): - url = 'http://www.economist.com' + url - soup = self.index_to_soup(url) - div = soup.find('div', attrs={'class':'cover-content'}) - img = div.find('img', src=True) - return img.get('src') def parse_index(self): return self.economist_parse_index() @@ -92,7 +81,7 @@ class Economist(BasicNewsRecipe): if div is not None: img = div.find('img', src=True) if img is not None: - self.cover_url = img['src'] + self.cover_url = re.sub('thumbnail','full',img['src']) feeds = OrderedDict() for section in soup.findAll(attrs={'class':lambda x: x and 'section' in x}): diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index a64310c252..d7af32a243 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag, NavigableString from collections import OrderedDict -import time, re +import re class Economist(BasicNewsRecipe): @@ -37,16 +37,15 @@ class Economist(BasicNewsRecipe): padding: 7px 0px 9px; } ''' - oldest_article = 7.0 remove_tags = [ dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), dict(attrs={'class':['dblClkTrk', 'ec-article-info', - 'share_inline_header', 'related-items']}), + 'share_inline_header', 'related-items', + 'main-content-container']}), {'class': lambda x: x and 'share-links-header' in x}, ] - keep_only_tags = [dict(id='ec-article-body')] - needs_subscription = False + keep_only_tags = [dict(name='article')] no_stylesheets = True preprocess_regexps = [(re.compile('.*', re.DOTALL), lambda x:'')] @@ -55,28 +54,26 @@ class Economist(BasicNewsRecipe): # downloaded with connection reset by peer (104) errors. delay = 1 - def get_cover_url(self): - soup = self.index_to_soup('http://www.economist.com/printedition/covers') - div = soup.find('div', attrs={'class':lambda x: x and - 'print-cover-links' in x}) - a = div.find('a', href=True) - url = a.get('href') - if url.startswith('/'): - url = 'http://www.economist.com' + url - soup = self.index_to_soup(url) - div = soup.find('div', attrs={'class':'cover-content'}) - img = div.find('img', src=True) - return img.get('src') + needs_subscription = False + ''' + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + if self.username and self.password: + br.open('http://www.economist.com/user/login') + br.select_form(nr=1) + br['name'] = self.username + br['pass'] = self.password + res = br.submit() + raw = res.read() + if '>Log out<' not in raw: + raise ValueError('Failed to login to economist.com. 
' + 'Check your username and password.') + return br + ''' + def parse_index(self): - try: - return self.economist_parse_index() - except: - raise - self.log.warn( - 'Initial attempt to parse index failed, retrying in 30 seconds') - time.sleep(30) - return self.economist_parse_index() + return self.economist_parse_index() def economist_parse_index(self): soup = self.index_to_soup(self.INDEX) @@ -84,7 +81,7 @@ class Economist(BasicNewsRecipe): if div is not None: img = div.find('img', src=True) if img is not None: - self.cover_url = img['src'] + self.cover_url = re.sub('thumbnail','full',img['src']) feeds = OrderedDict() for section in soup.findAll(attrs={'class':lambda x: x and 'section' in x}): @@ -151,154 +148,3 @@ class Economist(BasicNewsRecipe): div.insert(2, img) table.replaceWith(div) return soup - -''' -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.utils.threadpool import ThreadPool, makeRequests -from calibre.ebooks.BeautifulSoup import Tag, NavigableString -import time, string, re -from datetime import datetime -from lxml import html - -class Economist(BasicNewsRecipe): - - title = 'The Economist (RSS)' - language = 'en' - - __author__ = "Kovid Goyal" - description = ('Global news and current affairs from a European' - ' perspective. Best downloaded on Friday mornings (GMT).' - ' Much slower than the print edition based version.') - extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }' - oldest_article = 7.0 - cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg' - #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg' - remove_tags = [ - dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), - dict(attrs={'class':['dblClkTrk', 'ec-article-info', - 'share_inline_header', 'related-items']}), - {'class': lambda x: x and 'share-links-header' in x}, - ] - keep_only_tags = [dict(id='ec-article-body')] - no_stylesheets = True - preprocess_regexps = [(re.compile('.*', re.DOTALL), - lambda x:'')] - - def parse_index(self): - from calibre.web.feeds.feedparser import parse - if self.test: - self.oldest_article = 14.0 - raw = self.index_to_soup( - 'http://feeds.feedburner.com/economist/full_print_edition', - raw=True) - entries = parse(raw).entries - pool = ThreadPool(10) - self.feed_dict = {} - requests = [] - for i, item in enumerate(entries): - title = item.get('title', _('Untitled article')) - published = item.date_parsed - if not published: - published = time.gmtime() - utctime = datetime(*published[:6]) - delta = datetime.utcnow() - utctime - if delta.days*24*3600 + delta.seconds > 24*3600*self.oldest_article: - self.log.debug('Skipping article %s as it is too old.'%title) - continue - link = item.get('link', None) - description = item.get('description', '') - author = item.get('author', '') - - requests.append([i, link, title, description, author, published]) - if self.test: - requests = requests[:4] - requests = makeRequests(self.process_eco_feed_article, requests, self.eco_article_found, - self.eco_article_failed) - for r in requests: pool.putRequest(r) - pool.wait() - - return self.eco_sort_sections([(t, a) for t, a in - self.feed_dict.items()]) - - def eco_sort_sections(self, feeds): - if not feeds: - raise ValueError('No new articles found') - order = { - 'The World This Week': 1, - 'Leaders': 2, - 'Letters': 3, - 'Briefing': 4, - 'Business': 5, - 'Finance And Economics': 6, - 'Science & 
Technology': 7, - 'Books & Arts': 8, - 'International': 9, - 'United States': 10, - 'Asia': 11, - 'Europe': 12, - 'The Americas': 13, - 'Middle East & Africa': 14, - 'Britain': 15, - 'Obituary': 16, - } - return sorted(feeds, cmp=lambda x,y:cmp(order.get(x[0], 100), - order.get(y[0], 100))) - - def process_eco_feed_article(self, args): - from calibre import browser - i, url, title, description, author, published = args - br = browser() - ret = br.open(url) - raw = ret.read() - url = br.geturl().split('?')[0]+'/print' - root = html.fromstring(raw) - matches = root.xpath('//*[@class = "ec-article-info"]') - feedtitle = 'Miscellaneous' - if matches: - feedtitle = string.capwords(html.tostring(matches[-1], method='text', - encoding=unicode).split('|')[-1].strip()) - return (i, feedtitle, url, title, description, author, published) - - def eco_article_found(self, req, result): - from calibre.web.feeds import Article - i, feedtitle, link, title, description, author, published = result - self.log('Found print version for article:', title, 'in', feedtitle, - 'at', link) - - a = Article(i, title, link, author, description, published, '') - - article = dict(title=a.title, description=a.text_summary, - date=time.strftime(self.timefmt, a.date), author=a.author, url=a.url) - if feedtitle not in self.feed_dict: - self.feed_dict[feedtitle] = [] - self.feed_dict[feedtitle].append(article) - - def eco_article_failed(self, req, tb): - self.log.error('Failed to download %s with error:'%req.args[0][2]) - self.log.debug(tb) - - def eco_find_image_tables(self, soup): - for x in soup.findAll('table', align=['right', 'center']): - if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1: - yield x - - def postprocess_html(self, soup, first): - body = soup.find('body') - for name, val in body.attrs: - del body[name] - for table in list(self.eco_find_image_tables(soup)): - caption = table.find('font') - img = table.find('img') - div = Tag(soup, 'div') - div['style'] = 'text-align:left;font-size:70%' - ns = NavigableString(self.tag_to_string(caption)) - div.insert(0, ns) - div.insert(1, Tag(soup, 'br')) - img.extract() - del img['width'] - del img['height'] - div.insert(2, img) - table.replaceWith(div) - return soup -''' - diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe index d3fdbc84f3..33d42d198e 100644 --- a/recipes/edmonton_journal.recipe +++ b/recipes/edmonton_journal.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe new file mode 100644 index 0000000000..21d3b607d2 --- /dev/null +++ b/recipes/ekologia_pl.recipe @@ -0,0 +1,25 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from calibre.web.feeds.news import BasicNewsRecipe +import re +class EkologiaPl(BasicNewsRecipe): + title = u'Ekologia.pl' + __author__ = 'fenuks' + description = u'Portal ekologiczny - eko, ekologia, ochrona przyrody, ochrona środowiska, przyroda, 
środowisko online. Ekologia i ochrona środowiska. Ekologia dla dzieci.' + category = 'ecology' + language = 'pl' + cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png' + ignore_duplicate_articles = {'title', 'url'} + extra_css = '.title {font-size: 200%;}' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] + + feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] + + def print_version(self, url): + id = re.search(r',(?P\d+)\.html', url).group('id') + return 'http://drukuj.ekologia.pl/artykul/' + id diff --git a/recipes/el_correo.recipe b/recipes/el_correo.recipe index 9190560b02..110c19d7ba 100644 --- a/recipes/el_correo.recipe +++ b/recipes/el_correo.recipe @@ -73,7 +73,7 @@ class heraldo(BasicNewsRecipe): #[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url] cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/el_diplo.recipe b/recipes/el_diplo.recipe new file mode 100644 index 0000000000..b9ef8268e1 --- /dev/null +++ b/recipes/el_diplo.recipe @@ -0,0 +1,118 @@ +# Copyright 2013 Tomás Di Domenico +# +# This is a news fetching recipe for the Calibre ebook software, for +# fetching the Cono Sur edition of Le Monde Diplomatique (www.eldiplo.org). +# +# This recipe is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this recipe. If not, see . 
+ +import re +from contextlib import closing +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ptempfile import PersistentTemporaryFile +from calibre.utils.magick import Image + +class ElDiplo_Recipe(BasicNewsRecipe): + title = u'El Diplo' + __author__ = 'Tomas Di Domenico' + description = 'Publicacion mensual de Le Monde Diplomatique, edicion Argentina' + langauge = 'es_AR' + needs_subscription = True + auto_cleanup = True + + def get_cover(self,url): + tmp_cover = PersistentTemporaryFile(suffix = ".jpg", prefix = "eldiplo_") + self.cover_url = tmp_cover.name + + with closing(self.browser.open(url)) as r: + imgdata = r.read() + + img = Image() + img.load(imgdata) + img.crop(img.size[0],img.size[1]/2,0,0) + + img.save(tmp_cover.name) + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + if self.username is not None and self.password is not None: + br.open('http://www.eldiplo.org/index.php/login/-/do_login/index.html') + br.select_form(nr=3) + br['uName'] = self.username + br['uPassword'] = self.password + br.submit() + self.browser = br + return br + + def parse_index(self): + default_sect = 'General' + articles = {default_sect:[]} + ans = [default_sect] + sectionsmarker = 'DOSSIER_TITLE: ' + sectionsre = re.compile('^'+sectionsmarker) + + soup = self.index_to_soup('http://www.eldiplo.org/index.php') + + coverdivs = soup.findAll(True,attrs={'id':['lmd-foto']}) + a = coverdivs[0].find('a', href=True) + coverurl = a['href'].split("?imagen=")[1] + self.get_cover(coverurl) + + thedivs = soup.findAll(True,attrs={'class':['lmd-leermas']}) + for div in thedivs: + a = div.find('a', href=True) + if 'Sumario completo' in self.tag_to_string(a, use_alt=True): + summaryurl = re.sub(r'\?.*', '', a['href']) + summaryurl = 'http://www.eldiplo.org' + summaryurl + + for pagenum in xrange(1,10): + soup = self.index_to_soup('{0}/?cms1_paging_p_b32={1}'.format(summaryurl,pagenum)) + thedivs = soup.findAll(True,attrs={'class':['interna']}) + + if len(thedivs) == 0: + break + + for div in thedivs: + section = div.find(True,text=sectionsre).replace(sectionsmarker,'') + if section == '': + section = default_sect + + if section not in articles.keys(): + articles[section] = [] + ans.append(section) + + nota = div.find(True,attrs={'class':['lmd-pl-titulo-nota-dossier']}) + a = nota.find('a', href=True) + if not a: + continue + + url = re.sub(r'\?.*', '', a['href']) + url = 'http://www.eldiplo.org' + url + title = self.tag_to_string(a, use_alt=True).strip() + + summary = div.find(True, attrs={'class':'lmd-sumario-descript'}).find('p') + if summary: + description = self.tag_to_string(summary, use_alt=False) + + aut = div.find(True, attrs={'class':'lmd-autor-sumario'}) + if aut: + auth = self.tag_to_string(aut, use_alt=False).strip() + + if not articles.has_key(section): + articles[section] = [] + + articles[section].append(dict(title=title,author=auth,url=url,date=None,description=description,content='')) + + #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2}) + ans = [(section, articles[section]) for section in ans if articles.has_key(section)] + return ans diff --git a/recipes/el_mundo_today.recipe b/recipes/el_mundo_today.recipe index 7f558d10e7..010596b6e5 100644 --- a/recipes/el_mundo_today.recipe +++ b/recipes/el_mundo_today.recipe @@ -3,29 +3,34 @@ from calibre.web.feeds.news import BasicNewsRecipe class ElMundoTodayRecipe(BasicNewsRecipe): title = 'El Mundo Today' - __author__ = 'atordo' - description = u'La actualidad del mañana' + 
description = u'La actualidad del ma\u00f1ana' category = 'Noticias, humor' cover_url = 'http://www.elmundotoday.com/wp-content/themes/EarthlyTouch/images/logo.png' - oldest_article = 30 + oldest_article = 15 max_articles_per_feed = 60 auto_cleanup = False no_stylesheets = True remove_javascript = True language = 'es' use_embedded_content = False + publication_type = 'blog' preprocess_regexps = [ (re.compile(r'.*', re.DOTALL), - lambda match: ''), - #(re.compile(r'^\t{5}$'), lambda match: ''), - #(re.compile(r'\t{5}$'), lambda match: ''), - (re.compile(r'
', re.DOTALL), - lambda match: ''), + lambda match: ''), + (re.compile(r''), + lambda match: ''), + (re.compile(r'
.*', re.DOTALL), + lambda match: '') ] keep_only_tags = [ - dict(name='div', attrs={'class':'post-wrapper'}) + dict(name='div', attrs={'class':'post-wrapper '}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':'social4i'}), + dict(name='span', attrs={'class':'num-comentarios'}) ] remove_attributes = [ 'href', 'title', 'alt' ] @@ -36,8 +41,3 @@ class ElMundoTodayRecipe(BasicNewsRecipe): ''' feeds = [('El Mundo Today', 'http://www.elmundotoday.com/feed/')] - - def get_broser(self): - br = BasicNewsRecipe.get_browser(self) - br.set_handle_gzip(True) - return br diff --git a/recipes/elet_es_irodalom.recipe b/recipes/elet_es_irodalom.recipe index 944096547e..ea259e27ad 100644 --- a/recipes/elet_es_irodalom.recipe +++ b/recipes/elet_es_irodalom.recipe @@ -26,7 +26,7 @@ class elet_es_irodalom(BasicNewsRecipe): #Nem ide a kódba kell beleírni a hozzáférés adatait, hanem azt akkor adod meg, ha le akarod tölteni! def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.es.hu/') br.select_form(name='userfrmlogin') diff --git a/recipes/elguardian.recipe b/recipes/elguardian.recipe new file mode 100644 index 0000000000..f5d035dd21 --- /dev/null +++ b/recipes/elguardian.recipe @@ -0,0 +1,93 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +elguardian.com.ar +''' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ElGuardian(BasicNewsRecipe): + title = 'El Guardian' + __author__ = 'Darko Miletic' + description = "Semanario con todas las tendencias de un pais" + publisher = 'Editorial Apache SA' + category = 'news,politics,Argentina' + oldest_article = 8 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'es_AR' + remove_empty_feeds = True + publication_type = 'magazine' + issn = '1666-7476' + masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png' + extra_css = """ + body{font-family: Arial,sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'series' : title + , 'isbn' : issn + } + + keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})] + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [ + (u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' ) + ,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' ) + ,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' ) + ,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' ) + ,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' ) + ,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' ) + ,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' ) + ,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' ) + ,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' ) + ,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' ) + ,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml') + ,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' ) + ,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' ) + ] + + def get_cover_url(self): + soup = self.index_to_soup('http://elguardian.com.ar/') + udata = soup.find('div', attrs={'class':'datosNumero'}) + if udata: + sdata = udata.find('div') 
+ if sdata: + stra = re.findall(r'\d+', self.tag_to_string(sdata)) + self.conversion_options.update({'series_index':int(stra[1])}) + unumero = soup.find('div', attrs={'class':'ultimoNumero'}) + if unumero: + img = unumero.find('img', src=True) + if img: + return img['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/elmundo.recipe b/recipes/elmundo.recipe index 4f04f68575..c38f941ef2 100644 --- a/recipes/elmundo.recipe +++ b/recipes/elmundo.recipe @@ -116,7 +116,7 @@ class ElMundo(BasicNewsRecipe): day = "%.2d" % st.tm_mday #http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg cover='http://img.kiosko.net/'+ year + '/' + month + '/' + day +'/es/elmundo.750.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/empire_magazine.recipe b/recipes/empire_magazine.recipe index 138b7bffd1..2d7a574dde 100644 --- a/recipes/empire_magazine.recipe +++ b/recipes/empire_magazine.recipe @@ -5,6 +5,7 @@ class AdvancedUserRecipe1341650280(BasicNewsRecipe): title = u'Empire Magazine' description = 'Author D.Asbury. Film articles from Empire Mag. ' + language = 'en' __author__ = 'Dave Asbury' # last updated 7/7/12 remove_empty_feeds = True @@ -15,7 +16,7 @@ class AdvancedUserRecipe1341650280(BasicNewsRecipe): cover_url = 'http://www.empireonline.com/images/magazine/cover.jpg' conversion_options = { 'linearize_tables' : True, - } + } #auto_cleanup = True preprocess_regexps = [ (re.compile(r'Przeczytaj także:.*', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'
Artykuł
', re.IGNORECASE), lambda m: ''), (re.compile(ur'
Ludzie filmu
', re.IGNORECASE), lambda m: '')] + remove_tags = [dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']})] + feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')] diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe index 01d7514e0d..6b014e8f93 100644 --- a/recipes/film_web.recipe +++ b/recipes/film_web.recipe @@ -17,6 +17,7 @@ class FilmWebPl(BasicNewsRecipe): preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')] extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})] + remove_attributes = ['style',] keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})] feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), (u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), @@ -50,4 +51,9 @@ class FilmWebPl(BasicNewsRecipe): for i in soup.findAll('sup'): if not i.string or i.string.startswith('(kliknij'): i.extract() + tag = soup.find(name='ul', attrs={'class':'inline sep-line'}) + if tag: + tag.name = 'div' + for t in tag.findAll('li'): + t.name = 'div' return soup diff --git a/recipes/financial_times.recipe b/recipes/financial_times.recipe index 0079b2be3a..3588a19008 100644 --- a/recipes/financial_times.recipe +++ b/recipes/financial_times.recipe @@ -34,7 +34,7 @@ class FinancialTimes_rss(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) if self.username is not None and self.password is not None: br.open(self.LOGIN) diff --git a/recipes/financial_times_uk.recipe b/recipes/financial_times_uk.recipe index 6af000d990..f7a63fbb18 100644 --- a/recipes/financial_times_uk.recipe +++ b/recipes/financial_times_uk.recipe @@ -40,7 +40,7 @@ class FinancialTimes(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) if self.username is not None and self.password is not None: br.open(self.LOGIN2) diff --git a/recipes/fleshbot.recipe b/recipes/fleshbot.recipe index 0a56e42795..0059d8855d 100644 --- a/recipes/fleshbot.recipe +++ b/recipes/fleshbot.recipe @@ -18,7 +18,7 @@ class Fleshbot(BasicNewsRecipe): encoding = 'utf-8' use_embedded_content = True language = 'en' - masthead_url = 'http://cache.gawkerassets.com/assets/kotaku.com/img/logo.png' + masthead_url = 'http://fbassets.s3.amazonaws.com/images/uploads/2012/01/fleshbot-logo.png' extra_css = ''' body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif} img{margin-bottom: 1em} @@ -31,7 +31,7 @@ class Fleshbot(BasicNewsRecipe): , 'language' : language } - feeds = [(u'Articles', u'http://feeds.gawker.com/fleshbot/vip?format=xml')] + feeds = [(u'Articles', 
u'http://www.fleshbot.com/feed')] remove_tags = [ {'class': 'feedflare'}, diff --git a/recipes/fluter_de.recipe b/recipes/fluter_de.recipe index 1f8576cf81..18ea8e703e 100644 --- a/recipes/fluter_de.recipe +++ b/recipes/fluter_de.recipe @@ -14,26 +14,17 @@ class AdvancedUserRecipe1313693926(BasicNewsRecipe): language = 'de' encoding = 'UTF-8' - __author__ = 'Armin Geller' # 2011-08-19 + __author__ = 'Armin Geller' # 2013-02-05 V3 oldest_article = 7 max_articles_per_feed = 50 - - remove_tags = [ - dict(name='div', attrs={'id':["comments"]}), - dict(attrs={'class':['commentlink']}), - ] - - - keep_only_tags = [ - dict(name='div', attrs={'class':["grid_8 articleText"]}), - dict(name='div', attrs={'class':["articleTextInnerText"]}), - ] - feeds = [ (u'Inhalt:', u'http://www.fluter.de/de/?tpl=907'), ] extra_css = '.cs_img {margin-right: 10pt;}' + def print_version(self, url): + return url + '?tpl=1260' + diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe index cd444b4682..238310edc1 100644 --- a/recipes/folhadesaopaulo_sub.recipe +++ b/recipes/folhadesaopaulo_sub.recipe @@ -40,7 +40,7 @@ class FSP(BasicNewsRecipe): re.DOTALL|re.IGNORECASE), lambda match: r'')] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://acesso.uol.com.br/login.html') br.form = br.forms().next() diff --git a/recipes/foreignaffairs.recipe b/recipes/foreignaffairs.recipe index 6b36170288..370f6d53e3 100644 --- a/recipes/foreignaffairs.recipe +++ b/recipes/foreignaffairs.recipe @@ -11,21 +11,21 @@ class ForeignAffairsRecipe(BasicNewsRecipe): by Chen Wei weichen302@gmx.com, 2012-02-05''' __license__ = 'GPL v3' - __author__ = 'kwetal' + __author__ = 'Rick Shang, kwetal' language = 'en' version = 1.01 - title = u'Foreign Affairs (Subcription or (free) Registration)' + title = u'Foreign Affairs (Subcription)' publisher = u'Council on Foreign Relations' category = u'USA, Foreign Affairs' description = u'The leading forum for serious discussion of American foreign policy and international affairs.' 
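Many hunks in this patch switch recipes from BasicNewsRecipe.get_browser() to BasicNewsRecipe.get_browser(self), and the Foreign Affairs recipe in this hunk (continued just below) also gains needs_subscription handling. A rough, generic sketch of that login pattern; the URL, form index and field names here are placeholders, not the real foreignaffairs.com form:

    from calibre.web.feeds.news import BasicNewsRecipe

    class SubscriptionRecipeSketch(BasicNewsRecipe):
        title = 'Example subscription recipe'
        needs_subscription = True
        LOGIN = 'http://example.com/login'      # placeholder login URL

        def get_browser(self):
            # call the base implementation on the instance, as the patch does everywhere
            br = BasicNewsRecipe.get_browser(self)
            if self.username is not None and self.password is not None:
                br.open(self.LOGIN)
                br.select_form(nr=0)            # placeholder: first form on the page
                br['username'] = self.username  # placeholder field names
                br['password'] = self.password
                br.submit()
            return br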
no_stylesheets = True remove_javascript = True + needs_subscription = True INDEX = 'http://www.foreignaffairs.com' FRONTPAGE = 'http://www.foreignaffairs.com/magazine' - INCLUDE_PREMIUM = False remove_tags = [] @@ -68,43 +68,57 @@ class ForeignAffairsRecipe(BasicNewsRecipe): def parse_index(self): + answer = [] soup = self.index_to_soup(self.FRONTPAGE) - sec_start = soup.findAll('div', attrs={'class':'panel-separator'}) + #get dates + date = re.split('\s\|\s',self.tag_to_string(soup.head.title.string))[0] + self.timefmt = u' [%s]'%date + + sec_start = soup.findAll('div', attrs= {'class':'panel-pane'}) for sec in sec_start: - content = sec.nextSibling - if content: - section = self.tag_to_string(content.find('h2')) - articles = [] - - tags = [] - for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}): - tags.append(div) - for li in content.findAll('li'): - tags.append(li) - - for div in tags: - title = url = description = author = None - - if self.INCLUDE_PREMIUM: - found_premium = False - else: - found_premium = div.findAll('span', attrs={'class': - 'premium-icon'}) - if not found_premium: - tag = div.find('div', attrs={'class': 'views-field-title'}) - - if tag: - a = tag.find('a') - if a: - title = self.tag_to_string(a) - url = self.INDEX + a['href'] - author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})) - tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'}) - description = self.tag_to_string(tag_summary) - articles.append({'title':title, 'date':None, 'url':url, - 'description':description, 'author':author}) - if articles: + articles = [] + section = self.tag_to_string(sec.find('h2')) + if 'Books' in section: + reviewsection=sec.find('div', attrs = {'class': 'item-list'}) + for subsection in reviewsection.findAll('div'): + subsectiontitle=self.tag_to_string(subsection.span.a) + subsectionurl=self.INDEX + subsection.span.a['href'] + soup1 = self.index_to_soup(subsectionurl) + for div in soup1.findAll('div', attrs = {'class': 'views-field-title'}): + if div.find('a') is not None: + originalauthor=self.tag_to_string(div.findNext('div', attrs = {'class':'views-field-field-article-book-nid'}).div.a) + title=subsectiontitle+': '+self.tag_to_string(div.span.a)+' by '+originalauthor + url=self.INDEX+div.span.a['href'] + atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'}) + if atr is not None: + author=self.tag_to_string(atr.span.a) + else: + author='' + desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'}) + if desc is not None: + description=self.tag_to_string(desc.div.p) + else: + description='' + articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author}) + subsectiontitle='' + else: + for div in sec.findAll('div', attrs = {'class': 'views-field-title'}): + if div.find('a') is not None: + title=self.tag_to_string(div.span.a) + url=self.INDEX+div.span.a['href'] + atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'}) + if atr is not None: + author=self.tag_to_string(atr.span.a) + else: + author='' + desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'}) + if desc is not None: + description=self.tag_to_string(desc.div.p) + else: + description='' + articles.append({'title':title, 'date':None, 'url':url, 'description':description, 
'author':author}) + if articles: answer.append((section, articles)) return answer @@ -115,15 +129,17 @@ class ForeignAffairsRecipe(BasicNewsRecipe): return soup - needs_subscription = True + def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: - br.open('https://www.foreignaffairs.com/user?destination=home') + br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo') br.select_form(nr = 1) br['name'] = self.username br['pass'] = self.password br.submit() return br + def cleanup(self): + self.browser.open('http://www.foreignaffairs.com/logout?destination=user%3Fop=lo') diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe index def57203e4..525cf6c605 100644 --- a/recipes/gildia_pl.recipe +++ b/recipes/gildia_pl.recipe @@ -4,9 +4,10 @@ import re class Gildia(BasicNewsRecipe): title = u'Gildia.pl' __author__ = 'fenuks' - description = 'Gildia - cultural site' + description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!' cover_url = 'http://www.film.gildia.pl/_n_/portal/redakcja/logo/logo-gildia.pl-500.jpg' category = 'culture' + cover_url = 'http://gildia.pl/images/logo-main.png' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 @@ -23,10 +24,13 @@ class Gildia(BasicNewsRecipe): content = soup.find('div', attrs={'class':'news'}) if 'recenzj' in soup.title.string.lower(): for link in content.findAll(name='a'): - if 'recenzj' in link['href']: - self.log.warn('odnosnik') - self.log.warn(link['href']) + if 'recenzj' in link['href'] or 'muzyka/plyty' in link['href']: return self.index_to_soup(link['href'], raw=True) + if 'fragmen' in soup.title.string.lower(): + for link in content.findAll(name='a'): + if 'fragment' in link['href']: + return self.index_to_soup(link['href'], raw=True) + def preprocess_html(self, soup): for a in soup('a'): diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe index a7c78887c5..9ee3c6bd81 100644 --- a/recipes/globe_and_mail.recipe +++ b/recipes/globe_and_mail.recipe @@ -21,6 +21,10 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe): encoding = 'utf8' publisher = 'Globe & Mail' language = 'en_CA' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}' feeds = [ @@ -44,12 +48,12 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe): (re.compile(r'', re.DOTALL), lambda m: ''), ] - remove_tags_before = dict(name='h1') - remove_tags = [ - dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), - dict(href=lambda x: x and 'tracking=' in x), - {'class':['articleTools', 'pagination', 'Ads', 'topad', - 'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] + #remove_tags_before = dict(name='h1') + #remove_tags = [ + #dict(name='div', attrs={'id':['ShareArticles', 'topStories']}), + #dict(href=lambda x: x and 'tracking=' in x), + #{'class':['articleTools', 'pagination', 'Ads', 'topad', + #'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}] def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe index 79157630f5..3852f65d32 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -1,19 +1,20 @@ from calibre.web.feeds.news import 
BasicNewsRecipe - +from calibre.ebooks.BeautifulSoup import BeautifulSoup class Gram_pl(BasicNewsRecipe): title = u'Gram.pl' __author__ = 'fenuks' - description = 'Gram.pl - site about computer games' + description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.' category = 'games' language = 'pl' oldest_article = 8 index='http://www.gram.pl' max_articles_per_feed = 100 + ignore_duplicate_articles = {'title', 'url'} no_stylesheets= True - extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' + #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' - remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent', 'sharedaddy sd-sharing-enabled']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])] - keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')] + keep_only_tags= [dict(id='articleModule')] + remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})] feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'), (u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'), @@ -28,35 +29,21 @@ class Gram_pl(BasicNewsRecipe): feed.articles.remove(article) return feeds - def append_page(self, soup, appendtag): - nexturl = appendtag.find('a', attrs={'class':'cpn'}) - while nexturl: - soup2 = self.index_to_soup('http://www.gram.pl'+ nexturl['href']) - r=appendtag.find(id='pgbox') - if r: - r.extract() - pagetext = soup2.find(attrs={'class':'main'}) - r=pagetext.find('h1') - if r: - r.extract() - r=pagetext.find('h2') - if r: - r.extract() - for r in pagetext.findAll('script'): - r.extract() - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - nexturl = appendtag.find('a', attrs={'class':'cpn'}) - r=appendtag.find(id='pgbox') - if r: - r.extract() def preprocess_html(self, soup): - self.append_page(soup, soup.body) - tag=soup.findAll(name='div', attrs={'class':'picbox'}) - for t in tag: - t['style']='float: left;' + tag=soup.find(name='div', attrs={'class':'summary'}) + if tag: + tag.find(attrs={'class':'pros'}).insert(0, BeautifulSoup('
<h2>Plusy:</h2>').h2) + tag.find(attrs={'class':'cons'}).insert(0, BeautifulSoup('<h2>Minusy:</h2>').h2) + tag = soup.find(name='section', attrs={'class':'cenzurka'}) + if tag: + rate = tag.p.img['data-ocena'] + tag.p.img.extract() + tag.p.insert(len(tag.p.contents)-2, BeautifulSoup('<h2>Ocena: {0}</h2>
'.format(rate)).h2) for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] + tag=soup.find(name='span', attrs={'class':'platforma'}) + if tag: + tag.name = 'p' return soup diff --git a/recipes/haaretz_en.recipe b/recipes/haaretz_en.recipe index ade32ae5ea..0856621d38 100644 --- a/recipes/haaretz_en.recipe +++ b/recipes/haaretz_en.recipe @@ -65,7 +65,7 @@ class Haaretz_en(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.PREFIX) if self.username is not None and self.password is not None: data = urllib.urlencode({ 'cb':'parseEngReply' diff --git a/recipes/harpers.recipe b/recipes/harpers.recipe index a4576792d0..18e75dce6e 100644 --- a/recipes/harpers.recipe +++ b/recipes/harpers.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2010, Darko Miletic ' +__copyright__ = '2008-2012, Darko Miletic ' ''' harpers.org ''' @@ -16,6 +16,7 @@ class Harpers(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif' conversion_options = { 'comment' : description @@ -31,27 +32,9 @@ class Harpers(BasicNewsRecipe): .caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;} ''' - keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] - remove_tags = [ - dict(name='table', attrs={'class':['rcnt','rcnt topline']}) - ,dict(name=['link','object','embed','meta','base']) - ] + keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull', 'articlePost']}) ] + remove_tags = [dict(name=['link','object','embed','meta','base'])] remove_attributes = ['width','height'] - feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')] + feeds = [(u"Harper's Magazine", u'http://harpers.org/feed/')] - def get_cover_url(self): - cover_url = None - index = 'http://harpers.org/' - soup = self.index_to_soup(index) - link_item = soup.find(name = 'img',attrs= {'class':"cover"}) - if link_item: - cover_url = 'http://harpers.org' + link_item['src'] - return cover_url - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(xmlns=True): - del item['xmlns'] - return soup diff --git a/recipes/harpers_full.recipe b/recipes/harpers_full.recipe index ff558e9c5b..a63f828968 100644 --- a/recipes/harpers_full.recipe +++ b/recipes/harpers_full.recipe @@ -1,18 +1,22 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2010, Darko Miletic ' +__copyright__ = '2008-2012, Darko Miletic ' ''' harpers.org - paid subscription/ printed issue articles This recipe only get's article's published in text format images and pdf's are ignored +If you have institutional subscription based on access IP you do not need to enter +anything in username/password fields ''' +import time, re +import urllib from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class Harpers_full(BasicNewsRecipe): title = "Harper's Magazine - articles from printed edition" __author__ = 'Darko Miletic' - description = "Harper's Magazine: Founded June 1850." + description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." 
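The reworked Harper's recipe further down in this hunk logs in by POSTing the credentials straight to an ajax endpoint instead of filling a login form. A simplified sketch of that approach, assuming a placeholder endpoint and field names (the real ones appear in the recipe below):

    import time
    import urllib

    from calibre.web.feeds.news import BasicNewsRecipe

    class AjaxLoginSketch(BasicNewsRecipe):
        title = 'Example ajax login'
        needs_subscription = 'optional'
        LOGIN = 'http://example.com/ajax_login.php'   # placeholder endpoint

        def get_browser(self):
            br = BasicNewsRecipe.get_browser(self)
            if self.username is not None and self.password is not None:
                data = urllib.urlencode({
                    'm': self.username,
                    'p': self.password,
                    'tt': int(time.time() * 1000),    # millisecond timestamp
                })
                br.open(self.LOGIN, data)             # passing data makes mechanize POST
            return br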
publisher = "Harpers's" category = 'news, politics, USA' oldest_article = 30 @@ -21,52 +25,86 @@ class Harpers_full(BasicNewsRecipe): use_embedded_content = False delay = 1 language = 'en' - needs_subscription = True - masthead_url = 'http://www.harpers.org/media/image/Harpers_305x100.gif' - publication_type = 'magazine' - INDEX = strftime('http://www.harpers.org/archive/%Y/%m') - LOGIN = 'http://www.harpers.org' - cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') - extra_css = ' body{font-family: "Georgia",serif} ' + encoding = 'utf8' + needs_subscription = 'optional' + masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif' + publication_type = 'magazine' + LOGIN = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php' + extra_css = """ + body{font-family: adobe-caslon-pro,serif} + .category{font-size: small} + .articlePost p:first-letter{display: inline; font-size: xx-large; font-weight: bold} + """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language } - keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] + keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull','articlePost']}) ] remove_tags = [ - dict(name='table', attrs={'class':['rcnt','rcnt topline']}) - ,dict(name='link') + dict(name='div', attrs={'class':'fRight rightDivPad'}) + ,dict(name=['link','meta','object','embed','iframe']) ] - remove_attributes=['xmlns'] + remove_attributes=['xmlns'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) + br.open('http://harpers.org/') if self.username is not None and self.password is not None: - br.open(self.LOGIN) - br.select_form(nr=1) - br['handle' ] = self.username - br['password'] = self.password - br.submit() + tt = time.localtime()*1000 + data = urllib.urlencode({ 'm':self.username + ,'p':self.password + ,'rt':'http://harpers.org/' + ,'tt':tt + }) + br.open(self.LOGIN, data) return br def parse_index(self): + #find current issue + soup = self.index_to_soup('http://harpers.org/') + currentIssue=soup.find('div',attrs={'class':'mainNavi'}).find('li',attrs={'class':'curentIssue'}) + currentIssue_url=self.tag_to_string(currentIssue.a['href']) + self.log(currentIssue_url) + + #go to the current issue + soup1 = self.index_to_soup(currentIssue_url) + date = re.split('\s\|\s',self.tag_to_string(soup1.head.title.string))[0] + self.timefmt = u' [%s]'%date + + #get cover + coverurl='http://harpers.org/wp-content/themes/harpers/ajax_microfiche.php?img=harpers-'+re.split('harpers.org/',currentIssue_url)[1]+'gif/0001.gif' + soup2 = self.index_to_soup(coverurl) + self.cover_url = self.tag_to_string(soup2.find('img')['src']) + self.log(self.cover_url) articles = [] - print 'Processing ' + self.INDEX - soup = self.index_to_soup(self.INDEX) - for item in soup.findAll('div', attrs={'class':'title'}): - text_link = item.parent.find('img',attrs={'alt':'Text'}) - if text_link: - url = self.LOGIN + item.a['href'] - title = item.a.contents[0] - date = strftime(' %B %Y') - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':'' - }) - return [(soup.head.title.string, articles)] + count = 0 + for item in soup1.findAll('div', attrs={'class':'articleData'}): + text_links = item.findAll('h2') + for text_link in text_links: + if count == 0: + count = 1 + else: + url = 
text_link.a['href'] + title = text_link.a.contents[0] + date = strftime(' %B %Y') + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':'' + }) + return [(soup1.head.title.string, articles)] + + def print_version(self, url): + return url + '?single=1' + + def cleanup(self): + soup = self.index_to_soup('http://harpers.org/') + signouturl=self.tag_to_string(soup.find('li', attrs={'class':'subLogOut'}).findNext('li').a['href']) + self.log(signouturl) + self.browser.open(signouturl) + diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe index b80b0bace7..d996cf2200 100644 --- a/recipes/hbr.recipe +++ b/recipes/hbr.recipe @@ -11,11 +11,11 @@ class HBR(BasicNewsRecipe): timefmt = ' [%B %Y]' language = 'en' no_stylesheets = True - recipe_disabled = ('hbr.org has started requiring the use of javascript' - ' to log into their website. This is unsupported in calibre, so' - ' this recipe has been disabled. If you would like to see ' - ' HBR supported in calibre, contact hbr.org and ask them' - ' to provide a javascript free login method.') + # recipe_disabled = ('hbr.org has started requiring the use of javascript' + # ' to log into their website. This is unsupported in calibre, so' + # ' this recipe has been disabled. If you would like to see ' + # ' HBR supported in calibre, contact hbr.org and ask them' + # ' to provide a javascript free login method.') LOGIN_URL = 'https://hbr.org/login?request_url=/' LOGOUT_URL = 'https://hbr.org/logout?request_url=/' @@ -38,46 +38,38 @@ class HBR(BasicNewsRecipe): #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;} #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;} ''' + use_javascript_to_login = True - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - self.logout_url = None - - #''' - br.open(self.LOGIN_URL) - br.select_form(name='signin-form') - br['signin-form:username'] = self.username - br['signin-form:password'] = self.password - raw = br.submit().read() - if '>Sign out<' not in raw: - raise Exception('Failed to login, are you sure your username and password are correct?') + def javascript_login(self, br, username, password): + from calibre.web.jsbrowser.browser import Timeout try: - link = br.find_link(text='Sign out') - if link: - self.logout_url = link.absolute_url - except: - self.logout_url = self.LOGOUT_URL - #''' - return br - - def cleanup(self): - if self.logout_url is not None: - self.browser.open(self.logout_url) + br.visit('https://hbr.org/login?request_url=/', timeout=20) + except Timeout: + pass + br.click('#accordion div[tabindex="0"]', wait_for_load=False) + f = br.select_form('#signin-form') + f['signin-form:username'] = username + f['signin-form:password'] = password + br.submit(wait_for_load=False) + br.run_for_a_time(30) def map_url(self, url): if url.endswith('/ar/1'): return url[:-1]+'pr' - def hbr_get_toc(self): - #return self.index_to_soup(open('/t/hbr.html').read()) + # return self.index_to_soup(open('/t/toc.html').read()) today = date.today() future = today + timedelta(days=30) - for x in [x.strftime('%y%m') for x in (future, today)]: + past = today - timedelta(days=30) + for x in [x.strftime('%y%m') for x in (future, today, past)]: url = self.INDEX + x soup = self.index_to_soup(url) - if not soup.find(text='Issue Not Found'): + if (not soup.find(text='Issue Not Found') and not soup.find( + text="We're Sorry. 
There was an error processing your request") + and 'Exception: java.io.FileNotFoundException' not in + unicode(soup)): return soup raise Exception('Could not find current issue') @@ -85,8 +77,9 @@ class HBR(BasicNewsRecipe): feeds = [] current_section = None articles = [] - for x in soup.find(id='archiveToc').findAll(['h3', 'h4']): - if x.name == 'h3': + for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']): + if x.name == 'h4': + if x.get('class', None) == 'basic':continue if current_section is not None and articles: feeds.append((current_section, articles)) current_section = self.tag_to_string(x).capitalize() @@ -102,7 +95,7 @@ class HBR(BasicNewsRecipe): if url.startswith('/'): url = 'http://hbr.org' + url url = self.map_url(url) - p = x.parent.find('p') + p = x.find('p', attrs={'class':'author'}) desc = '' if p is not None: desc = self.tag_to_string(p) @@ -114,10 +107,9 @@ class HBR(BasicNewsRecipe): 'date':''}) return feeds - def parse_index(self): soup = self.hbr_get_toc() - #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) + # open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8')) feeds = self.hbr_parse_toc(soup) return feeds diff --git a/recipes/heise_online.recipe b/recipes/heise_online.recipe index 4d82570698..29f63ce1ac 100644 --- a/recipes/heise_online.recipe +++ b/recipes/heise_online.recipe @@ -15,23 +15,12 @@ class AdvancedUserRecipe(BasicNewsRecipe): timeout = 5 no_stylesheets = True + keep_only_tags = [dict(name='div', attrs={'id':'mitte_news'}), + dict(name='h1', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'meldung_wrapper'})] - remove_tags_after = dict(name ='p', attrs={'class':'editor'}) remove_tags = [dict(id='navi_top_container'), - dict(id='navi_bottom'), - dict(id='mitte_rechts'), - dict(id='navigation'), - dict(id='subnavi'), - dict(id='social_bookmarks'), - dict(id='permalink'), - dict(id='content_foren'), - dict(id='seiten_navi'), - dict(id='adbottom'), - dict(id='sitemap'), - dict(name='div', attrs={'id':'sitemap'}), - dict(name='ul', attrs={'class':'erste_zeile'}), - dict(name='ul', attrs={'class':'zweite_zeile'}), - dict(name='div', attrs={'class':'navi_top_container'})] + dict(name='p', attrs={'class':'size80'})] feeds = [ ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'), @@ -54,5 +43,3 @@ class AdvancedUserRecipe(BasicNewsRecipe): def print_version(self, url): return url + '?view=print' - - diff --git a/recipes/heraldo.recipe b/recipes/heraldo.recipe index b00d3f23c8..aa1a6cf1ee 100644 --- a/recipes/heraldo.recipe +++ b/recipes/heraldo.recipe @@ -53,7 +53,7 @@ class heraldo(BasicNewsRecipe): day = "%.2d" % st.tm_mday #[url]http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf[/url] cover='http://oldorigin-www.heraldo.es/'+ year + month + day +'/primeras/portada_aragon.pdf' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index cc5305eb77..eb84fc4031 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -16,10 +16,14 @@ class TheHindu(BasicNewsRecipe): keep_only_tags = [dict(id='content')] remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}), - dict(id=['email-section', 'right-column', 'printfooter'])] + dict(id=['email-section', 'right-column', 'printfooter', 'topover', + 'slidebox', 'th_footer'])] extra_css = '.photo-caption { font-size: smaller }' + def preprocess_raw_html(self, raw, url): + return raw.replace('

', '

').replace('

', '

') + def postprocess_html(self, soup, first_fetch): for t in soup.findAll(['table', 'tr', 'td','center']): t.name = 'div' diff --git a/recipes/historia_pl.recipe b/recipes/historia_pl.recipe index f3353fe89f..60554c0924 100644 --- a/recipes/historia_pl.recipe +++ b/recipes/historia_pl.recipe @@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class Historia_org_pl(BasicNewsRecipe): title = u'Historia.org.pl' __author__ = 'fenuks' - description = u'history site' + description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.' cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg' category = 'history' language = 'pl' @@ -12,16 +12,15 @@ class Historia_org_pl(BasicNewsRecipe): no_stylesheets = True use_embedded_content = True max_articles_per_feed = 100 + ignore_duplicate_articles = {'title', 'url'} - feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=atom'), - (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=atom'), - (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=atom'), - (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=atom'), - (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=atom'), - (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=atom'), - (u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=atom'), - (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=atom'), - (u'Konkursy'), (u'http://www.historia.org.pl/index.php/konkursy.feed?type=atom')] + + feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'), + (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'), + (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'), + (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'), + (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'), + (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),] def print_version(self, url): diff --git a/recipes/history_today.recipe b/recipes/history_today.recipe index 43adf7a358..9f88976b66 100644 --- a/recipes/history_today.recipe +++ b/recipes/history_today.recipe @@ -21,7 +21,7 @@ class HistoryToday(BasicNewsRecipe): needs_subscription = True def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.historytoday.com/user/login') br.select_form(nr=1) diff --git a/recipes/icons/astroflesz.png b/recipes/icons/astroflesz.png new file mode 100644 index 0000000000..37a9e21675 Binary files /dev/null and b/recipes/icons/astroflesz.png differ diff --git a/recipes/icons/badania_net.png b/recipes/icons/badania_net.png new file mode 100644 index 0000000000..de915de8d1 Binary files /dev/null and b/recipes/icons/badania_net.png differ diff --git a/recipes/icons/czas_gentlemanow.png b/recipes/icons/czas_gentlemanow.png new file mode 100644 index 0000000000..0d20f80c05 Binary files /dev/null and b/recipes/icons/czas_gentlemanow.png differ diff --git a/recipes/icons/ekologia_pl.png b/recipes/icons/ekologia_pl.png new file mode 100644 index 0000000000..797421420d Binary files /dev/null and b/recipes/icons/ekologia_pl.png differ diff --git a/recipes/icons/elguardian.png b/recipes/icons/elguardian.png new 
file mode 100644 index 0000000000..a54b067ee4 Binary files /dev/null and b/recipes/icons/elguardian.png differ diff --git a/recipes/icons/eso_pl.png b/recipes/icons/eso_pl.png new file mode 100644 index 0000000000..4f3319fece Binary files /dev/null and b/recipes/icons/eso_pl.png differ diff --git a/recipes/icons/kurier_galicyjski.png b/recipes/icons/kurier_galicyjski.png new file mode 100644 index 0000000000..4d66a15122 Binary files /dev/null and b/recipes/icons/kurier_galicyjski.png differ diff --git a/recipes/icons/libartes.png b/recipes/icons/libartes.png new file mode 100644 index 0000000000..0954c40273 Binary files /dev/null and b/recipes/icons/libartes.png differ diff --git a/recipes/icons/libertad_digital.png b/recipes/icons/libertad_digital.png new file mode 100644 index 0000000000..83ed5a6dda Binary files /dev/null and b/recipes/icons/libertad_digital.png differ diff --git a/recipes/icons/more_intelligent_life.png b/recipes/icons/more_intelligent_life.png new file mode 100644 index 0000000000..4fcf66e9a1 Binary files /dev/null and b/recipes/icons/more_intelligent_life.png differ diff --git a/recipes/icons/nauka_w_polsce.png b/recipes/icons/nauka_w_polsce.png new file mode 100644 index 0000000000..0d872ce682 Binary files /dev/null and b/recipes/icons/nauka_w_polsce.png differ diff --git a/recipes/icons/osworld_pl.png b/recipes/icons/osworld_pl.png new file mode 100644 index 0000000000..97a7d0dd55 Binary files /dev/null and b/recipes/icons/osworld_pl.png differ diff --git a/recipes/icons/poradnia_pwn.png b/recipes/icons/poradnia_pwn.png new file mode 100644 index 0000000000..6cafb534fc Binary files /dev/null and b/recipes/icons/poradnia_pwn.png differ diff --git a/recipes/icons/pravda_rs.png b/recipes/icons/pravda_rs.png new file mode 100644 index 0000000000..8c4533a79d Binary files /dev/null and b/recipes/icons/pravda_rs.png differ diff --git a/recipes/spiders_web_pl.png b/recipes/icons/spiders_web_pl.png similarity index 100% rename from recipes/spiders_web_pl.png rename to recipes/icons/spiders_web_pl.png diff --git a/recipes/icons/tvp_info.png b/recipes/icons/tvp_info.png new file mode 100644 index 0000000000..1414f38d5c Binary files /dev/null and b/recipes/icons/tvp_info.png differ diff --git a/recipes/icons/ubuntu_pomoc_org.png b/recipes/icons/ubuntu_pomoc_org.png new file mode 100644 index 0000000000..a143846630 Binary files /dev/null and b/recipes/icons/ubuntu_pomoc_org.png differ diff --git a/recipes/icons/wprost_rss.png b/recipes/icons/wprost_rss.png new file mode 100644 index 0000000000..5ce1b5563d Binary files /dev/null and b/recipes/icons/wprost_rss.png differ diff --git a/recipes/icons/zaufana_trzecia_strona.png b/recipes/icons/zaufana_trzecia_strona.png new file mode 100644 index 0000000000..cdaeb95d27 Binary files /dev/null and b/recipes/icons/zaufana_trzecia_strona.png differ diff --git a/recipes/il_messaggero.recipe b/recipes/il_messaggero.recipe index 93c35f4695..02310ff9af 100644 --- a/recipes/il_messaggero.recipe +++ b/recipes/il_messaggero.recipe @@ -28,12 +28,15 @@ class IlMessaggero(BasicNewsRecipe): recursion = 10 remove_javascript = True + extra_css = ' .bianco31lucida{color: black} ' - - keep_only_tags = [dict(name='h1', attrs={'class':'titoloLettura2'}), - dict(name='h2', attrs={'class':'sottotitLettura'}), - dict(name='span', attrs={'class':'testoArticoloG'}) + keep_only_tags = [dict(name='h1', attrs={'class':['titoloLettura2','titoloart','bianco31lucida']}), + dict(name='h2', attrs={'class':['sottotitLettura','grigio16']}), + dict(name='span', 
attrs={'class':'testoArticoloG'}), + dict(name='div', attrs={'id':'testodim'}) ] + + def get_cover_url(self): cover = None st = time.localtime() @@ -41,7 +44,7 @@ class IlMessaggero(BasicNewsRecipe): month = "%.2d" % st.tm_mon day = "%.2d" % st.tm_mday cover='http://carta.ilmessaggero.it/' + year + month + day + '/jpeg/MSGR_20_CITTA_1.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: @@ -55,17 +58,16 @@ class IlMessaggero(BasicNewsRecipe): feeds = [ (u'HomePage', u'http://www.ilmessaggero.it/rss/home.xml'), (u'Primo Piano', u'http://www.ilmessaggero.it/rss/initalia_primopiano.xml'), - (u'Cronaca Bianca', u'http://www.ilmessaggero.it/rss/initalia_cronacabianca.xml'), - (u'Cronaca Nera', u'http://www.ilmessaggero.it/rss/initalia_cronacanera.xml'), (u'Economia e Finanza', u'http://www.ilmessaggero.it/rss/economia.xml'), (u'Politica', u'http://www.ilmessaggero.it/rss/initalia_politica.xml'), - (u'Scienza e Tecnologia', u'http://www.ilmessaggero.it/rss/scienza.xml'), - (u'Cinema', u'http://www.ilmessaggero.it/rss.php?refresh_ce#'), - (u'Viaggi', u'http://www.ilmessaggero.it/rss.php?refresh_ce#'), + (u'Cultura', u'http://www.ilmessaggero.it/rss/cultura.xml'), + (u'Tecnologia', u'http://www.ilmessaggero.it/rss/tecnologia.xml'), + (u'Spettacoli', u'http://www.ilmessaggero.it/rss/spettacoli.xml'), + (u'Edizioni Locali', u'http://www.ilmessaggero.it/rss/edlocali.xml'), (u'Roma', u'http://www.ilmessaggero.it/rss/roma.xml'), - (u'Cultura e Tendenze', u'http://www.ilmessaggero.it/rss/roma_culturaspet.xml'), + (u'Benessere', u'http://www.ilmessaggero.it/rss/benessere.xml'), (u'Sport', u'http://www.ilmessaggero.it/rss/sport.xml'), - (u'Calcio', u'http://www.ilmessaggero.it/rss/sport_calcio.xml'), - (u'Motori', u'http://www.ilmessaggero.it/rss/sport_motori.xml') + (u'Moda', u'http://www.ilmessaggero.it/rss/moda.xml') ] + diff --git a/recipes/independent.recipe b/recipes/independent.recipe index 5e746145ee..43f0f9acde 100644 --- a/recipes/independent.recipe +++ b/recipes/independent.recipe @@ -47,9 +47,10 @@ class TheIndependentNew(BasicNewsRecipe): dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}), dict(name='img',attrs={'alt' : ['view gallery']}), dict(attrs={'style' : re.compile('.*')}), + dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}), ] - keep_only_tags =[dict(attrs={'id':'main'})] + keep_only_tags =[dict(attrs={'id':['main','top']})] recursions = 0 # fixes non compliant html nesting and 'marks' article graphics links @@ -69,7 +70,7 @@ class TheIndependentNew(BasicNewsRecipe): } extra_css = """ - h1{font-family: Georgia,serif } + h1{font-family: Georgia,serif ; font-size: x-large; } body{font-family: Verdana,Arial,Helvetica,sans-serif} img{margin-bottom: 0.4em; display:block} .starRating img {float: left} @@ -77,16 +78,21 @@ class TheIndependentNew(BasicNewsRecipe): .image {clear:left; font-size: x-small; color:#888888;} .articleByTimeLocation {font-size: x-small; color:#888888; margin-bottom:0.2em ; margin-top:0.2em ; display:block} - .subtitle {clear:left} + .subtitle {clear:left ;} .column-1 h1 { color: #191919} .column-1 h2 { color: #333333} .column-1 h3 { color: #444444} - .column-1 p { color: #777777} - .column-1 p,a,h1,h2,h3 { margin: 0; } - .column-1 div{color:#888888; margin: 0;} + .subtitle { color: #777777; font-size: medium;} + .column-1 a,h1,h2,h3 { margin: 0; } + .column-1 div{margin: 0;} .articleContent {display: block; clear:left;} + .articleContent {color: #000000; font-size: medium;} + 
.ivDrip-section {color: #000000; font-size: medium;} + .datetime {color: #888888} + .title {font-weight:bold;} .storyTop{} .pictureContainer img { max-width: 400px; max-height: 400px;} + .image img { max-width: 400px; max-height: 400px;} """ oldest_article = 1 @@ -325,6 +331,20 @@ class TheIndependentNew(BasicNewsRecipe): item.contents[0] = '' def postprocess_html(self,soup, first_fetch): + + #mark subtitle parent as non-compliant nesting causes + # p's to be 'popped out' of the h3 tag they are nested in. + subtitle = soup.find('h3', attrs={'class' : 'subtitle'}) + subtitle_div = None + if subtitle: + subtitle_div = subtitle.parent + if subtitle_div: + clazz = '' + if 'class' in subtitle_div: + clazz = subtitle_div['class'] + ' ' + clazz = clazz + 'subtitle' + subtitle_div['class'] = clazz + #find broken images and remove captions items_to_extract = [] for item in soup.findAll('div', attrs={'class' : 'image'}): @@ -501,6 +521,9 @@ class TheIndependentNew(BasicNewsRecipe): ), (u'Opinion', u'http://www.independent.co.uk/opinion/?service=rss'), + (u'Voices', + u'http://www.independent.co.uk/voices/?service=rss' + ), (u'Environment', u'http://www.independent.co.uk/environment/?service=rss'), (u'Sport - Athletics', diff --git a/recipes/informacje_usa.recipe b/recipes/informacje_usa.recipe index ac31134103..692dcdc07e 100644 --- a/recipes/informacje_usa.recipe +++ b/recipes/informacje_usa.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re class Informacje_USA(BasicNewsRecipe): title = u'Informacje USA' oldest_article = 7 @@ -8,11 +7,10 @@ class Informacje_USA(BasicNewsRecipe): description = u'portal wiadomości amerykańskich' category = 'news' language = 'pl' - masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' - cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg' + cover_url='http://www.informacjeusa.com/wp-content/uploads/2013/01/V3BANNER420-90new.jpg' no_stylesheets = True - preprocess_regexps = [(re.compile(ur'

Zobacz:.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

''' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://jbpress.ismedia.jp/articles/print/5549') response = br.response() diff --git a/recipes/johm.recipe b/recipes/johm.recipe index 0f5625b806..5c3346c71f 100644 --- a/recipes/johm.recipe +++ b/recipes/johm.recipe @@ -17,7 +17,7 @@ class JournalofHospitalMedicine(BasicNewsRecipe): # TO LOGIN def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www3.interscience.wiley.com/cgi-bin/home') br.select_form(nr=0) br['j_username'] = self.username diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe new file mode 100644 index 0000000000..75f88b0f3d --- /dev/null +++ b/recipes/kdefamily_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class KDEFamilyPl(BasicNewsRecipe): + title = u'KDEFamily.pl' + __author__ = 'fenuks' + description = u'KDE w Polsce' + category = 'open source, KDE' + language = 'pl' + cover_url = 'http://www.mykde.home.pl/kdefamily/wp-content/uploads/2012/07/logotype-e1341585198616.jpg' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = True + feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] diff --git a/recipes/kidney.recipe b/recipes/kidney.recipe index 19fd244675..ac4cc5d7f4 100644 --- a/recipes/kidney.recipe +++ b/recipes/kidney.recipe @@ -31,7 +31,7 @@ class JASN(BasicNewsRecipe): #TO LOGIN def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) self.kidney_toc_soup = BeautifulSoup(br.open(self.INDEX).read()) toc = self.kidney_toc_soup.find(id='tocTable') t = toc.find(text=lambda x: x and '[Full Text]' in x) diff --git a/recipes/klip_me.recipe b/recipes/klip_me.recipe index 71918dc78b..cef6b1b467 100644 --- a/recipes/klip_me.recipe +++ b/recipes/klip_me.recipe @@ -29,7 +29,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None: br.open(self.LOGIN) br.select_form(nr=0) diff --git a/recipes/kommersant.recipe b/recipes/kommersant.recipe index 09fb8f8ad8..390ae7d1bd 100644 --- a/recipes/kommersant.recipe +++ b/recipes/kommersant.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2010-2011, Darko Miletic ' +__copyright__ = '2010-2013, Darko Miletic ' ''' www.kommersant.ru ''' @@ -29,17 +29,20 @@ class Kommersant_ru(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language } keep_only_tags = [dict(attrs={'class':['document','document_vvodka','document_text','document_authors vblock']})] remove_tags = [dict(name=['iframe','object','link','img','base','meta'])] - feeds = [(u'Articles', u'http://feeds.kommersant.ru/RSS_Export/RU/daily.xml')] - + feeds = [(u'Articles', u'http://dynamic.feedsportal.com/pf/438800/http://feeds.kommersant.ru/RSS_Export/RU/daily.xml')] + + def get_article_url(self, article): + return article.get('guid', None) + def print_version(self, url): return url.replace('/doc-rss/','/Doc/') + '/Print' \ No newline at end of file diff --git a/recipes/korben.recipe b/recipes/korben.recipe index fb8134b5cb..620838613f 100644 --- a/recipes/korben.recipe +++ b/recipes/korben.recipe @@ 
-11,7 +11,7 @@ class BasicUserRecipe1318619728(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://korben.info/wp-content/themes/korben-steaw/hab/logo.png' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/kosmonauta_pl.recipe b/recipes/kosmonauta_pl.recipe index ddfa26df36..d1caa85950 100644 --- a/recipes/kosmonauta_pl.recipe +++ b/recipes/kosmonauta_pl.recipe @@ -9,6 +9,21 @@ class Kosmonauta(BasicNewsRecipe): language = 'pl' cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' no_stylesheets = True + INDEX = 'http://www.kosmonauta.net' oldest_article = 7 + no_stylesheets = True max_articles_per_feed = 100 - feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')] + keep_only_tags = [dict(name='div', attrs={'class':'item-page'})] + remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})] + remove_tags_after = dict(name='div', attrs={'class':'cedtag'}) + feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')] + + def preprocess_html(self, soup): + for a in soup.findAll(name='a'): + if a.has_key('href'): + href = a['href'] + if not href.startswith('http'): + a['href'] = self.INDEX + href + print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href'] + return soup + \ No newline at end of file diff --git a/recipes/kp.recipe b/recipes/kp.recipe index 396ec41422..f52fcef60b 100644 --- a/recipes/kp.recipe +++ b/recipes/kp.recipe @@ -1,4 +1,3 @@ -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -32,12 +31,12 @@ class KrytykaPolitycznaRecipe(BasicNewsRecipe): extra_css = ''' body {font-family: verdana, arial, helvetica, geneva, sans-serif ;} td.contentheading{font-size: large; font-weight: bold;} - ''' + ''' feeds = [ ('Wszystkie', 'http://www.krytykapolityczna.pl/rss.xml') ] - + def print_version(self, url): soup = self.index_to_soup(url) print_ico = soup.find(attrs = {'class' : 'print-page'}) diff --git a/recipes/ksiazka_pl.recipe b/recipes/ksiazka_pl.recipe index 7f9999f782..f91cb4f4f7 100644 --- a/recipes/ksiazka_pl.recipe +++ b/recipes/ksiazka_pl.recipe @@ -1,15 +1,16 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Ksiazka_net_pl(BasicNewsRecipe): - title = u'ksiazka.net.pl' + title = u'książka.net.pl' __author__ = 'fenuks' - description = u'Ksiazka.net.pl - book vortal' + description = u'Portal Księgarski - tematyczny serwis o książkach. Wydarzenia z rynku księgarsko-wydawniczego, nowości, zapowiedzi, bestsellery, setki recenzji. Niezbędne informacje dla każdego miłośnika książek, księgarza, bibliotekarza i wydawcy.' cover_url = 'http://www.ksiazka.net.pl/fileadmin/templates/ksiazka.net.pl/images/1PortalKsiegarski-logo.jpg' category = 'books' language = 'pl' oldest_article = 8 max_articles_per_feed = 100 no_stylesheets= True + remove_empty_feeds = True #extra_css = 'img {float: right;}' preprocess_regexps = [(re.compile(ur'Podoba mi się, kupuję:'), lambda match: '
')] remove_tags_before= dict(name='div', attrs={'class':'m-body'}) diff --git a/recipes/kurier_galicyjski.recipe b/recipes/kurier_galicyjski.recipe new file mode 100644 index 0000000000..01a36bf724 --- /dev/null +++ b/recipes/kurier_galicyjski.recipe @@ -0,0 +1,56 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup as bs +class KurierGalicyjski(BasicNewsRecipe): + title = u'Kurier Galicyjski' + __author__ = 'fenuks' + #description = u'' + category = 'news' + language = 'pl' + cover_url = 'http://www.duszki.pl/Kurier_galicyjski_bis2_small.gif' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds = True + no_stylesheets = True + keep_only_tags = [dict(attrs={'class':'item-page'})] + remove_tags = [dict(attrs={'class':'pagenav'}), dict(attrs={'style':'border-top-width: thin; border-top-style: dashed; border-top-color: #CCC; border-bottom-width: thin; border-bottom-style: dashed; border-bottom-color: #CCC; padding-top:5px; padding-bottom:5px; text-align:right; margin-top:10px; height:20px;'})] + feeds = [(u'Wydarzenia', u'http://kuriergalicyjski.com/index.php/wydarzenia?format=feed&type=atom'), (u'Publicystyka', u'http://kuriergalicyjski.com/index.php/niezwykle-historie?format=feed&type=atom'), (u'Reporta\u017ce', u'http://kuriergalicyjski.com/index.php/report?format=feed&type=atom'), (u'Rozmowy Kuriera', u'http://kuriergalicyjski.com/index.php/kuriera?format=feed&type=atom'), (u'Przegl\u0105d prasy', u'http://kuriergalicyjski.com/index.php/2012-01-05-14-08-55?format=feed&type=atom'), (u'Kultura', u'http://kuriergalicyjski.com/index.php/2011-12-02-14-26-39?format=feed&type=atom'), (u'Zabytki', u'http://kuriergalicyjski.com/index.php/2011-12-02-14-27-32?format=feed&type=atom'), (u'Polska-Ukraina', u'http://kuriergalicyjski.com/index.php/pol-ua?format=feed&type=atom'), (u'Polacy i Ukrai\u0144cy', u'http://kuriergalicyjski.com/index.php/polacy-i-ukr?format=feed&type=atom'), (u'Niezwyk\u0142e historie', u'http://kuriergalicyjski.com/index.php/niezwykle-historie?format=feed&type=atom'), (u'Polemiki', u'http://kuriergalicyjski.com/index.php/polemiki?format=feed&type=atom')] + + def append_page(self, soup, appendtag): + pager = soup.find(id='article-index') + if pager: + pager = pager.findAll('a')[1:] + if pager: + for a in pager: + nexturl = 'http://www.kuriergalicyjski.com' + a['href'] + soup2 = self.index_to_soup(nexturl) + pagetext = soup2.find(attrs={'class':'item-page'}) + if pagetext.h2: + pagetext.h2.extract() + r = pagetext.find(attrs={'class':'article-info'}) + if r: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + for r in appendtag.findAll(id='article-index'): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagenavcounter'}): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagination'}): + r.extract() + for r in appendtag.findAll(attrs={'class':'pagenav'}): + r.extract() + for r in appendtag.findAll(attrs={'style':'border-top-width: thin; border-top-style: dashed; border-top-color: #CCC; border-bottom-width: thin; border-bottom-style: dashed; border-bottom-color: #CCC; padding-top:5px; padding-bottom:5px; text-align:right; margin-top:10px; height:20px;'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + for r in soup.findAll(style=True): + del r['style'] + for img in soup.findAll(attrs={'class':'easy_img_caption smartresize'}): + img.insert(len(img.contents)-1, bs('
<br />')) + img.insert(len(img.contents), bs('<br />
')) + for a in soup.findAll('a', href=True): + if a['href'].startswith('/'): + a['href'] = 'http://kuriergalicyjski.com' + a['href'] + return soup diff --git a/recipes/la_nacion_cr.recipe b/recipes/la_nacion_cr.recipe new file mode 100644 index 0000000000..ae320064d6 --- /dev/null +++ b/recipes/la_nacion_cr.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class crnews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'La Nacion' + publisher = 'GRUPO NACION GN, S. A.' + description = 'Diario de circulacion nacional de Costa Rica. Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre por Kovid Goyal' + category = 'Spanish, Entertainment' + masthead_url = 'http://www.nacion.com/App_Themes/nacioncom/Images/logo_nacioncom.png' + + oldest_article = 7 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Portada', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=portada'), (u'Ultima Hora', u'http://www.nacion.com/Generales/RSS/UltimaHoraRss.aspx'), (u'Nacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=elpais'), (u'Entretenimiento', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=entretenimiento'), (u'Sucesos', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=sucesos'), (u'Deportes', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=deportes'), (u'Internacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=mundo'), (u'Economia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=economia'), (u'Aldea Global', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=aldeaglobal'), (u'Tecnologia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=tecnologia'), (u'Opinion', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=opinion')] + + def get_cover_url(self): + index = 'http://kiosko.net/cr/np/cr_nacion.html' + soup = self.index_to_soup(index) + for image in soup.findAll('img',src=True): + if image['src'].endswith('cr_nacion.750.jpg'): + return image['src'] + return None + + def get_article_url(self, article): + url = article.get('guid', None) + return url + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' diff --git a/recipes/la_stampa.recipe b/recipes/la_stampa.recipe index b9d8a469aa..06a7debe9d 100644 --- a/recipes/la_stampa.recipe +++ b/recipes/la_stampa.recipe @@ -2,7 +2,7 @@ __license__ = 'GPL v3' __author__ = 'Gabriele Marini, based on Darko Miletic' __copyright__ = '2009, Darko Miletic ' -__description__ = 'La Stampa 05/05/2010' +__description__ = 'La Stampa 28/12/2012' ''' http://www.lastampa.it/ @@ -14,10 +14,11 @@ class LaStampa(BasicNewsRecipe): title = u'La Stampa' language = 'it' __author__ = 'Gabriele Marini' - oldest_article = 15 + #oldest_article = 15 + oldest_articlce = 7 #for daily schedule max_articles_per_feed = 50 recursion = 100 - cover_url = 'http://www.lastampa.it/edicola/PDF/1.pdf' + cover_url = 'http://www1.lastampa.it/edicola/PDF/1.pdf' use_embedded_content = False remove_javascript = True no_stylesheets = True @@ -33,35 +34,41 @@ class LaStampa(BasicNewsRecipe): if link: return link[0]['href'] - keep_only_tags = 
[dict(attrs={'class':['boxocchiello2','titoloRub','titologir','catenaccio','sezione','articologirata']}), + keep_only_tags = [dict(attrs={'class':['boxocchiello2','titoloRub','titologir','autore-girata','luogo-girata','catenaccio','sezione','articologirata','bodytext','news-single-img','ls-articoloCorpo','ls-blog-list-1col']}), dict(name='div', attrs={'id':'corpoarticolo'}) ] - remove_tags = [dict(name='div', attrs={'id':'menutop'}), - dict(name='div', attrs={'id':'fwnetblocco'}), - dict(name='table', attrs={'id':'strumenti'}), - dict(name='table', attrs={'id':'imgesterna'}), - dict(name='a', attrs={'class':'linkblu'}), - dict(name='a', attrs={'class':'link'}), + + remove_tags = [dict(name='div', attrs={'id':['menutop','fwnetblocco']}), + dict(attrs={'class':['ls-toolbarCommenti','ls-boxCommentsBlog']}), + dict(name='table', attrs={'id':['strumenti','imgesterna']}), + dict(name='a', attrs={'class':['linkblu','link']}), dict(name='span', attrs={'class':['boxocchiello','boxocchiello2','sezione']}) ] - - feeds = [ - (u'Home', u'http://www.lastampa.it/redazione/rss_home.xml'), - (u'Editoriali', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=25'), - (u'Politica', u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'), - (u'ArciItaliana', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=14'), - (u'Cronache', u'http://www.lastampa.it/redazione/cmssezioni/cronache/rss_cronache.xml'), - (u'Esteri', u'http://www.lastampa.it/redazione/cmssezioni/esteri/rss_esteri.xml'), - (u'Danni Collaterali', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=90'), - (u'Economia', u'http://www.lastampa.it/redazione/cmssezioni/economia/rss_economia.xml'), - (u'Tecnologia ', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=30'), - (u'Spettacoli', u'http://www.lastampa.it/redazione/cmssezioni/spettacoli/rss_spettacoli.xml'), - (u'Sport', u'http://www.lastampa.it/sport/rss_home.xml'), - (u'Torino', u'http://rss.feedsportal.com/c/32418/f/466938/index.rss'), - (u'Motori', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=57'), - (u'Scienza', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=38'), - (u'Fotografia', u'http://rss.feedsportal.com/c/32418/f/478449/index.rss'), - (u'Scuola', u'http://www.lastampa.it/cmstp/rubriche/oggetti/rss.asp?ID_blog=60'), - (u'Tempo Libero', u'http://www.lastampa.it/tempolibero/rss_home.xml') + feeds = [(u'BuonGiorno',u'http://www.lastampa.it/cultura/opinioni/buongiorno/rss.xml'), + (u'Jena', u'http://www.lastampa.it/cultura/opinioni/jena/rss.xml'), + (u'Editoriali', u'http://www.lastampa.it/cultura/opinioni/editoriali'), + (u'Finestra sull America', u'http://lastampa.feedsportal.com/c/32418/f/625713/index.rss'), + (u'HomePage', u'http://www.lastampa.it/rss.xml'), + (u'Politica Italia', u'http://www.lastampa.it/italia/politica/rss.xml'), + (u'ArciItaliana', u'http://www.lastampa.it/rss/blog/arcitaliana'), + (u'Cronache', u'http://www.lastampa.it/italia/cronache/rss.xml'), + (u'Esteri', u'http://www.lastampa.it/esteri/rss.xml'), + (u'Danni Collaterali', u'http://www.lastampa.it/rss/blog/danni-collaterali'), + (u'Economia', u'http://www.lastampa.it/economia/rss.xml'), + (u'Tecnologia ', u'http://www.lastampa.it/tecnologia/rss.xml'), + (u'Spettacoli', u'http://www.lastampa.it/spettacoli/rss.xml'), + (u'Sport', u'http://www.lastampa.it/sport/rss.xml'), + (u'Torino', u'http://www.lastampa.it/cronaca/rss.xml'), + (u'Motori', u'http://www.lastampa.it/motori/rss.xml'), + (u'Scienza', 
u'http://www.lastampa.it/scienza/rss.xml'), + (u'Cultura', u'http://www.lastampa.it/cultura/rss.xml'), + (u'Scuola', u'http://www.lastampa.it/cultura/scuola/rss.xml'), + (u'Benessere', u'http://www.lastampa.it/scienza/benessere/rss.xml'), + (u'Cucina', u'http://www.lastampa.it/societa/cucina/rss.xml'), + (u'Casa', u'http://www.lastampa.it/societa/casa/rss.xml'), + (u'Moda',u'http://www.lastampa.it/societa/moda/rss.xml'), + (u'Giochi',u'http://www.lastampa.it/tecnologia/giochi/rss.xml'), + (u'Viaggi',u'http://www.lastampa.it/societa/viaggi/rss.xml'), + (u'Ambiente', u'http://www.lastampa.it/scienza/ambiente/rss.xml') ] diff --git a/recipes/la_voce.recipe b/recipes/la_voce.recipe index 140adbb84c..18db9346a8 100644 --- a/recipes/la_voce.recipe +++ b/recipes/la_voce.recipe @@ -7,9 +7,9 @@ class AdvancedUserRecipe1324114228(BasicNewsRecipe): max_articles_per_feed = 100 auto_cleanup = True masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif' - feeds = [(u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1')] + feeds = [(u'La Voce', u'http://www.lavoce.info/feed/')] __author__ = 'faber1971' - description = 'Italian website on Economy - v1.01 (17, December 2011)' + description = 'Italian website on Economy - v1.02 (27, December 2012)' language = 'it' diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe index 8693676da9..318df7e362 100644 --- a/recipes/le_monde.recipe +++ b/recipes/le_monde.recipe @@ -22,13 +22,15 @@ class LeMonde(BasicNewsRecipe): #publication_type = 'newsportal' extra_css = ''' h1{font-size:130%;} + h2{font-size:100%;} + blockquote.aside {background-color: #DDD; padding: 0.5em;} .ariane{font-size:xx-small;} .source{font-size:xx-small;} - #.href{font-size:xx-small;} - #.figcaption style{color:#666666; font-size:x-small;} - #.main-article-info{font-family:Arial,Helvetica,sans-serif;} - #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} + /*.href{font-size:xx-small;}*/ + /*.figcaption style{color:#666666; font-size:x-small;}*/ + /*.main-article-info{font-family:Arial,Helvetica,sans-serif;}*/ + /*full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/ + /*match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/ ''' #preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] conversion_options = { @@ -44,6 +46,9 @@ class LeMonde(BasicNewsRecipe): filterDuplicates = True def preprocess_html(self, soup): + for aside in soup.findAll('aside'): + aside.name='blockquote' + aside['class'] = "aside" for alink in soup.findAll('a'): if alink.string is not None: tstr = alink.string @@ -107,7 +112,9 @@ class LeMonde(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['bloc_base meme_sujet']}), + dict(attrs={'class':['rubriques_liees']}), + dict(attrs={'class':['sociaux']}), + dict(attrs={'class':['bloc_base meme_sujet']}), dict(name='p', attrs={'class':['lire']}) ] diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 6f5c16e3d1..dc9fa9d36f 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -1,132 +1,94 @@ -#!/usr/bin/env python - +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +__author__ = 'Sylvain Durand ' __license__ = 'GPL v3' -__copyright__ = '2012, Rémi Vanicat ' -''' -Lemonde.fr: Version abonnée -''' - -import os, zipfile, re, time 
+import time from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ptempfile import PersistentTemporaryFile +from urllib2 import HTTPError -class LeMondeAbonne(BasicNewsRecipe): +class LeMonde(BasicNewsRecipe): - title = u'Le Monde: Édition abonnés' - __author__ = u'Rémi Vanicat' - description = u'Actualités' - category = u'Actualités, France, Monde' - language = 'fr' - needs_subscription = True + title = u'Le Monde: Édition abonnés' + __author__ = 'Sylvain Durand' + description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.' + language = 'fr' + encoding = 'utf8' - no_stylesheets = True + needs_subscription = True - extra_css = u''' - h1{font-size:130%;} - .ariane{font-size:xx-small;} - .source{font-size:xx-small;} - .href{font-size:xx-small;} - .LM_caption{color:#666666; font-size:x-small;} - .main-article-info{font-family:Arial,Helvetica,sans-serif;} - #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} - ''' + date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' + login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' + masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png' + couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' - zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip' - coverurl_format = '/img/%y%m%d01.jpg' - path_format = "%y%m%d" - login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' + extra_css = ''' + img{max-width:100%} + h1{font-size:1.2em !important; line-height:1.2em !important; } + h2{font-size:1em !important; line-height:1em !important; } + h3{font-size:1em !important; text-transform:uppercase !important; color:#666;} + #photo{text-align:center !important; margin:10px 0 -8px;} + #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} ''' - keep_only_tags = [ dict(name="div", attrs={ 'class': 'po-prti' }), dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ] + keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])] - article_id_pattern = re.compile("[0-9]+\\.html") - article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/' + def __init__(self, options, log, progress_reporter): + BasicNewsRecipe.__init__(self, options, log, progress_reporter) + br = BasicNewsRecipe.get_browser(self) + second = time.time() + 24*60*60 + for i in range(7): + self.date = time.gmtime(second) + try: + br.open(time.strftime(self.date_url,self.date)) + break + except HTTPError: + second -= 24*60*60 + self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ') def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open(self.login_url) - br.select_form(nr=0) - br['login'] = self.username - br['password'] = self.password - br.submit() + br = 
BasicNewsRecipe.get_browser(self) + br.open(self.login_url) + br.select_form(nr=0) + br['login'] = self.username + br['password'] = self.password + br.submit() return br - decalage = 24 * 60 * 60 # today Monde has tomorow date - def get_cover_url(self): - url = time.strftime(self.coverurl_format, self.ltime) - return self.articles_path + url + url = time.strftime(self.couverture_url,self.date) + return url def parse_index(self): - browser = self.get_browser() - - second = time.time() - second += self.decalage - ltime = self.ltime = time.gmtime(second) - url = time.strftime(self.zipurl_format, ltime) - - self.timefmt=strftime(" %A %d %B %Y", ltime) - - response = browser.open(url) - - tmp = PersistentTemporaryFile(suffix='.zip') - self.report_progress(0.1,_('downloading zip file')) - tmp.write(response.read()) - tmp.close() - - zfile = zipfile.ZipFile(tmp.name, 'r') - self.report_progress(0.1,_('extracting zip file')) - - zfile.extractall(self.output_dir) - zfile.close() - - path = os.path.join(self.output_dir, time.strftime(self.path_format, ltime), "data") - - self.articles_path = path - - files = os.listdir(path) - - nb_index_files = len([ name for name in files if re.match("frame_gauche_[0-9]+.html", name) ]) - - flux = [] - - article_url = time.strftime(self.article_url_format, ltime) - - for i in range(nb_index_files): - filename = os.path.join(path, "selection_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup=BeautifulSoup(tmp) - title=soup.find('span').contents[0] - tmp.close() - - filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1)) - tmp = open(filename,'r') - soup = BeautifulSoup(tmp) + url = time.strftime(self.journal_url,self.date) + soup = self.index_to_soup(url).sommaire + sections = [] + for sec in soup.findAll("section"): articles = [] - for link in soup.findAll("a"): - article_file = link['href'] - article_id=self.article_id_pattern.search(article_file).group() - article = { - 'title': link.contents[0], - 'url': article_url + article_id, - 'descripion': '', - 'content': '' - } - articles.append(article) - tmp.close() + if sec['cahier'] != "Le Monde": + for col in sec.findAll("fnts"): + col.extract() + if sec['cahier']=="Le Monde Magazine": + continue + for art in sec.findAll("art"): + if art.txt.string and art.ttr.string: + if art.find(['url']): + art.insert(6,'

') + if art.find(['lgd']) and art.find(['lgd']).string: + art.insert(7,'
'+art.find(['lgd']).string+'
') + article = ""+unicode(art)+"" + article = article.replace('','').replace(' oC ','°C ') + article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>') + f = PersistentTemporaryFile() + f.write(article) + articles.append({'title':art.ttr.string,'url':"file:///"+f.name}) + sections.append((sec['nom'], articles)) + return sections - flux.append((title, articles)) - - return flux - - - -# Local Variables: -# mode: python -# End: + def preprocess_html(self, soup): + for lgd in soup.findAll(id="lgd"): + lgd.contents[-1].extract() + return soup diff --git a/recipes/ledevoir.recipe b/recipes/ledevoir.recipe index 0811289827..f79c010804 100644 --- a/recipes/ledevoir.recipe +++ b/recipes/ledevoir.recipe @@ -32,26 +32,28 @@ class ledevoir(BasicNewsRecipe): recursion = 10 needs_subscription = 'optional' - filterDuplicates = False url_list = [] remove_javascript = True no_stylesheets = True + auto_cleanup = True preprocess_regexps = [(re.compile(r'(title|alt)=".*?>.*?"', re.DOTALL), lambda m: '')] - keep_only_tags = [ - dict(name='div', attrs={'id':'article'}), - dict(name='div', attrs={'id':'colonne_principale'}) - ] + #keep_only_tags = [ + #dict(name='div', attrs={'id':'article_detail'}), + #dict(name='div', attrs={'id':'colonne_principale'}) + #] - remove_tags = [ - dict(name='div', attrs={'id':'dialog'}), - dict(name='div', attrs={'class':['interesse_actions','reactions']}), - dict(name='ul', attrs={'class':'mots_cles'}), - dict(name='a', attrs={'class':'haut'}), - dict(name='h5', attrs={'class':'interesse_actions'}) - ] + #remove_tags = [ + #dict(name='div', attrs={'id':'dialog'}), + #dict(name='div', attrs={'class':['interesse_actions','reactions','taille_du_texte right clearfix','partage_sociaux clearfix']}), + #dict(name='aside', attrs={'class':['article_actions clearfix','reactions','partage_sociaux_wrapper']}), + #dict(name='ul', attrs={'class':'mots_cles'}), + #dict(name='ul', attrs={'id':'commentaires'}), + #dict(name='a', attrs={'class':'haut'}), + #dict(name='h5', attrs={'class':'interesse_actions'}) + #] feeds = [ (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), @@ -86,7 +88,7 @@ class ledevoir(BasicNewsRecipe): .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;} ''' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.ledevoir.com') br.select_form(nr=0) @@ -95,10 +97,4 @@ class ledevoir(BasicNewsRecipe): br.submit() return br - def print_version(self, url): - if self.filterDuplicates: - if url in self.url_list: - return - self.url_list.append(url) - return url diff --git a/recipes/leggo_it.recipe b/recipes/leggo_it.recipe index 13b2ca9018..32d9f0f6d7 100644 --- a/recipes/leggo_it.recipe +++ b/recipes/leggo_it.recipe @@ -53,12 +53,12 @@ class LeggoIT(BasicNewsRecipe): month = "%.2d" % st.tm_mon day = "%.2d" % st.tm_mday cover='http://www.leggo.it/'+ year + month + day + '/jpeg/LEGGO_ROMA_1.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: cover='http://www.leggo.it/'+ year + month + day + '/jpeg/LEGGO_ROMA_3.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/lemonde_dip.recipe b/recipes/lemonde_dip.recipe index 8e61e24cdc..ab2e123e58 100644 --- a/recipes/lemonde_dip.recipe +++ b/recipes/lemonde_dip.recipe @@ -42,7 +42,7 @@ class LeMondeDiplomatiqueEn(BasicNewsRecipe): } 
def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.LOGIN) if self.username is not None and self.password is not None: data = urllib.urlencode({ 'login':self.username diff --git a/recipes/lepoint.recipe b/recipes/lepoint.recipe index 9a4aab01da..78fbe951cd 100644 --- a/recipes/lepoint.recipe +++ b/recipes/lepoint.recipe @@ -66,7 +66,7 @@ class lepoint(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://www.lepoint.fr/images/commun/logo.png' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/lexpress.recipe b/recipes/lexpress.recipe index 3de6226f1b..fef8dc9f96 100644 --- a/recipes/lexpress.recipe +++ b/recipes/lexpress.recipe @@ -64,7 +64,7 @@ class lepoint(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://static.lexpress.fr/imgstat/logo_lexpress.gif' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/libartes.recipe b/recipes/libartes.recipe new file mode 100644 index 0000000000..6ddae87119 --- /dev/null +++ b/recipes/libartes.recipe @@ -0,0 +1,69 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +libartes.com +''' + +import re +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +class Libartes(BasicNewsRecipe): + title = 'Libartes' + __author__ = 'Darko Miletic' + description = 'Elektronski časopis Libartes delo je kulturnih entuzijasta, umetnika i teoretičara umetnosti i književnosti. Časopis Libartes izlazi tromesečno i bavi se različitim granama umetnosti - književnošću, muzikom, filmom, likovnim umetnostima, dizajnom i arhitekturom.' + publisher = 'Libartes' + category = 'literatura, knjizevnost, film, dizajn, arhitektura, muzika' + no_stylesheets = True + INDEX = 'http://libartes.com/' + use_embedded_content = False + encoding = 'utf-8' + language = 'sr' + publication_type = 'magazine' + masthead_url = 'http://libartes.com/index_files/logo.gif' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} + body{font-family: "Times New Roman",Times,serif1, serif} + img{display:block} + .naslov{font-size: xx-large; font-weight: bold} + .nag{font-size: large; font-weight: bold} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + remove_tags_before = dict(attrs={'id':'nav'}) + remove_tags_after = dict(attrs={'id':'fb' }) + keep_only_tags = [dict(name='div', attrs={'id':'center_content'})] + remove_tags = [ + dict(name=['object','link','iframe','embed','meta']) + ,dict(attrs={'id':'nav'}) + ] + + def parse_index(self): + articles = [] + soup = self.index_to_soup(self.INDEX) + for item in soup.findAll(name='a', attrs={'class':'belad'}, href=True): + feed_link = item + if feed_link['href'].startswith(self.INDEX): + url = feed_link['href'] + else: + url = self.INDEX + feed_link['href'] + + title = self.tag_to_string(feed_link) + date = strftime(self.timefmt) + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':'' + }) + return [('Casopis Libartes', articles)] + diff --git a/recipes/liberation.recipe b/recipes/liberation.recipe index 7183e26909..741e2e87d2 100644 --- 
a/recipes/liberation.recipe +++ b/recipes/liberation.recipe @@ -71,7 +71,7 @@ class Liberation(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://s0.libe.com/libe/img/common/logo-liberation-150.png' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/liberation_sub.recipe b/recipes/liberation_sub.recipe index 3ea933f364..60450341e4 100644 --- a/recipes/liberation_sub.recipe +++ b/recipes/liberation_sub.recipe @@ -61,7 +61,7 @@ class Liberation(BasicNewsRecipe): index = 'http://www.liberation.fr/abonnes/' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.liberation.fr/jogger/login/') br.select_form(nr=0) diff --git a/recipes/libero.recipe b/recipes/libero.recipe index f2208d01a3..01c2da36c7 100644 --- a/recipes/libero.recipe +++ b/recipes/libero.recipe @@ -14,7 +14,8 @@ class LiberoNews(BasicNewsRecipe): __author__ = 'Marini Gabriele' description = 'Italian daily newspaper' - cover_url = 'http://www.libero-news.it/images/logo.png' + #cover_url = 'http://www.liberoquotidiano.it/images/Libero%20Quotidiano.jpg' + cover_url = 'http://www.edicola.liberoquotidiano.it/vnlibero/fpcut.jsp?testata=milano' title = u'Libero ' publisher = 'EDITORIALE LIBERO s.r.l 2006' category = 'News, politics, culture, economy, general interest' @@ -32,10 +33,11 @@ class LiberoNews(BasicNewsRecipe): remove_javascript = True keep_only_tags = [ - dict(name='div', attrs={'class':'Articolo'}) + dict(name='div', attrs={'class':'Articolo'}), + dict(name='article') ] remove_tags = [ - dict(name='div', attrs={'class':['CommentaFoto','Priva2']}), + dict(name='div', attrs={'class':['CommentaFoto','Priva2','login_commenti','box_16']}), dict(name='div', attrs={'id':['commentigenerale']}) ] feeds = [ diff --git a/recipes/libertad_digital.recipe b/recipes/libertad_digital.recipe new file mode 100644 index 0000000000..1a35e6995a --- /dev/null +++ b/recipes/libertad_digital.recipe @@ -0,0 +1,65 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +www.libertaddigital.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LibertadDigital(BasicNewsRecipe): + title = 'Libertad Digital' + __author__ = 'Darko Miletic' + description = 'En Libertad Digital encontraras noticias y opinion sobre: España, el Mundo, Internet, sociedad, economia y deportes' + publisher = 'Libertad Digital S.A.' 
+ category = 'noticias, ultima hora, españa, internet, mundo, economia, sociedad, Libertad Digital' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'es' + remove_empty_feeds = True + publication_type = 'website' + masthead_url = 'http://s.libertaddigital.com/images/logo.gif' + extra_css = """ + body{font-family: Verdana,sans-serif } + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + remove_tags = [ + dict(name=['meta','link','iframe','embed','object']) + ,dict(name='p', attrs={'class':'copyright'}) + ] + remove_attributes=['lang'] + + + feeds = [ + (u'Portada' , u'http://feeds2.feedburner.com/libertaddigital/deportes' ) + ,(u'Opinion' , u'http://feeds2.feedburner.com/libertaddigital/opinion' ) + ,(u'España' , u'http://feeds2.feedburner.com/libertaddigital/nacional' ) + ,(u'Internacional', u'http://feeds2.feedburner.com/libertaddigital/internacional') + ,(u'Libre Mercado', u'http://feeds2.feedburner.com/libertaddigital/economia' ) + ,(u'Chic' , u'http://feeds2.feedburner.com/libertaddigital/el-candelabro') + ,(u'Internet' , u'http://feeds2.feedburner.com/libertaddigital/internet' ) + ,(u'Deportes' , u'http://feeds2.feedburner.com/libertaddigital/deportes' ) + ] + + def get_article_url(self, article): + return article.get('guid', None) + + def print_version(self, url): + art, sep, rest = url.rpartition('/') + aart, asep, artid = art.rpartition('-') + return 'http://www.libertaddigital.com/c.php?op=imprimir&id=' + artid + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/linux_journal.recipe b/recipes/linux_journal.recipe old mode 100755 new mode 100644 diff --git a/recipes/lrb_payed.recipe b/recipes/lrb_payed.recipe index 320890110a..ad713e38f1 100644 --- a/recipes/lrb_payed.recipe +++ b/recipes/lrb_payed.recipe @@ -28,7 +28,7 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.LOGIN) br.select_form(nr=1) diff --git a/recipes/lvivs_ks_ghazieta.recipe b/recipes/lvivs_ks_ghazieta.recipe new file mode 100644 index 0000000000..cc4b326d42 --- /dev/null +++ b/recipes/lvivs_ks_ghazieta.recipe @@ -0,0 +1,12 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1356270446(BasicNewsRecipe): + title = u'\u041b\u044c\u0432\u0456\u0432\u0441\u044c\u043a\u0430 \u0433\u0430\u0437\u0435\u0442\u0430' + __author__ = 'rpalyvoda' + oldest_article = 7 + max_articles_per_feed = 100 + language = 'uk' + cover_url = 'http://lvivska.com/sites/all/themes/biblos/images/logo.png' + masthead_url = 'http://lvivska.com/sites/all/themes/biblos/images/logo.png' + auto_cleanup = True + feeds = [(u'\u041d\u043e\u0432\u0438\u043d\u0438', u'http://lvivska.com/rss/news.xml'), (u'\u041f\u043e\u043b\u0456\u0442\u0438\u043a\u0430', u'http://lvivska.com/rss/politic.xml'), (u'\u0415\u043a\u043e\u043d\u043e\u043c\u0456\u043a\u0430', u'http://lvivska.com/rss/economic.xml'), (u'\u041f\u0440\u0430\u0432\u043e', u'http://lvivska.com/rss/law.xml'), (u'\u0421\u0432\u0456\u0442', u'http://lvivska.com/rss/world.xml'), (u'\u0416\u0438\u0442\u0442\u044f', u'http://lvivska.com/rss/life.xml'), (u'\u041a\u0443\u043b\u044c\u0442\u0443\u0440\u0430', 
u'http://lvivska.com/rss/culture.xml'), (u'\u041b\u0430\u0441\u0443\u043d', u'http://lvivska.com/rss/cooking.xml'), (u'\u0421\u0442\u0438\u043b\u044c', u'http://lvivska.com/rss/style.xml'), (u'Galicia Incognita', u'http://lvivska.com/rss/galiciaincognita.xml'), (u'\u0421\u043f\u043e\u0440\u0442', u'http://lvivska.com/rss/sport.xml'), (u'\u0415\u043a\u043e\u043b\u043e\u0433\u0456\u044f', u'http://lvivska.com/rss/ecology.xml'), (u"\u0417\u0434\u043e\u0440\u043e\u0432'\u044f", u'http://lvivska.com/rss/health.xml'), (u'\u0410\u0432\u0442\u043e', u'http://lvivska.com/rss/auto.xml'), (u'\u0411\u043b\u043e\u0433\u0438', u'http://lvivska.com/rss/blog.xml')] diff --git a/recipes/lwn.recipe b/recipes/lwn.recipe index e8f8132686..dcc79adfda 100644 --- a/recipes/lwn.recipe +++ b/recipes/lwn.recipe @@ -23,7 +23,7 @@ class LWN(BasicNewsRecipe): LOGIN = 'https://lwn.net/login' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.LOGIN) br.select_form(name='loginform') diff --git a/recipes/lwn_weekly.recipe b/recipes/lwn_weekly.recipe index 95994aa97e..9400b1bf10 100644 --- a/recipes/lwn_weekly.recipe +++ b/recipes/lwn_weekly.recipe @@ -43,7 +43,7 @@ class WeeklyLWN(BasicNewsRecipe): needs_subscription = 'optional' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://lwn.net/login') br.select_form(name='loginform') diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index f84fb5bc7e..f1e6c87385 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -44,7 +44,7 @@ class Mediapart(BasicNewsRecipe): # -- Handle login def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.mediapart.fr/') br.select_form(nr=0) diff --git a/recipes/medscape.recipe b/recipes/medscape.recipe index ef406c64dc..f05948a6e3 100644 --- a/recipes/medscape.recipe +++ b/recipes/medscape.recipe @@ -38,7 +38,7 @@ class MedScrape(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://profreg.medscape.com/px/getlogin.do') br.select_form(name='LoginForm') diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe index fcceba4ce7..78db75daf8 100644 --- a/recipes/metro_uk.recipe +++ b/recipes/metro_uk.recipe @@ -1,43 +1,77 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre import strftime +import re +import datetime +import time + class AdvancedUserRecipe1306097511(BasicNewsRecipe): title = u'Metro UK' - description = 'Author Dave Asbury : News from The Metro - UK' + description = 'News as provided by The Metro -UK' #timefmt = '' - __author__ = 'Dave Asbury' - #last update 9/9/12 - cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg' - no_stylesheets = True - oldest_article = 1 - max_articles_per_feed = 12 + __author__ = 'fleclerc & Dave Asbury' + #last update 20/1/13 + #cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg' + + cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg' remove_empty_feeds = True remove_javascript = True - #auto_cleanup = True + 
auto_cleanup = True + max_articles_per_feed = 12 + ignore_duplicate_articles = {'title', 'url'} encoding = 'UTF-8' - cover_url ='http://profile.ak.fbcdn.net/hprofile-ak-snc4/157897_117118184990145_840702264_n.jpg' + language = 'en_GB' masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif' - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:900;font-size:1.6em;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:1.2em;} - p{font-family:Arial,Helvetica,sans-serif;font-size:1.0em;} - body{font-family:Helvetica,Arial,sans-serif;font-size:1.0em;} - ''' - keep_only_tags = [ - #dict(name='h1'), - #dict(name='h2'), - #dict(name='div', attrs={'class' : ['row','article','img-cnt figure','clrd']}) - #dict(name='h3'), - #dict(attrs={'class' : 'BText'}), - ] - remove_tags = [ - dict(name='div',attrs={'class' : 'art-fd fd-gr1-b clrd'}), - dict(name='span',attrs={'class' : 'share'}), - dict(name='li'), - dict(attrs={'class' : ['twitter-share-button','header-forms','hdr-lnks','close','art-rgt','fd-gr1-b clrd google-article','news m12 clrd clr-b p5t shareBtm','item-ds csl-3-img news','c-1of3 c-last','c-1of1','pd','item-ds csl-3-img sport']}), - dict(attrs={'id' : ['','sky-left','sky-right','ftr-nav','and-ftr','notificationList','logo','miniLogo','comments-news','metro_extras']}) - ] - remove_tags_before = dict(name='h1') - #remove_tags_after = dict(attrs={'id':['topic-buttons']}) - feeds = [ - (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')] + def parse_index(self): + articles = {} + key = None + ans = [] + feeds = [ ('UK', 'http://metro.co.uk/news/uk/'), + ('World', 'http://metro.co.uk/news/world/'), + ('Weird', 'http://metro.co.uk/news/weird/'), + ('Money', 'http://metro.co.uk/news/money/'), + ('Sport', 'http://metro.co.uk/sport/'), + ('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/') + ] + for key, feed in feeds: + soup = self.index_to_soup(feed) + articles[key] = [] + ans.append(key) + + today = datetime.date.today() + today = time.mktime(today.timetuple())-60*60*24 + + for a in soup.findAll('a'): + for name, value in a.attrs: + if name == "class" and value=="post": + url = a['href'] + title = a['title'] + print title + description = '' + m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url) + skip = 1 + if len(m.groups()) == 3: + g = m.groups() + dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d') + pubdate = time.strftime('%a, %d %b', dt.timetuple()) + + dt = time.mktime(dt.timetuple()) + if dt >= today: + print pubdate + skip = 0 + else: + pubdate = strftime('%a, %d %b') + + summary = a.find(True, attrs={'class':'excerpt'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + + if skip == 0: + articles[key].append( + dict(title=title, url=url, date=pubdate, + description=description, + content='')) + #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2}) + 
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/recipes/michellemalkin.recipe b/recipes/michellemalkin.recipe index e933ed8f1c..0b1f0ebdaa 100644 --- a/recipes/michellemalkin.recipe +++ b/recipes/michellemalkin.recipe @@ -18,6 +18,8 @@ class MichelleMalkin(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + auto_cleanup = True + use_embedded_content = False conversion_options = { @@ -29,16 +31,16 @@ class MichelleMalkin(BasicNewsRecipe): } - keep_only_tags = [ - dict(name='div', attrs={'class':'article'}) - ] + #keep_only_tags = [ + #dict(name='div', attrs={'class':'article'}) + #] - remove_tags = [ - dict(name=['iframe', 'embed', 'object']), - dict(name='div', attrs={'id':['comments', 'commentForm']}), - dict(name='div', attrs={'class':['postCategories', 'comments', 'blogInfo', 'postInfo']}) + #remove_tags = [ + #dict(name=['iframe', 'embed', 'object']), + #dict(name='div', attrs={'id':['comments', 'commentForm']}), + #dict(name='div', attrs={'class':['postCategories', 'comments', 'blogInfo', 'postInfo']}) - ] + #] feeds = [(u'http://feeds.feedburner.com/michellemalkin/posts')] diff --git a/recipes/microwave_and_rf.recipe b/recipes/microwave_and_rf.recipe deleted file mode 100644 index 3cdf6e5acc..0000000000 --- a/recipes/microwave_and_rf.recipe +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python -## -## Title: Microwave and RF -## -## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html - -# Feb 2012: Initial release - -__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' -''' -mwrf.com -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.utils.magick import Image - -class Microwaves_and_RF(BasicNewsRecipe): - - Convert_Grayscale = False # Convert images to gray scale or not - - # Add sections that want to be excluded from the magazine - exclude_sections = [] - - # Add sections that want to be included from the magazine - include_sections = [] - - title = u'Microwaves and RF' - __author__ = u'kiavash' - description = u'Microwaves and RF Montly Magazine' - publisher = 'Penton Media, Inc.' - publication_type = 'magazine' - site = 'http://mwrf.com' - - language = 'en' - asciiize = True - timeout = 120 - simultaneous_downloads = 1 # very peaky site! - - # Main article is inside this tag - keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})] - - no_stylesheets = True - remove_javascript = True - - # Flattens all the tables to make it compatible with Nook - conversion_options = {'linearize_tables' : True} - - remove_tags = [ - dict(name='span', attrs={'class':'body12'}), - ] - - remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', - 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] - - # Specify extra CSS - overrides ALL other CSS (IE. Added last). 
- extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ - .introduction, .first { font-weight: bold; } \ - .cross-head { font-weight: bold; font-size: 125%; } \ - .cap, .caption { display: block; font-size: 80%; font-style: italic; } \ - .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \ - .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \ - .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \ - font-size: 80%; font-style: italic; margin: 1px auto; } \ - .story-date, .published { font-size: 80%; } \ - table { width: 100%; } \ - td img { display: block; margin: 5px auto; } \ - ul { padding-top: 10px; } \ - ol { padding-top: 10px; } \ - li { padding-top: 5px; padding-bottom: 5px; } \ - h1 { font-size: 175%; font-weight: bold; } \ - h2 { font-size: 150%; font-weight: bold; } \ - h3 { font-size: 125%; font-weight: bold; } \ - h4, h5, h6 { font-size: 100%; font-weight: bold; }' - - # Remove the line breaks and float left/right and picture width/height. - preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), - (re.compile(r'', re.IGNORECASE), lambda m: ''), - (re.compile(r'float:.*?'), lambda m: ''), - (re.compile(r'width:.*?px'), lambda m: ''), - (re.compile(r'height:.*?px'), lambda m: '') - ] - - - def print_version(self, url): - url = re.sub(r'.html', '', url) - url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url) - return url - - # Need to change the user agent to avoid potential download errors - def get_browser(self, *args, **kwargs): - from calibre import browser - kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0' - return browser(*args, **kwargs) - - - def parse_index(self): - - # Fetches the main page of Microwaves and RF - soup = self.index_to_soup(self.site) - - # First page has the ad, Let's find the redirect address. 
- url = soup.find('span', attrs={'class':'commonCopy'}).find('a').get('href') - if url.startswith('/'): - url = self.site + url - - soup = self.index_to_soup(url) - - # Searches the site for Issue ID link then returns the href address - # pointing to the latest issue - latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href') - - # Fetches the index page for of the latest issue - soup = self.index_to_soup(latest_issue) - - # Finds the main section of the page containing cover, issue date and - # TOC - ts = soup.find('div', attrs={'id':'columnContainer'}) - - # Finds the issue date - ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize() - self.log('Found Current Issue:', ds) - self.timefmt = ' [%s]'%ds - - # Finds the cover image - cover = ts.find('img', src = lambda x: x and 'Cover' in x) - if cover is not None: - self.cover_url = self.site + cover['src'] - self.log('Found Cover image:', self.cover_url) - - feeds = [] - article_info = [] - - # Finds all the articles (tiles and links) - articles = ts.findAll('a', attrs={'class':'commonArticleTitle'}) - - # Finds all the descriptions - descriptions = ts.findAll('span', attrs={'class':'commonCopy'}) - - # Find all the sections - sections = ts.findAll('span', attrs={'class':'kicker'}) - - title_number = 0 - - # Goes thru all the articles one by one and sort them out - for section in sections: - title_number = title_number + 1 - - # Removes the unwanted sections - if self.tag_to_string(section) in self.exclude_sections: - continue - - # Only includes the wanted sections - if self.include_sections: - if self.tag_to_string(section) not in self.include_sections: - continue - - - title = self.tag_to_string(articles[title_number]) - url = articles[title_number].get('href') - if url.startswith('/'): - url = self.site + url - - self.log('\tFound article:', title, 'at', url) - desc = self.tag_to_string(descriptions[title_number]) - self.log('\t\t', desc) - - article_info.append({'title':title, 'url':url, 'description':desc, - 'date':self.timefmt}) - - if article_info: - feeds.append((self.title, article_info)) - - #self.log(feeds) - return feeds - - def postprocess_html(self, soup, first): - if self.Convert_Grayscale: - #process all the images - for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): - iurl = tag['src'] - img = Image() - img.open(iurl) - if img < 0: - raise RuntimeError('Out of memory') - img.type = "GrayscaleType" - img.save(iurl) - return soup - - def preprocess_html(self, soup): - - # Includes all the figures inside the final ebook - # Finds all the jpg links - for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}): - - # makes sure that the link points to the absolute web address - if figure['href'].startswith('/'): - figure['href'] = self.site + figure['href'] - - figure.name = 'img' # converts the links to img - figure['src'] = figure['href'] # with the same address as href - figure['style'] = 'display:block' # adds /n before and after the image - del figure['href'] - del figure['target'] - - # Makes the title standing out - for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}): - title.name = 'h1' - del title['href'] - del title['target'] - - # Makes the section name more visible - for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}): - section_name.name = 'h5' - del section_name['href'] - del section_name['target'] - - # Removes all 
unrelated links - for link in soup.findAll('a', attrs = {'href': True}): - link.name = 'font' - del link['href'] - del link['target'] - - return soup diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index 7866c89861..a655d598e4 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -252,7 +252,7 @@ class MPRecipe(BasicNewsRecipe): cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' elif __Region__ == 'Toronto': cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/ming_pao_toronto.recipe b/recipes/ming_pao_toronto.recipe index 84001d3952..48aca5ee2c 100644 --- a/recipes/ming_pao_toronto.recipe +++ b/recipes/ming_pao_toronto.recipe @@ -229,7 +229,7 @@ class MPRecipe(BasicNewsRecipe): cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' elif __Region__ == 'Toronto': cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/ming_pao_vancouver.recipe b/recipes/ming_pao_vancouver.recipe index 8dc2c78cb7..686dbd0bf7 100644 --- a/recipes/ming_pao_vancouver.recipe +++ b/recipes/ming_pao_vancouver.recipe @@ -229,7 +229,7 @@ class MPRecipe(BasicNewsRecipe): cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg' elif __Region__ == 'Toronto': cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index d019efb94c..1eaa08d23a 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -1,15 +1,27 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai - +import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): - title = u'Mlody technik' + title = u'Młody technik' __author__ = 'fenuks' description = u'Młody technik' category = 'science' language = 'pl' cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + preprocess_regexps = [(re.compile(r"

Podobne

", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 - #keep_only_tags=[dict(id='container')] - feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')] + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':'st-related-posts'})] + remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + (u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] diff --git a/recipes/mobile_bulgaria.recipe b/recipes/mobile_bulgaria.recipe new file mode 100644 index 0000000000..85440cf376 --- /dev/null +++ b/recipes/mobile_bulgaria.recipe @@ -0,0 +1,27 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1329123365(BasicNewsRecipe): + title = u'Mobilebulgaria.com' + __author__ = 'M3 Web' + description = 'The biggest Bulgarian site covering mobile consumer electronics. Offers detailed reviews, popular discussion forum, shop and platform for selling new and second hand phones and gadgets.' + category = 'News, Reviews, Offers, Forum' + oldest_article = 45 + max_articles_per_feed = 10 + language = 'bg' + encoding = 'windows-1251' + no_stylesheets = False + remove_javascript = True + keep_only_tags = [dict(name='div', attrs={'class':'bigblock'}), +dict(name='div', attrs={'class':'verybigblock'}), +dict(name='table', attrs={'class':'obiaviresults'}), +dict(name='div', attrs={'class':'forumblock'}), +dict(name='div', attrs={'class':'forumblock_b1'}), +dict(name='div', attrs={'class':'block2_2colswrap'})] + + feeds = [(u'News', u'http://www.mobilebulgaria.com/rss_full.php'), +(u'Reviews', u'http://www.mobilebulgaria.com/rss_reviews.php'), +(u'Offers', u'http://www.mobilebulgaria.com/obiavi/rss.php'), +(u'Forum', u'http://www.mobilebulgaria.com/rss_forum_last10.php')] + + extra_css = ''' + #gallery1 div{display: block; float: left; margin: 0 10px 10px 0;} ''' diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe index 49a5089b5c..66f951f62f 100644 --- a/recipes/montreal_gazette.recipe +++ b/recipes/montreal_gazette.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/more_intelligent_life.recipe b/recipes/more_intelligent_life.recipe new file mode 100644 index 0000000000..e90f883080 --- /dev/null +++ b/recipes/more_intelligent_life.recipe @@ -0,0 +1,67 @@ +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' +''' +moreintelligentlife.com +''' + +from calibre.web.feeds.news 
import BasicNewsRecipe + +class MoreIntelligentLife(BasicNewsRecipe): + title = 'More Intelligent Life' + __author__ = 'Darko Miletic' + description = "More Intelligent Life (moreintelligentlife.com) is the online version of Intelligent Life, a lifestyle and culture magazine from The Economist. The website offers not only content from the print edition, trickled out over the course of its shelf-life, but also the Editors' Blog, which carries daily posts from the editorial team-quickfire observations and opinions that allow readers to eavesdrop on the conversation in the office." + publisher = 'The Economist Newspaper ltd' + category = 'arts,lifestyle,intelligent life,the economist,ideas,style,culture' + oldest_article = 60 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + publication_type = 'website' + extra_css = """ + body{font-family: Arial,"Helvetica neue","Bitstream Vera Sans",sans-serif} + img{margin-bottom: 0.4em; display:block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [dict(attrs={'class':'node'})] + remove_tags_after = dict(attrs={'class':'tags'}) + remove_tags = [dict(name=['meta','link','iframe','embed','object'])] + remove_attributes = ['lang'] + + feeds = [(u'Articles', u'http://feeds.feedburner.com/MoreintelligentlifeTotal')] + + def get_cover_url(self): + soup = self.index_to_soup('http://moreintelligentlife.com/') + for image in soup.findAll('img', src=True): + if image['src'].startswith('http://moreintelligentlife.com/files/covers/current_issue_'): + return image['src'] + return None + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe new file mode 100644 index 0000000000..c524c18b26 --- /dev/null +++ b/recipes/nauka_w_polsce.recipe @@ -0,0 +1,47 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class NaukawPolsce(BasicNewsRecipe): + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' 
+ category = 'science' + language = 'pl' + cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + index = 'http://www.naukawpolsce.pl' + keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] + remove_tags = [dict(name='div', attrs={'class':'tagi'})] + + def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + for i in soup.findAll(name='div', attrs={'class':'aktualnosci-margines lista-depesz information-content'}): + title = i.h1.a.string + url = self.index + i.h1.a['href'] + date = '' #i.span.string + articles.append({'title' : title, + 'url' : url, + 'date' : date, + 'description' : '' + }) + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Historia i kultura", self.find_articles('http://www.naukawpolsce.pl/historia-i-kultura/'))) + feeds.append((u"Kosmos", self.find_articles('http://www.naukawpolsce.pl/kosmos/'))) + feeds.append((u"Przyroda", self.find_articles('http://www.naukawpolsce.pl/przyroda/'))) + feeds.append((u"Społeczeństwo", self.find_articles('http://www.naukawpolsce.pl/spoleczenstwo/'))) + feeds.append((u"Technologie", self.find_articles('http://www.naukawpolsce.pl/technologie/'))) + feeds.append((u"Uczelnie", self.find_articles('http://www.naukawpolsce.pl/uczelnie/'))) + feeds.append((u"Nauki medyczne", self.find_articles('http://www.naukawpolsce.pl/zdrowie/'))) + + return feeds + + def preprocess_html(self, soup): + for p in soup.findAll(name='p', text=re.compile(' ')): + p.extract() + return soup diff --git a/recipes/navegalo.recipe b/recipes/navegalo.recipe new file mode 100644 index 0000000000..89f6cde45d --- /dev/null +++ b/recipes/navegalo.recipe @@ -0,0 +1,40 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1360354988(BasicNewsRecipe): + title = u'Navegalo.com' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + +from calibre.web.feeds.news import BasicNewsRecipe + +class navegalonews(BasicNewsRecipe): + __author__ = 'Douglas Delgado' + title = u'Navegalo.com' + publisher = 'Navegalo.com' + description = 'Noticias actualizadas por Navegalo.com. 
Recipe creado por Douglas Delgado (doudelgado@gmail.com) para su uso con Calibre' + category = 'Spanish, Entertainment' + masthead_url = 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQZhML5lwsdss6FFF7CFR0Sf-Ln052Zmhs1TlIOcAL8JWN8a-dPlA' + + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + auto_cleanup = True + encoding = 'utf-8' + language = 'es_CR' + use_embedded_content = False + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + + + feeds = [(u'Nacionales', u'http://www.navegalo.com/es/noticias/noticias/noticias-nacionales?format=feed&type=rss'), (u'Internacionales', u'http://direct.navegalo.com/es/noticias/noticias/noticias-internacionales?format=feed&type=rss'), (u'Deportes', u'http://direct.navegalo.com/es/noticias/noticias/deportes-nacionales?format=feed&type=rss'), (u'Solo futbol', u'http://www.navegalo.com/es/noticias/noticias/solo-futbol?format=feed&type=rss'), (u'Entretenimiento', u'http://www.navegalo.com/es/noticias/noticias/entretenimiento?format=feed&type=rss'), (u'Solo para ellas', u'http://www.navegalo.com/es/noticias/noticias/solo-para-ellas?format=feed&type=rss'), (u'Infiltrados', u'http://direct.navegalo.com/es/noticias/noticias/infiltrados?format=feed&type=rss'), (u'Mano a mano', u'http://direct.navegalo.com/es/noticias/noticias/mano-a-mano?format=feed&type=rss')] + + + + + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;} + ''' + diff --git a/recipes/nbonline.recipe b/recipes/nbonline.recipe index 82b7667a5c..ff8b0d16b2 100644 --- a/recipes/nbonline.recipe +++ b/recipes/nbonline.recipe @@ -18,7 +18,7 @@ class NBOnline(BasicNewsRecipe): return 'http://business.nikkeibp.co.jp/images/nbo/200804/parts/logo.gif' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://signon.nikkeibp.co.jp/front/login/?ct=p&ts=nbo') br.select_form(name='loginActionForm') diff --git a/recipes/nejm.recipe b/recipes/nejm.recipe index bc12fbcedf..9e6c4ceddc 100644 --- a/recipes/nejm.recipe +++ b/recipes/nejm.recipe @@ -16,7 +16,7 @@ class NYTimes(BasicNewsRecipe): #TO LOGIN def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.nejm.org/action/showLogin?uri=http://www.nejm.org/') br.select_form(name='frmLogin') br['login'] = self.username diff --git a/recipes/new_scientist.recipe b/recipes/new_scientist.recipe index 1bfe27685f..521a7ba0d8 100644 --- a/recipes/new_scientist.recipe +++ b/recipes/new_scientist.recipe @@ -68,7 +68,7 @@ class NewScientist(BasicNewsRecipe): url_list = [] # This list is used to check if an article had already been included. 
def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.newscientist.com/') if self.username is not None and self.password is not None: br.open('https://www.newscientist.com/user/login') diff --git a/recipes/new_york_review_of_books.recipe b/recipes/new_york_review_of_books.recipe index bd18b95c43..2da9536da3 100644 --- a/recipes/new_york_review_of_books.recipe +++ b/recipes/new_york_review_of_books.recipe @@ -31,7 +31,7 @@ class NewYorkReviewOfBooks(BasicNewsRecipe): m:'')] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.nybooks.com/account/signin/') br.select_form(nr = 1) br['username'] = self.username @@ -66,21 +66,22 @@ class NewYorkReviewOfBooks(BasicNewsRecipe): self.log('Issue date:', date) # Find TOC - toc = soup.find('ul', attrs={'class':'issue-article-list'}) + tocs = soup.findAll('ul', attrs={'class':'issue-article-list'}) articles = [] - for li in toc.findAll('li'): - h3 = li.find('h3') - title = self.tag_to_string(h3) - author = self.tag_to_string(li.find('h4')) - title = title + u' (%s)'%author - url = 'http://www.nybooks.com'+h3.find('a', href=True)['href'] - desc = '' - for p in li.findAll('p'): - desc += self.tag_to_string(p) - self.log('Found article:', title) - self.log('\t', url) - self.log('\t', desc) - articles.append({'title':title, 'url':url, 'date':'', + for toc in tocs: + for li in toc.findAll('li'): + h3 = li.find('h3') + title = self.tag_to_string(h3) + author = self.tag_to_string(li.find('h4')) + title = title + u' (%s)'%author + url = 'http://www.nybooks.com'+h3.find('a', href=True)['href'] + desc = '' + for p in li.findAll('p'): + desc += self.tag_to_string(p) + self.log('Found article:', title) + self.log('\t', url) + self.log('\t', desc) + articles.append({'title':title, 'url':url, 'date':'', 'description':desc}) return [('Current Issue', articles)] diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index c5f1b0aff2..2730b45d6d 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2011, Darko Miletic ' +__copyright__ = '2008-2013, Darko Miletic ' ''' newyorker.com ''' @@ -44,20 +44,18 @@ class NewYorker(BasicNewsRecipe): , 'language' : language } - keep_only_tags = [ - dict(name='div', attrs={'class':'headers'}) ,dict(name='div', attrs={'id':['articleheads','items-container','articleRail','articletext','photocredits']}) - ] + keep_only_tags = [dict(name='div', attrs={'id':'pagebody'})] remove_tags = [ dict(name=['meta','iframe','base','link','embed','object']) - ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] }) + ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons','social-utils-top','entry-keywords','entry-categories','utilsPrintEmail'] }) ,dict(attrs={'id':['show-header','show-footer'] }) ] + remove_tags_after = dict(attrs={'class':'entry-content'}) remove_attributes = ['lang'] feeds = [(u'The New Yorker', u'http://www.newyorker.com/services/mrss/feeds/everything.xml')] def print_version(self, url): - return url + '?printable=true' + return url + '?printable=true&currentPage=all' def image_url_processor(self, baseurl, url): return url.strip() diff --git a/recipes/nikkei_news.recipe b/recipes/nikkei_news.recipe index aa351e0a0d..9a974e9596 100644 --- a/recipes/nikkei_news.recipe +++ b/recipes/nikkei_news.recipe @@ -13,8 +13,11 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe):
max_articles_per_feed = 30 language = 'ja' no_stylesheets = True - cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' - masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' + #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' + masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' + cover_margins = (10, 188, '#ffffff') remove_tags_before = {'class':"cmn-indent"} remove_tags = [ @@ -27,7 +30,7 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe): remove_tags_after = {'class':"cmn-indent"} def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) #pp.pprint(self.parse_index()) #exit(1) @@ -40,8 +43,11 @@ class NikkeiNet_paper_subscription(BasicNewsRecipe): print "-------------------------open top page-------------------------------------" br.open('http://www.nikkei.com/') print "-------------------------open first login form-----------------------------" - link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next() - br.follow_link(link) + try: + url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url + except StopIteration: + url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F' + br.open(url) #br.follow_link(link) #response = br.response() #print response.get_data() print "-------------------------JS redirect(send autoPostForm)--------------------" diff --git a/recipes/nikkei_sub.recipe b/recipes/nikkei_sub.recipe index 18f324009a..cb76577fa1 100644 --- a/recipes/nikkei_sub.recipe +++ b/recipes/nikkei_sub.recipe @@ -25,7 +25,7 @@ class NikkeiNet_subscription(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_economy.recipe b/recipes/nikkei_sub_economy.recipe index 8e7a68dfe7..7a256f7553 100644 --- a/recipes/nikkei_sub_economy.recipe +++ b/recipes/nikkei_sub_economy.recipe @@ -44,7 +44,7 @@ class NikkeiNet_sub_economy(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_industry.recipe b/recipes/nikkei_sub_industry.recipe index 81e86767d0..11a17b2415 100644 --- a/recipes/nikkei_sub_industry.recipe +++ b/recipes/nikkei_sub_industry.recipe @@ -41,7 +41,7 @@ class NikkeiNet_sub_industory(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_life.recipe b/recipes/nikkei_sub_life.recipe index 60e5b170ca..c2b908ca98 100644 --- a/recipes/nikkei_sub_life.recipe +++ b/recipes/nikkei_sub_life.recipe @@ -38,7 +38,7 @@ class NikkeiNet_sub_life(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_main.recipe b/recipes/nikkei_sub_main.recipe index 485d2f32c0..84503cccf3 100644 --- a/recipes/nikkei_sub_main.recipe +++ b/recipes/nikkei_sub_main.recipe @@ -37,7 +37,7 @@ class NikkeiNet_sub_main(BasicNewsRecipe): feeds = [ 
(u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_shakai.recipe b/recipes/nikkei_sub_shakai.recipe index 9a53e910e6..be21b3c43f 100644 --- a/recipes/nikkei_sub_shakai.recipe +++ b/recipes/nikkei_sub_shakai.recipe @@ -36,7 +36,7 @@ class NikkeiNet_sub_shakai(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nikkei_sub_sports.recipe b/recipes/nikkei_sub_sports.recipe index 644b0aa252..47e335a4c3 100644 --- a/recipes/nikkei_sub_sports.recipe +++ b/recipes/nikkei_sub_sports.recipe @@ -42,7 +42,7 @@ class NikkeiNet_sub_sports(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) cj = mechanize.LWPCookieJar() br.set_cookiejar(cj) diff --git a/recipes/nin.recipe b/recipes/nin.recipe index 78c9dd4324..c6e54a2fe2 100644 --- a/recipes/nin.recipe +++ b/recipes/nin.recipe @@ -15,7 +15,7 @@ class Nin(BasicNewsRecipe): publisher = 'NIN d.o.o. - Ringier d.o.o.' category = 'news, politics, Serbia' no_stylesheets = True - oldest_article = 15 + oldest_article = 180 encoding = 'utf-8' needs_subscription = True remove_empty_feeds = True @@ -25,7 +25,7 @@ class Nin(BasicNewsRecipe): use_embedded_content = False language = 'sr' publication_type = 'magazine' - masthead_url = 'http://www.nin.co.rs/img/head/logo.jpg' + masthead_url = 'http://www.nin.co.rs/img/logo_print.jpg' extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana, Lucida, sans1, sans-serif} @@ -42,16 +42,16 @@ class Nin(BasicNewsRecipe): , 'tags' : category , 'publisher' : publisher , 'language' : language + , 'linearize_tables': True } preprocess_regexps = [ - (re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '') - ,(re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '') + (re.compile(r'
.*', re.DOTALL|re.IGNORECASE),lambda match: '') ,(re.compile(u'\u0110'), lambda match: u'\u00D0') ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.INDEX) br.select_form(name='form1') @@ -60,42 +60,21 @@ class Nin(BasicNewsRecipe): br.submit() return br - keep_only_tags =[dict(name='td', attrs={'width':'520'})] - remove_tags_before =dict(name='span', attrs={'class':'izjava'}) - remove_tags_after =dict(name='html') - remove_tags = [ - dict(name=['object','link','iframe','meta','base']) - ,dict(attrs={'class':['fb-like','twitter-share-button']}) - ,dict(attrs={'rel':'nofollow'}) - ] - remove_attributes=['border','background','height','width','align','valign'] + remove_tags_before = dict(name='div', attrs={'class':'titleFont'}) + remove_tags_after = dict(name='div', attrs={'class':'standardFont'}) + remove_tags = [dict(name=['object','link','iframe','meta','base'])] + remove_attributes = ['border','background','height','width','align','valign'] def get_cover_url(self): cover_url = None soup = self.index_to_soup(self.INDEX) - for item in soup.findAll('a', href=True): - if item['href'].startswith('/pages/issue.php?id='): - simg = item.find('img') - if simg: - return self.PREFIX + item.img['src'] + cover = soup.find('img', attrs={'class':'issueImg'}) + if cover: + return self.PREFIX + cover['src'] return cover_url feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')] - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('div'): - if len(item.contents) == 0: - item.extract() - for item in soup.findAll(['td','tr']): - item.name='div' - for item in soup.findAll('img'): - if not item.has_key('alt'): - item['alt'] = 'image' - for tbl in soup.findAll('table'): - img = tbl.find('img') - if img: - img.extract() - tbl.replaceWith(img) - return soup + def print_version(self, url): + return url + '&pf=1' + diff --git a/recipes/novilist_novine_hr.recipe b/recipes/novilist_novine_hr.recipe index 4cd3e8277a..26d4eebe18 100644 --- a/recipes/novilist_novine_hr.recipe +++ b/recipes/novilist_novine_hr.recipe @@ -53,7 +53,7 @@ class NoviList_hr(BasicNewsRecipe): remove_attributes=['border', 'lang', 'size', 'face', 'bgcolor'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.index + 'loginnow.asp') br.select_form(nr=0) diff --git a/recipes/novistandard.recipe b/recipes/novistandard.recipe index 37d3657e41..57b93b6554 100644 --- a/recipes/novistandard.recipe +++ b/recipes/novistandard.recipe @@ -47,7 +47,7 @@ class NoviStandard(BasicNewsRecipe): preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) if self.username is not None and self.password is not None: br.select_form(name='login') diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe index 0371cb1f58..7715b9826a 100644 --- a/recipes/nowa_fantastyka.recipe +++ b/recipes/nowa_fantastyka.recipe @@ -60,7 +60,7 @@ class Nowa_Fantastyka(BasicNewsRecipe): return getattr(self, 'cover_url', self.cover_url) def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: 
br.open('http://www.fantastyka.pl/') br.select_form(nr=0) diff --git a/recipes/nrc-nl-epub.recipe b/recipes/nrc-nl-epub.recipe index 2d190e4d0a..961eb723c2 100644 --- a/recipes/nrc-nl-epub.recipe +++ b/recipes/nrc-nl-epub.recipe @@ -29,7 +29,7 @@ class NRCHandelsblad(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://login.nrc.nl/login') br.select_form(nr=0) diff --git a/recipes/nsfw_corp.recipe b/recipes/nsfw_corp.recipe index c88bdd705e..37287b875e 100644 --- a/recipes/nsfw_corp.recipe +++ b/recipes/nsfw_corp.recipe @@ -6,7 +6,6 @@ www.nsfwcorp.com ''' import urllib -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class NotSafeForWork(BasicNewsRecipe): @@ -21,8 +20,9 @@ class NotSafeForWork(BasicNewsRecipe): needs_subscription = True auto_cleanup = False INDEX = 'https://www.nsfwcorp.com' - LOGIN = INDEX + '/login' - use_embedded_content = False + LOGIN = INDEX + '/login/target/' + SETTINGS = INDEX + '/settings/' + use_embedded_content = True language = 'en' publication_type = 'magazine' masthead_url = 'http://assets.nsfwcorp.com/media/headers/nsfw_banner.jpg' @@ -46,17 +46,8 @@ class NotSafeForWork(BasicNewsRecipe): , 'language' : language } - remove_tags_before = dict(attrs={'id':'fromToLine'}) - remove_tags_after = dict(attrs={'id':'unlockButtonDiv'}) - remove_tags=[ - dict(name=['meta', 'link', 'iframe', 'embed', 'object']) - ,dict(name='a', attrs={'class':'switchToDeskNotes'}) - ,dict(attrs={'id':'unlockButtonDiv'}) - ] - remove_attributes = ['lang'] - def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.LOGIN) if self.username is not None and self.password is not None: data = urllib.urlencode({ 'email':self.username @@ -65,30 +56,12 @@ class NotSafeForWork(BasicNewsRecipe): br.open(self.LOGIN, data) return br - def parse_index(self): - articles = [] - soup = self.index_to_soup(self.INDEX) - dispatches = soup.find(attrs={'id':'dispatches'}) - if dispatches: - for item in dispatches.findAll('h3'): - description = u'' - title_link = item.find('span', attrs={'class':'dispatchTitle'}) - description_link = item.find('span', attrs={'class':'dispatchSubtitle'}) - feed_link = item.find('a', href=True) - if feed_link: - url = self.INDEX + feed_link['href'] - title = self.tag_to_string(title_link) - description = self.tag_to_string(description_link) - date = strftime(self.timefmt) - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) - return [('Dispatches', articles)] + def get_feeds(self): + self.feeds = [] + soup = self.index_to_soup(self.SETTINGS) + for item in soup.findAll('input', attrs={'type':'text'}): + if item.has_key('value') and item['value'].startswith('http://www.nsfwcorp.com/feed/'): + self.feeds.append(item['value']) + return self.feeds + return self.feeds - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - return soup diff --git a/recipes/nspm.recipe b/recipes/nspm.recipe index f5a54b0a9a..0be78bc652 100644 --- a/recipes/nspm.recipe +++ b/recipes/nspm.recipe @@ -45,7 +45,7 @@ class Nspm(BasicNewsRecipe): remove_attributes = ['width','height','lang','xmlns:fb','xmlns:og','vspace','hspace','type','start','size'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) return br diff --git 
a/recipes/nursingtimes.recipe b/recipes/nursingtimes.recipe index 699bc281f3..f18baa9fff 100644 --- a/recipes/nursingtimes.recipe +++ b/recipes/nursingtimes.recipe @@ -30,7 +30,7 @@ class NursingTimes(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.LOGIN) if self.username is not None and self.password is not None: data = urllib.urlencode({ 'campaigncode' :'0' diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index bf138ee289..d0f311818e 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -6,22 +6,50 @@ __copyright__ = '2008, Kovid Goyal ' nytimes.com ''' import re, string, time -from calibre import entity_to_unicode, strftime +from calibre import strftime from datetime import timedelta, date +from time import sleep from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup - class NYTimes(BasicNewsRecipe): + recursions=1 # set this to zero to omit Related articles lists + match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed + + # set getTechBlogs to True to include the technology blogs + # set tech_oldest_article to control article age + # set tech_max_articles_per_feed to control article count + getTechBlogs = True + remove_empty_feeds = True + tech_oldest_article = 14 + tech_max_articles_per_feed = 25 + + # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles + # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category) + getPopularArticles = True + popularPeriod = '1' # set this to the number of days to include in the measurement + # e.g. 7 will get the most popular measured over the last 7 days + # and 30 will get the most popular measured over 30 days. + # you still only get up to 20 articles in each category + + # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored. headlinesOnly = True - # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the - # number of days old an article can be for inclusion. If oldest_article = 0 all articles - # will be included. Note: oldest_article is ignored if webEdition = False + # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the + # number of days old an article can be for inclusion. If oldest_web_article = None all articles + # will be included. Note: oldest_web_article is ignored if webEdition = False webEdition = False - oldest_article = 7 + oldest_web_article = 7 + + # download higher resolution images than the small thumbnails typically included in the article + # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper + useHighResImages = True + + # replace paid Kindle Version: the name will be changed to "The New York Times" to cause + # previous paid versions of the new york times to best sent to the back issues folder on the kindle + replaceKindleVersion = False # includeSections: List of sections to include. If empty, all sections found will be included. # Otherwise, only the sections named will be included. 
For example, @@ -82,79 +110,123 @@ class NYTimes(BasicNewsRecipe): ('Education',u'education'), ('Multimedia',u'multimedia'), (u'Obituaries',u'obituaries'), - (u'Sunday Magazine',u'magazine'), - (u'Week in Review',u'weekinreview')] + (u'Sunday Magazine',u'magazine') + ] + + tech_feeds = [ + (u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'), + (u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'), + (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'), + (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/') + ] if headlinesOnly: title='New York Times Headlines' - description = 'Headlines from the New York Times. Needs a subscription from http://www.nytimes.com' - needs_subscription = 'optional' + description = 'Headlines from the New York Times' + needs_subscription = False elif webEdition: title='New York Times (Web)' description = 'New York Times on the Web' - needs_subscription = True + needs_subscription = False + elif replaceKindleVersion: + title='The New York Times' + description = 'Today\'s New York Times' + needs_subscription = False else: title='New York Times' description = 'Today\'s New York Times' - needs_subscription = True + needs_subscription = False - - month_list = ['january','february','march','april','may','june','july','august','september','october','november','december'] - - def decode_us_date(self,datestr): - udate = datestr.strip().lower().split() + def decode_url_date(self,url): + urlitems = url.split('/') try: - m = self.month_list.index(udate[0])+1 + d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5])) except: - return date.today() - d = int(udate[1]) - y = int(udate[2]) - try: - d = date(y,m,d) - except: - d = date.today + try: + d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6])) + except: + return None return d - earliest_date = date.today() - timedelta(days=oldest_article) + if oldest_web_article is None: + earliest_date = date.today() + else: + earliest_date = date.today() - timedelta(days=oldest_web_article) + oldest_article = 365 # by default, a long time ago __author__ = 'GRiker/Kovid Goyal/Nick Redding' language = 'en' requires_version = (0, 7, 5) - + encoding = 'utf-8' timefmt = '' - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + + #simultaneous_downloads = 1 # no longer required to deal with ads + cover_margins = (18,18,'grey99') remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':[ - 'articleFooter', - 'articleTools', - 'columnGroup doubleRule', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'icon enlargeThis', - 'leftNavTabs', - 'metaFootnote', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - re.compile('^subNavigation'), - re.compile('^leaderboard'), - re.compile('^module'), - ]}), - dict(id=[ + remove_tags = [ + dict(attrs={'class':[ + 'articleFooter', + 'articleTools', + 'columnGroup singleRule', + 'columnGroup last', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'entry-response module', + 'leftNavTabs', + 'metaFootnote', + 'module box nav', + 'nextArticleLink', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 'singleAd', + 'entry entry-utility', #added for DealBook + 'entry-tags', #added for DealBook + 'footer promos clearfix', #added for 
DealBook + 'footer links clearfix', #added for DealBook + 'tabsContainer', #added for other blog downloads + 'column lastColumn', #added for other blog downloads + 'pageHeaderWithLabel', #added for other gadgetwise downloads + 'column two', #added for other blog downloads + 'column two last', #added for other blog downloads + 'column three', #added for other blog downloads + 'column three last', #added for other blog downloads + 'column four',#added for other blog downloads + 'column four last',#added for other blog downloads + 'column last', #added for other blog downloads + 'entry entry-related', + 'subNavigation tabContent active', #caucus blog navigation + 'mediaOverlay slideshow', + 'wideThumb', + 'video', #added 02-11-2011 + 'videoHeader',#added 02-11-2011 + 'articleInlineVideoHolder', #added 02-11-2011 + 'assetCompanionAd', + re.compile('^subNavigation'), + re.compile('^leaderboard'), + re.compile('^module'), + re.compile('commentCount'), + 'credit' + ]}), + dict(name='div', attrs={'class':re.compile('toolsList')}), # bits + dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits + dict(name='div', attrs={'class':'tweet'}), + dict(name='span', attrs={'class':'commentCount meta'}), + dict(name='div', attrs={'id':'header'}), + dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open + dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise + dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise + dict(name='div', attrs={'id':re.compile('respond')}), # open + dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue + dict(id=[ 'adxLeaderboard', 'adxSponLink', 'archive', @@ -183,22 +255,29 @@ class NYTimes(BasicNewsRecipe): 'side_index', 'side_tool', 'toolsRight', + 'skybox', #added for DealBook + 'TopAd', #added for DealBook + 'related-content', #added for DealBook ]), - dict(name=['script', 'noscript', 'style','form','hr'])] + dict(name=['script', 'noscript', 'style','form','hr'])] no_stylesheets = True extra_css = ''' .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } - .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } - .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .timestamp { text-align: left; font-size: small; } - .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { font-weight: normal; text-align: left; font-size: 50%; } + .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; 
margin-right:0; margin-bottom: 0; } a:link {text-decoration: none; } + .date{font-size: 50%; } + .update{font-size: 50%; } .articleBody { } - .authorId {text-align: left; } + .authorId {text-align: left; font-size: 50%; } .image {text-align: center;} - .source {text-align: left; }''' + .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;} + .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;} + .source {text-align: left; font-size: x-small; }''' articles = {} @@ -222,11 +301,11 @@ class NYTimes(BasicNewsRecipe): del ans[idx] idx_max = idx_max-1 continue - if self.verbose: + if True: #self.verbose self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) for article in ans[idx][1]: total_article_count += 1 - if self.verbose: + if True: #self.verbose self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), article['url'].encode('cp1252','replace'))) idx = idx+1 @@ -237,7 +316,7 @@ class NYTimes(BasicNewsRecipe): def exclude_url(self,url): if not url.startswith("http"): return True - if not url.endswith(".html"): + if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook return True if 'nytimes.com' not in url: return True @@ -279,89 +358,79 @@ class NYTimes(BasicNewsRecipe): return fixed def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('http://www.nytimes.com/auth/login') - br.form = br.forms().next() - br['userid'] = self.username - br['password'] = self.password - raw = br.submit().read() - if 'Please try again' in raw: - raise Exception('Your username and password are incorrect') + br = BasicNewsRecipe.get_browser(self) return br - def skip_ad_pages(self, soup): - # Skip ad pages served before actual article - skip_tag = soup.find(True, {'name':'skip'}) - if skip_tag is not None: - self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) - url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) - url += '?pagewanted=all' - self.log.warn("Skipping ad to article at '%s'" % url) - return self.index_to_soup(url, raw=True) - + cover_tag = 'NY_NYT' def get_cover_url(self): - cover = None - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = "%.2d" % st.tm_mday - cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg' - br = BasicNewsRecipe.get_browser() + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' + br = BasicNewsRecipe.get_browser(self) + daysback=1 try: br.open(cover) except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg' + br = BasicNewsRecipe.get_browser(self) + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: self.log("\nCover unavailable") cover = None return cover + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + + def short_title(self): return self.title - def index_to_soup(self, url_or_raw, raw=False): - ''' - OVERRIDE of class method - deals with various page encodings between index and articles - ''' - def get_the_soup(docEncoding, url_or_raw, raw=False) : - if re.match(r'\w+://', url_or_raw): - br = self.clone_browser(self.browser) - f = br.open_novisit(url_or_raw) + + 
def article_to_soup(self, url_or_raw, raw=False): + from contextlib import closing + import copy + from calibre.ebooks.chardet import xml_to_unicode + print("ARTICLE_TO_SOUP "+url_or_raw) + if re.match(r'\w+://', url_or_raw): + br = self.clone_browser(self.browser) + open_func = getattr(br, 'open_novisit', br.open) + with closing(open_func(url_or_raw)) as f: _raw = f.read() - f.close() - if not _raw: - raise RuntimeError('Could not fetch index from %s'%url_or_raw) + if not _raw: + raise RuntimeError('Could not fetch index from %s'%url_or_raw) + else: + _raw = url_or_raw + if raw: + return _raw + if not isinstance(_raw, unicode) and self.encoding: + if callable(self.encoding): + _raw = self.encoding(_raw) else: - _raw = url_or_raw - if raw: - return _raw + _raw = _raw.decode(self.encoding, 'replace') - if not isinstance(_raw, unicode) and self.encoding: - _raw = _raw.decode(docEncoding, 'replace') - massage = list(BeautifulSoup.MARKUP_MASSAGE) - massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding))) - return BeautifulSoup(_raw, markupMassage=massage) + nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) + nmassage.extend(self.preprocess_regexps) + nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] + # Some websites have buggy doctype declarations that mess up beautifulsoup + # Remove comments as they can leave detritus when extracting tags leaves + # multiple nested comments + nmassage.append((re.compile(r'', re.DOTALL), lambda m: '')) + usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0] + usrc = self.preprocess_raw_html(usrc, url_or_raw) + return BeautifulSoup(usrc, markupMassage=nmassage) - # Entry point - soup = get_the_soup( self.encoding, url_or_raw ) - contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) - docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] - if docEncoding == '' : - docEncoding = self.encoding - - if self.verbose > 2: - self.log( " document encoding: '%s'" % docEncoding) - if docEncoding != self.encoding : - soup = get_the_soup(docEncoding, url_or_raw) - - return soup def massageNCXText(self, description): # Kindle TOC descriptions won't render certain characters if description: massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) # Replace '&' with '&' - massaged = re.sub("&","&", massaged) + massaged = re.sub("&","&", massaged) + massaged = re.sub("&","&", massaged) return self.fixChars(massaged) else: return description @@ -383,6 +452,16 @@ class NYTimes(BasicNewsRecipe): if self.filterDuplicates: if url in self.url_list: return + if self.webEdition: + date_tag = self.decode_url_date(url) + if date_tag is not None: + if self.oldest_web_article is not None: + if date_tag < self.earliest_date: + self.log("Skipping article %s" % url) + return + else: + self.log("Skipping article %s" % url) + return self.url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() description = '' @@ -407,6 +486,92 @@ class NYTimes(BasicNewsRecipe): description=description, author=author, content='')) + def get_popular_articles(self,ans): + if self.getPopularArticles: + popular_articles = {} + key_list = [] + + def handleh3(h3tag): + try: + url = h3tag.a['href'] + except: + return ('','','','') + url = re.sub(r'\?.*', '', url) + if self.exclude_url(url): + return ('','','','') + url += '?pagewanted=all' + title = self.tag_to_string(h3tag.a,False) + h6tag = 
h3tag.findNextSibling('h6') + if h6tag is not None: + author = self.tag_to_string(h6tag,False) + else: + author = '' + ptag = h3tag.findNextSibling('p') + if ptag is not None: + desc = self.tag_to_string(ptag,False) + else: + desc = '' + return(title,url,author,desc) + + + have_emailed = False + emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod) + for h3tag in emailed_soup.findAll('h3'): + (title,url,author,desc) = handleh3(h3tag) + if url=='': + continue + if not have_emailed: + key_list.append('Most E-Mailed') + popular_articles['Most E-Mailed'] = [] + have_emailed = True + popular_articles['Most E-Mailed'].append( + dict(title=title, url=url, date=strftime('%a, %d %b'), + description=desc, author=author, + content='')) + have_viewed = False + viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod) + for h3tag in viewed_soup.findAll('h3'): + (title,url,author,desc) = handleh3(h3tag) + if url=='': + continue + if not have_viewed: + key_list.append('Most Viewed') + popular_articles['Most Viewed'] = [] + have_viewed = True + popular_articles['Most Viewed'].append( + dict(title=title, url=url, date=strftime('%a, %d %b'), + description=desc, author=author, + content='')) + viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)] + for x in viewed_ans: + ans.append(x) + return ans + + def get_tech_feeds(self,ans): + if self.getTechBlogs: + tech_articles = {} + key_list = [] + save_oldest_article = self.oldest_article + save_max_articles_per_feed = self.max_articles_per_feed + self.oldest_article = self.tech_oldest_article + self.max_articles_per_feed = self.tech_max_articles_per_feed + self.feeds = self.tech_feeds + tech = self.parse_feeds() + self.oldest_article = save_oldest_article + self.max_articles_per_feed = save_max_articles_per_feed + self.feeds = None + for f in tech: + key_list.append(f.title) + tech_articles[f.title] = [] + for a in f.articles: + tech_articles[f.title].append( + dict(title=a.title, url=a.url, date=a.date, + description=a.summary, author=a.author, + content=a.content)) + tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)] + for x in tech_ans: + ans.append(x) + return ans def parse_web_edition(self): @@ -418,31 +583,41 @@ class NYTimes(BasicNewsRecipe): if sec_title in self.excludeSections: print "SECTION EXCLUDED: ",sec_title continue + try: + soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + except: + continue print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html' - soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + self.key = sec_title # Find each article for div in soup.findAll(True, - attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): - if div['class'] in ['story', 'story headline'] : + attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + if div['class'] in ['story', 'story headline', 'storyHeader'] : self.handle_article(div) + elif div['class'] == 'ledeStory': + divsub = div.find('div','storyHeader') + if divsub is not None: + self.handle_article(divsub) + ulrefer = div.find('ul','refer') + if ulrefer is not None: + for lidiv in ulrefer.findAll('li'): + self.handle_article(lidiv) elif div['class'] == 'headlinesOnly multiline flush': for lidiv in div.findAll('li'): 
self.handle_article(lidiv) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_todays_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - skipping = False # Find each article for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): - if div['class'] in ['section-headline','sectionHeader']: self.key = string.capwords(self.feed_title(div)) self.key = self.key.replace('Op-ed','Op-Ed') @@ -466,7 +641,7 @@ class NYTimes(BasicNewsRecipe): self.handle_article(lidiv) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_headline_index(self): @@ -514,7 +689,7 @@ class NYTimes(BasicNewsRecipe): for h3_item in search_div.findAll('h3'): byline = h3_item.h6 if byline is not None: - author = self.tag_to_string(byline,usa_alt=False) + author = self.tag_to_string(byline,use_alt=False) else: author = '' a = h3_item.find('a', href=True) @@ -540,7 +715,7 @@ class NYTimes(BasicNewsRecipe): self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_index(self): if self.headlinesOnly: @@ -550,174 +725,439 @@ class NYTimes(BasicNewsRecipe): else: return self.parse_todays_index() - def strip_anchors(self,soup): + def strip_anchors(self,soup,kill_all=False): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: - a.replaceWith(a.renderContents().decode('cp1252','replace')) + if kill_all or (self.recursions==0): + a.replaceWith(self.tag_to_string(a,False)) + else: + if a.has_key('href'): + if a['href'].startswith('http://www.nytimes'): + if not a['href'].endswith('pagewanted=all'): + url = re.sub(r'\?.*', '', a['href']) + if self.exclude_url(url): + a.replaceWith(self.tag_to_string(a,False)) + else: + a['href'] = url+'?pagewanted=all' + elif not (a['href'].startswith('http://pogue') or \ + a['href'].startswith('http://bits') or \ + a['href'].startswith('http://travel') or \ + a['href'].startswith('http://business') or \ + a['href'].startswith('http://tech') or \ + a['href'].startswith('http://health') or \ + a['href'].startswith('http://dealbook') or \ + a['href'].startswith('http://open')): + a.replaceWith(self.tag_to_string(a,False)) + return soup + + def handle_tags(self,soup): + try: + print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title)) + except: + print("HANDLE TAGS: NO TITLE") + if soup is None: + print("ERROR: handle_tags received NoneType") + return None + +## print("HANDLING AD FORWARD:") +## print(soup) + if self.keep_only_tags: + body = Tag(soup, 'body') + try: + if isinstance(self.keep_only_tags, dict): + self.keep_only_tags = [self.keep_only_tags] + for spec in self.keep_only_tags: + for tag in soup.find('body').findAll(**spec): + body.insert(len(body.contents), tag) + soup.find('body').replaceWith(body) + except AttributeError: # soup has no body element + pass + + def remove_beyond(tag, next): + while tag is not None and 
getattr(tag, 'name', None) != 'body': + after = getattr(tag, next) + while after is not None: + ns = getattr(tag, next) + after.extract() + after = ns + tag = tag.parent + + if self.remove_tags_after is not None: + rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after + for spec in rt: + tag = soup.find(**spec) + remove_beyond(tag, 'nextSibling') + + if self.remove_tags_before is not None: + tag = soup.find(**self.remove_tags_before) + remove_beyond(tag, 'previousSibling') + + for kwds in self.remove_tags: + for tag in soup.findAll(**kwds): + tag.extract() + return soup def preprocess_html(self, soup): + #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title)) + skip_tag = soup.find(True, {'name':'skip'}) + if skip_tag is not None: + #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) + url = 'http://www.nytimes.com' + skip_tag.parent['href'] + #url += '?pagewanted=all' + self.log.warn("Skipping ad to article at '%s'" % url) + sleep(5) + soup = self.handle_tags(self.article_to_soup(url)) - if self.webEdition & (self.oldest_article>0): - date_tag = soup.find(True,attrs={'class': ['dateline','date']}) - if date_tag: - date_str = self.tag_to_string(date_tag,use_alt=False) - date_str = date_str.replace('Published:','') - date_items = date_str.split(',') + # check if the article is from one of the tech blogs + blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']}) + + if blog is not None: + old_body = soup.find('body') + new_body=Tag(soup,'body') + new_body.append(soup.find('div',attrs={'id':'content'})) + new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html + old_body.replaceWith(new_body) + for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}): + if divr.find(text=re.compile('Sign up')): + divr.extract() + divr = soup.find('div',attrs={'id':re.compile('related-content')}) + if divr is not None: + # handle related articles + rlist = [] + ul = divr.find('ul') + if ul is not None: + for li in ul.findAll('li'): + atag = li.find('a') + if atag is not None: + if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \ + atag['href'].startswith('http://open'): + atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False))) + rlist.append(atag) + divr.extract() + if rlist != []: + asidediv = Tag(soup,'div',[('class','aside')]) + if soup.find('hr') is None: + asidediv.append(Tag(soup,'hr')) + h4 = Tag(soup,'h4',[('class','asidenote')]) + h4.insert(0,"Related Posts") + asidediv.append(h4) + ul = Tag(soup,'ul') + for r in rlist: + li = Tag(soup,'li',[('class','aside')]) + r['class'] = 'aside' + li.append(r) + ul.append(li) + asidediv.append(ul) + asidediv.append(Tag(soup,'hr')) + smain = soup.find('body') + smain.append(asidediv) + for atag in soup.findAll('a'): + img = atag.find('img') + if img is not None: + atag.replaceWith(img) + elif not atag.has_key('href'): + atag.replaceWith(atag.renderContents().decode('cp1252','replace')) + elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \ + atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')): + atag.replaceWith(atag.renderContents().decode('cp1252','replace')) + hdr = soup.find('address') + if hdr is not None: + hdr.name='span' + for span_credit in soup.findAll('span','credit'): + sp = Tag(soup,'span') + span_credit.replaceWith(sp) + 
sp.append(Tag(soup,'br')) + sp.append(span_credit) + sp.append(Tag(soup,'br')) + + else: # nytimes article + + related = [] # these will be the related articles + first_outer = None # first related outer tag + first_related = None # first related tag + for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): + for rdiv in soup.findAll('div','columnGroup doubleRule'): + if rdiv.find('h3') is not None: + if self.tag_to_string(rdiv.h3,False).startswith('Related'): + rdiv.h3.find(text=True).replaceWith("Related articles") + rdiv.h3['class'] = 'asidenote' + for litag in rdiv.findAll('li'): + if litag.find('a') is not None: + if litag.find('a')['href'].startswith('http://www.nytimes.com'): + url = re.sub(r'\?.*', '', litag.find('a')['href']) + litag.find('a')['href'] = url+'?pagewanted=all' + litag.extract() + related.append(litag) + if first_related is None: + first_related = rdiv + first_outer = outerdiv + else: + litag.extract() + if related != []: + for r in related: + if r.h6: # don't want the anchor inside a h6 tag + r.h6.replaceWith(r.h6.a) + first_related.ul.append(r) + first_related.insert(0,Tag(soup,'hr')) + first_related.append(Tag(soup,'hr')) + first_related['class'] = 'aside' + first_outer.replaceWith(first_related) # replace the outer tag with the related tag + + for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): + rdiv.extract() + + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: # remove Op_Ed author head shots + tagline = self.tag_to_string(kicker_tag) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + if img_div: + img_div.extract() + + if self.useHighResImages: try: - datestring = date_items[0]+' '+date_items[1] - article_date = self.decode_us_date(datestring) + #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupreflink = popupref.find('a') + if popupreflink: + reflinkstring = str(popupreflink['href']) + refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('") + refend = reflinkstring.find(".html", refstart) + len(".html") + reflinkstring = reflinkstring[refstart:refend] + + popuppage = self.browser.open(reflinkstring) + popuphtml = popuppage.read() + popuppage.close() + if popuphtml: + st = time.localtime() + year = str(st.tm_year) + month = "%.2d" % st.tm_mon + day = "%.2d" % st.tm_mday + imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4] + popupSoup = BeautifulSoup(popuphtml) + highResTag = popupSoup.find('img', {'src':highResImageLink}) + if highResTag: + try: + newWidth = highResTag['width'] + newHeight = highResTag['height'] + imageTag = popupref.parent.find("img") + except: + self.log("Error: finding width and height of img") + popupref.extract() + if imageTag: + try: + imageTag['src'] = highResImageLink + imageTag['width'] = newWidth + imageTag['height'] = newHeight + except: + self.log("Error setting the src width and height parameters") + except Exception: + self.log("Error pulling high resolution images") + + try: + #in case pulling images failed, delete the 
enlarge this text + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupref.extract() except: - article_date = date.today() - if article_date < self.earliest_date: - self.log("Skipping article dated %s" % date_str) - return None + self.log("Error removing Enlarge this text") - kicker_tag = soup.find(attrs={'class':'kicker'}) - if kicker_tag: # remove Op_Ed author head shots - tagline = self.tag_to_string(kicker_tag) - if tagline=='Op-Ed Columnist': - img_div = soup.find('div','inlineImage module') - if img_div: - img_div.extract() - return self.strip_anchors(soup) - def postprocess_html(self,soup, True): - try: - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg before article body - cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc,firstImg) - else: - self.log(">>> No class:'columnGroup first' found <<<") - except: - self.log("ERROR: One picture per article in postprocess_html") + return self.strip_anchors(soup,False) - try: - # Change captions to italic - for caption in soup.findAll(True, {'class':'caption'}) : - if caption and len(caption) > 0: - cTag = Tag(soup, "p", [("class", "caption")]) - c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - cTag.insert(0, c) - caption.replaceWith(cTag) - except: - self.log("ERROR: Problem in change captions to italic") + def postprocess_html(self,soup,first_fetch): + if not first_fetch: # remove Related links + for aside in soup.findAll('div','aside'): + aside.extract() + soup = self.strip_anchors(soup,True) + #print("RECURSIVE: "+self.tag_to_string(soup.title)) - try: - # Change to
<nyt_headline> to <h2>
- h1 = soup.find('h1') - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - else: - # Blog entry - replace headline, remove
tags - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - except: - self.log("ERROR: Problem in Change to

") + if soup.find('div',attrs={'id':'blogcontent'}) is None: + if first_fetch: + aside = soup.find('div','aside') + if aside is not None: # move the related list to the end of the article + art = soup.find('div',attrs={'id':'article'}) + if art is None: + art = soup.find('div',attrs={'class':'article'}) + if art is not None: + art.append(aside) + try: + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg before article body + cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + except: + self.log("ERROR: One picture per article in postprocess_html") - try: - # Change

<h1> to <h3>

- used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - except: - self.log("ERROR: Problem in Change

<h1> to <h3>

- used in editorial blogs") + try: + # Change captions to italic + for caption in soup.findAll(True, {'class':'caption'}) : + if caption and len(caption) > 0: + cTag = Tag(soup, "p", [("class", "caption")]) + c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() + mp_off = c.find("More Photos") + if mp_off >= 0: + c = c[:mp_off] + cTag.insert(0, c) + caption.replaceWith(cTag) + except: + self.log("ERROR: Problem in change captions to italic") - try: - # Change to - for subhead in soup.findAll(True, {'class':'bold'}) : - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - except: - self.log("ERROR: Problem in Change

<h1> to <h3>

- used in editorial blogs") + try: + # Change to
<h2>
+ h1 = soup.find('h1') + blogheadline = str(h1) #added for dealbook + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + elif blogheadline.find('entry-title'):#added for dealbook + tag = Tag(soup, "h2")#added for dealbook + tag['class'] = "headline"#added for dealbook + tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook + h1.replaceWith(tag)#added for dealbook - try: - divTag = soup.find('div',attrs={'id':'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - except: - self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") + else: + # Blog entry - replace headline, remove
tags - BCC I think this is no longer functional 1-18-2011 + headline = soup.find('title') + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(self.tag_to_string(headline,False))) + soup.insert(0, tag) + hrs = soup.findAll('hr') + for hr in hrs: + hr.extract() + except: + self.log("ERROR: Problem in Change to

") - try: - # Add class="authorId" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - except: - self.log("ERROR: Problem in Add class=authorId to
so we can format with CSS") + try: + #if this is from a blog (dealbook, fix the byline format + bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) + if bylineauthor: + tag = Tag(soup, "h6") + tag['class'] = "byline" + tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False))) + bylineauthor.replaceWith(tag) + except: + self.log("ERROR: fixing byline author format") - return soup + try: + #if this is a blog (dealbook) fix the credit style for the pictures + blogcredit = soup.find('div',attrs={'class':'credit'}) + if blogcredit: + tag = Tag(soup, "h6") + tag['class'] = "credit" + tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False))) + blogcredit.replaceWith(tag) + except: + self.log("ERROR: fixing credit format") + + + try: + # Change

<h1> to <h3>

- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + except: + self.log("ERROR: Problem in Change

<h1> to <h3>

- used in editorial blogs") + + try: + # Change to + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + except: + self.log("ERROR: Problem in Change

<h1> to <h3>

- used in editorial blogs") + try: + #remove the update tag + blogupdated = soup.find('span', {'class':'update'}) + if blogupdated: + blogupdated.replaceWith("") + except: + self.log("ERROR: Removing strong tag") + + try: + divTag = soup.find('div',attrs={'id':'articleBody'}) + if divTag: + divTag['class'] = divTag['id'] + except: + self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") + + try: + # Add class="authorId" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + except: + self.log("ERROR: Problem in Add class=authorId to
so we can format with CSS") + #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title)) + return soup def populate_article_metadata(self, article, soup, first): - if first and hasattr(self, 'add_toc_thumbnail'): - idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) - if idxdiv is not None: - if idxdiv.img: - self.add_toc_thumbnail(article, idxdiv.img['src']) - else: - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) - + if not first: + return + idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) + if idxdiv is not None: + if idxdiv.img: + self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src'])) + else: + img = soup.find('body').find('img') + if img is not None: + self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src'])) shortparagraph = "" try: if len(article.text_summary.strip()) == 0: @@ -731,13 +1171,22 @@ class NYTimes(BasicNewsRecipe): #account for blank paragraphs and short paragraphs by appending them to longer ones if len(refparagraph) > 0: if len(refparagraph) > 70: #approximately one line of text - article.summary = article.text_summary = shortparagraph + refparagraph + newpara = shortparagraph + refparagraph + newparaDateline,newparaEm,newparaDesc = newpara.partition('—') + if newparaEm == '': + newparaDateline,newparaEm,newparaDesc = newpara.partition('—') + if newparaEm == '': + newparaDesc = newparaDateline + article.summary = article.text_summary = newparaDesc.strip() return else: shortparagraph = refparagraph + " " if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): shortparagraph = shortparagraph + "- " + else: + article.summary = article.text_summary = self.massageNCXText(article.text_summary) except: self.log("Error creating article descriptions") return + diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 3c1bdcbc0d..06c476ef19 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -6,31 +6,51 @@ __copyright__ = '2008, Kovid Goyal ' nytimes.com ''' import re, string, time -from calibre import entity_to_unicode, strftime +from calibre import strftime from datetime import timedelta, date +from time import sleep from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup - class NYTimes(BasicNewsRecipe): + recursions=1 # set this to zero to omit Related articles lists + match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed + + # set getTechBlogs to True to include the technology blogs + # set tech_oldest_article to control article age + # set tech_max_articles_per_feed to control article count + getTechBlogs = True + remove_empty_feeds = True + tech_oldest_article = 14 + tech_max_articles_per_feed = 25 + + # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles + # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category) + getPopularArticles = True + popularPeriod = '1' # set this to the number of days to include in the measurement + # e.g. 7 will get the most popular measured over the last 7 days + # and 30 will get the most popular measured over 30 days. + # you still only get up to 20 articles in each category + + # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored. 
headlinesOnly = False - # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the - # number of days old an article can be for inclusion. If oldest_article = 0 all articles - # will be included. Note: oldest_article is ignored if webEdition = False + # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the + # number of days old an article can be for inclusion. If oldest_web_article = None all articles + # will be included. Note: oldest_web_article is ignored if webEdition = False webEdition = False - oldest_article = 7 - - # replace paid Kindle Version: the name will be changed to "The New York Times" to cause - # previous paid versions of the new york times to best sent to the back issues folder on the kindle - replaceKindleVersion = False + oldest_web_article = None # download higher resolution images than the small thumbnails typically included in the article # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper useHighResImages = True + # replace paid Kindle Version: the name will be changed to "The New York Times" to cause + # previous paid versions of the new york times to best sent to the back issues folder on the kindle + replaceKindleVersion = False + # includeSections: List of sections to include. If empty, all sections found will be included. # Otherwise, only the sections named will be included. For example, # @@ -90,107 +110,123 @@ class NYTimes(BasicNewsRecipe): ('Education',u'education'), ('Multimedia',u'multimedia'), (u'Obituaries',u'obituaries'), - (u'Sunday Magazine',u'magazine'), - (u'Week in Review',u'weekinreview')] + (u'Sunday Magazine',u'magazine') + ] + + tech_feeds = [ + (u'Tech - Pogues Posts', u'http://pogue.blogs.nytimes.com/feed/'), + (u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'), + (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'), + (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/') + ] + if headlinesOnly: title='New York Times Headlines' description = 'Headlines from the New York Times' - needs_subscription = True + needs_subscription = 'optional' elif webEdition: title='New York Times (Web)' description = 'New York Times on the Web' - needs_subscription = True + needs_subscription = 'optional' elif replaceKindleVersion: title='The New York Times' description = 'Today\'s New York Times' - needs_subscription = True + needs_subscription = 'optional' else: title='New York Times' - description = 'Today\'s New York Times. 
Needs subscription from http://www.nytimes.com' - needs_subscription = True + description = 'Today\'s New York Times' + needs_subscription = 'optional' - - month_list = ['january','february','march','april','may','june','july','august','september','october','november','december'] - - def decode_us_date(self,datestr): - udate = datestr.strip().lower().split() + def decode_url_date(self,url): + urlitems = url.split('/') try: - m = self.month_list.index(udate[0])+1 + d = date(int(urlitems[3]),int(urlitems[4]),int(urlitems[5])) except: - return date.today() - d = int(udate[1]) - y = int(udate[2]) - try: - d = date(y,m,d) - except: - d = date.today + try: + d = date(int(urlitems[4]),int(urlitems[5]),int(urlitems[6])) + except: + return None return d - earliest_date = date.today() - timedelta(days=oldest_article) + if oldest_web_article is None: + earliest_date = date.today() + else: + earliest_date = date.today() - timedelta(days=oldest_web_article) + oldest_article = 365 # by default, a long time ago - __author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier' + __author__ = 'GRiker/Kovid Goyal/Nick Redding' language = 'en' requires_version = (0, 7, 5) - + encoding = 'utf-8' timefmt = '' - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + + #simultaneous_downloads = 1 # no longer required to deal with ads + cover_margins = (18,18,'grey99') remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':[ - 'articleFooter', - 'articleTools', - 'columnGroup doubleRule', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - #'icon enlargeThis', #removed to provide option for high res images - 'leftNavTabs', - 'metaFootnote', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'entry entry-utility', #added for DealBook - 'entry-tags', #added for DealBook - 'footer promos clearfix', #added for DealBook - 'footer links clearfix', #added for DealBook - 'tabsContainer', #added for other blog downloads - 'column lastColumn', #added for other blog downloads - 'pageHeaderWithLabel', #added for other gadgetwise downloads - 'column two', #added for other blog downloads - 'column two last', #added for other blog downloads - 'column three', #added for other blog downloads - 'column three last', #added for other blog downloads - 'column four',#added for other blog downloads - 'column four last',#added for other blog downloads - 'column last', #added for other blog downloads - 'timestamp published', #added for other blog downloads - 'entry entry-related', - 'subNavigation tabContent active', #caucus blog navigation - 'columnGroup doubleRule', - 'mediaOverlay slideshow', - 'headlinesOnly multiline flush', - 'wideThumb', - 'video', #added 02-11-2011 - 'videoHeader',#added 02-11-2011 - 'articleInlineVideoHolder', #added 02-11-2011 - 'assetCompanionAd', - re.compile('^subNavigation'), - re.compile('^leaderboard'), - re.compile('^module'), - ]}), - dict(id=[ + remove_tags = [ + dict(attrs={'class':[ + 'articleFooter', + 'articleTools', + 'columnGroup singleRule', + 'columnGroup last', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'entry-response module', + 'leftNavTabs', + 'metaFootnote', + 'module box nav', + 'nextArticleLink', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 
'singleAd', + 'entry entry-utility', #added for DealBook + 'entry-tags', #added for DealBook + 'footer promos clearfix', #added for DealBook + 'footer links clearfix', #added for DealBook + 'tabsContainer', #added for other blog downloads + 'column lastColumn', #added for other blog downloads + 'pageHeaderWithLabel', #added for other gadgetwise downloads + 'column two', #added for other blog downloads + 'column two last', #added for other blog downloads + 'column three', #added for other blog downloads + 'column three last', #added for other blog downloads + 'column four',#added for other blog downloads + 'column four last',#added for other blog downloads + 'column last', #added for other blog downloads + 'entry entry-related', + 'subNavigation tabContent active', #caucus blog navigation + 'mediaOverlay slideshow', + 'wideThumb', + 'video', #added 02-11-2011 + 'videoHeader',#added 02-11-2011 + 'articleInlineVideoHolder', #added 02-11-2011 + 'assetCompanionAd', + re.compile('^subNavigation'), + re.compile('^leaderboard'), + re.compile('^module'), + re.compile('commentCount'), + 'credit' + ]}), + dict(name='div', attrs={'class':re.compile('toolsList')}), # bits + dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits + dict(name='div', attrs={'class':'tweet'}), + dict(name='span', attrs={'class':'commentCount meta'}), + dict(name='div', attrs={'id':'header'}), + dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open + dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise + dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise + dict(name='div', attrs={'id':re.compile('respond')}), # open + dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue + dict(id=[ 'adxLeaderboard', 'adxSponLink', 'archive', @@ -223,21 +259,25 @@ class NYTimes(BasicNewsRecipe): 'TopAd', #added for DealBook 'related-content', #added for DealBook ]), - dict(name=['script', 'noscript', 'style','form','hr'])] + dict(name=['script', 'noscript', 'style','form','hr'])] no_stylesheets = True extra_css = ''' .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } - .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } - .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .timestamp { text-align: left; font-size: small; } - .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { font-weight: normal; text-align: left; font-size: 50%; } + .caption { font-size: 50%; font-style:italic; 
line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } a:link {text-decoration: none; } + .date{font-size: 50%; } + .update{font-size: 50%; } .articleBody { } - .authorId {text-align: left; } + .authorId {text-align: left; font-size: 50%; } .image {text-align: center;} - .source {text-align: left; }''' + .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;} + .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;} + .source {text-align: left; font-size: x-small; }''' articles = {} @@ -261,11 +301,11 @@ class NYTimes(BasicNewsRecipe): del ans[idx] idx_max = idx_max-1 continue - if self.verbose: + if True: #self.verbose self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) for article in ans[idx][1]: total_article_count += 1 - if self.verbose: + if True: #self.verbose self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), article['url'].encode('cp1252','replace'))) idx = idx+1 @@ -276,7 +316,7 @@ class NYTimes(BasicNewsRecipe): def exclude_url(self,url): if not url.startswith("http"): return True - if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook + if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook return True if 'nytimes.com' not in url: return True @@ -318,7 +358,7 @@ class NYTimes(BasicNewsRecipe): return fixed def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.nytimes.com/auth/login') br.form = br.forms().next() @@ -329,78 +369,76 @@ class NYTimes(BasicNewsRecipe): raise Exception('Your username and password are incorrect') return br - def skip_ad_pages(self, soup): - # Skip ad pages served before actual article - skip_tag = soup.find(True, {'name':'skip'}) - if skip_tag is not None: - self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) - url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) - url += '?pagewanted=all' - self.log.warn("Skipping ad to article at '%s'" % url) - return self.index_to_soup(url, raw=True) - + cover_tag = 'NY_NYT' def get_cover_url(self): - cover = None - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = "%.2d" % st.tm_mday - cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/scan.jpg' - br = BasicNewsRecipe.get_browser() + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg' + br = BasicNewsRecipe.get_browser(self) + daysback=1 try: br.open(cover) except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg' + br = BasicNewsRecipe.get_browser(self) + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: self.log("\nCover unavailable") cover = None return cover + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + + def short_title(self): return self.title - def index_to_soup(self, url_or_raw, raw=False): - ''' - OVERRIDE of class method - deals with various page encodings between index and articles - ''' - def get_the_soup(docEncoding, url_or_raw, raw=False) : - if re.match(r'\w+://', url_or_raw): - br = 
self.clone_browser(self.browser) - f = br.open_novisit(url_or_raw) + + def article_to_soup(self, url_or_raw, raw=False): + from contextlib import closing + import copy + from calibre.ebooks.chardet import xml_to_unicode + print("ARTICLE_TO_SOUP "+url_or_raw) + if re.match(r'\w+://', url_or_raw): + br = self.clone_browser(self.browser) + open_func = getattr(br, 'open_novisit', br.open) + with closing(open_func(url_or_raw)) as f: _raw = f.read() - f.close() - if not _raw: - raise RuntimeError('Could not fetch index from %s'%url_or_raw) + if not _raw: + raise RuntimeError('Could not fetch index from %s'%url_or_raw) + else: + _raw = url_or_raw + if raw: + return _raw + if not isinstance(_raw, unicode) and self.encoding: + if callable(self.encoding): + _raw = self.encoding(_raw) else: - _raw = url_or_raw - if raw: - return _raw + _raw = _raw.decode(self.encoding, 'replace') - if not isinstance(_raw, unicode) and self.encoding: - _raw = _raw.decode(docEncoding, 'replace') - massage = list(BeautifulSoup.MARKUP_MASSAGE) - massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding))) - return BeautifulSoup(_raw, markupMassage=massage) + nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) + nmassage.extend(self.preprocess_regexps) + nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] + # Some websites have buggy doctype declarations that mess up beautifulsoup + # Remove comments as they can leave detritus when extracting tags leaves + # multiple nested comments + nmassage.append((re.compile(r'', re.DOTALL), lambda m: '')) + usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0] + usrc = self.preprocess_raw_html(usrc, url_or_raw) + return BeautifulSoup(usrc, markupMassage=nmassage) - # Entry point - soup = get_the_soup( self.encoding, url_or_raw ) - contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) - docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] - if docEncoding == '' : - docEncoding = self.encoding - - if self.verbose > 2: - self.log( " document encoding: '%s'" % docEncoding) - if docEncoding != self.encoding : - soup = get_the_soup(docEncoding, url_or_raw) - - return soup def massageNCXText(self, description): # Kindle TOC descriptions won't render certain characters if description: massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) # Replace '&' with '&' - massaged = re.sub("&","&", massaged) + massaged = re.sub("&","&", massaged) + massaged = re.sub("&","&", massaged) return self.fixChars(massaged) else: return description @@ -422,6 +460,16 @@ class NYTimes(BasicNewsRecipe): if self.filterDuplicates: if url in self.url_list: return + if self.webEdition: + date_tag = self.decode_url_date(url) + if date_tag is not None: + if self.oldest_web_article is not None: + if date_tag < self.earliest_date: + self.log("Skipping article %s" % url) + return + else: + self.log("Skipping article %s" % url) + return self.url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() description = '' @@ -446,6 +494,92 @@ class NYTimes(BasicNewsRecipe): description=description, author=author, content='')) + def get_popular_articles(self,ans): + if self.getPopularArticles: + popular_articles = {} + key_list = [] + + def handleh3(h3tag): + try: + url = h3tag.a['href'] + except: + return ('','','','') + url = re.sub(r'\?.*', '', url) + if self.exclude_url(url): + return ('','','','') + url += '?pagewanted=all' + 
title = self.tag_to_string(h3tag.a,False) + h6tag = h3tag.findNextSibling('h6') + if h6tag is not None: + author = self.tag_to_string(h6tag,False) + else: + author = '' + ptag = h3tag.findNextSibling('p') + if ptag is not None: + desc = self.tag_to_string(ptag,False) + else: + desc = '' + return(title,url,author,desc) + + + have_emailed = False + emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod) + for h3tag in emailed_soup.findAll('h3'): + (title,url,author,desc) = handleh3(h3tag) + if url=='': + continue + if not have_emailed: + key_list.append('Most E-Mailed') + popular_articles['Most E-Mailed'] = [] + have_emailed = True + popular_articles['Most E-Mailed'].append( + dict(title=title, url=url, date=strftime('%a, %d %b'), + description=desc, author=author, + content='')) + have_viewed = False + viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod) + for h3tag in viewed_soup.findAll('h3'): + (title,url,author,desc) = handleh3(h3tag) + if url=='': + continue + if not have_viewed: + key_list.append('Most Viewed') + popular_articles['Most Viewed'] = [] + have_viewed = True + popular_articles['Most Viewed'].append( + dict(title=title, url=url, date=strftime('%a, %d %b'), + description=desc, author=author, + content='')) + viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)] + for x in viewed_ans: + ans.append(x) + return ans + + def get_tech_feeds(self,ans): + if self.getTechBlogs: + tech_articles = {} + key_list = [] + save_oldest_article = self.oldest_article + save_max_articles_per_feed = self.max_articles_per_feed + self.oldest_article = self.tech_oldest_article + self.max_articles_per_feed = self.tech_max_articles_per_feed + self.feeds = self.tech_feeds + tech = self.parse_feeds() + self.oldest_article = save_oldest_article + self.max_articles_per_feed = save_max_articles_per_feed + self.feeds = None + for f in tech: + key_list.append(f.title) + tech_articles[f.title] = [] + for a in f.articles: + tech_articles[f.title].append( + dict(title=a.title, url=a.url, date=a.date, + description=a.summary, author=a.author, + content=a.content)) + tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)] + for x in tech_ans: + ans.append(x) + return ans def parse_web_edition(self): @@ -457,31 +591,41 @@ class NYTimes(BasicNewsRecipe): if sec_title in self.excludeSections: print "SECTION EXCLUDED: ",sec_title continue + try: + soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + except: + continue print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html' - soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + self.key = sec_title # Find each article for div in soup.findAll(True, - attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): - if div['class'] in ['story', 'story headline'] : + attrs={'class':['section-headline', 'ledeStory', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + if div['class'] in ['story', 'story headline', 'storyHeader'] : self.handle_article(div) + elif div['class'] == 'ledeStory': + divsub = div.find('div','storyHeader') + if divsub is not None: + self.handle_article(divsub) + ulrefer = div.find('ul','refer') + if ulrefer is not None: + for lidiv in ulrefer.findAll('li'): + self.handle_article(lidiv) elif div['class'] == 'headlinesOnly 
multiline flush': for lidiv in div.findAll('li'): self.handle_article(lidiv) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_todays_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - skipping = False # Find each article for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): - if div['class'] in ['section-headline','sectionHeader']: self.key = string.capwords(self.feed_title(div)) self.key = self.key.replace('Op-ed','Op-Ed') @@ -505,7 +649,7 @@ class NYTimes(BasicNewsRecipe): self.handle_article(lidiv) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_headline_index(self): @@ -553,7 +697,7 @@ class NYTimes(BasicNewsRecipe): for h3_item in search_div.findAll('h3'): byline = h3_item.h6 if byline is not None: - author = self.tag_to_string(byline,usa_alt=False) + author = self.tag_to_string(byline,use_alt=False) else: author = '' a = h3_item.find('a', href=True) @@ -579,7 +723,7 @@ class NYTimes(BasicNewsRecipe): self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] - return self.filter_ans(self.ans) + return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_index(self): if self.headlinesOnly: @@ -589,155 +733,310 @@ class NYTimes(BasicNewsRecipe): else: return self.parse_todays_index() - def strip_anchors(self,soup): + def strip_anchors(self,soup,kill_all=False): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: - a.replaceWith(a.renderContents().decode('cp1252','replace')) + if kill_all or (self.recursions==0): + a.replaceWith(self.tag_to_string(a,False)) + else: + if a.has_key('href'): + if a['href'].startswith('http://www.nytimes'): + if not a['href'].endswith('pagewanted=all'): + url = re.sub(r'\?.*', '', a['href']) + if self.exclude_url(url): + a.replaceWith(self.tag_to_string(a,False)) + else: + a['href'] = url+'?pagewanted=all' + elif not (a['href'].startswith('http://pogue') or \ + a['href'].startswith('http://bits') or \ + a['href'].startswith('http://travel') or \ + a['href'].startswith('http://business') or \ + a['href'].startswith('http://tech') or \ + a['href'].startswith('http://health') or \ + a['href'].startswith('http://dealbook') or \ + a['href'].startswith('http://open')): + a.replaceWith(self.tag_to_string(a,False)) + return soup + + def handle_tags(self,soup): + try: + print("HANDLE TAGS: TITLE = "+self.tag_to_string(soup.title)) + except: + print("HANDLE TAGS: NO TITLE") + if soup is None: + print("ERROR: handle_tags received NoneType") + return None + +## print("HANDLING AD FORWARD:") +## print(soup) + if self.keep_only_tags: + body = Tag(soup, 'body') + try: + if isinstance(self.keep_only_tags, dict): + self.keep_only_tags = [self.keep_only_tags] + for spec in self.keep_only_tags: + for tag in soup.find('body').findAll(**spec): + body.insert(len(body.contents), tag) + soup.find('body').replaceWith(body) + except AttributeError: # soup has no body element + pass + + def 
remove_beyond(tag, next): + while tag is not None and getattr(tag, 'name', None) != 'body': + after = getattr(tag, next) + while after is not None: + ns = getattr(tag, next) + after.extract() + after = ns + tag = tag.parent + + if self.remove_tags_after is not None: + rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after + for spec in rt: + tag = soup.find(**spec) + remove_beyond(tag, 'nextSibling') + + if self.remove_tags_before is not None: + tag = soup.find(**self.remove_tags_before) + remove_beyond(tag, 'previousSibling') + + for kwds in self.remove_tags: + for tag in soup.findAll(**kwds): + tag.extract() + return soup def preprocess_html(self, soup): - if self.webEdition & (self.oldest_article>0): - date_tag = soup.find(True,attrs={'class': ['dateline','date']}) - if date_tag: - date_str = self.tag_to_string(date_tag,use_alt=False) - date_str = date_str.replace('Published:','') - date_items = date_str.split(',') + #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title)) + skip_tag = soup.find(True, {'name':'skip'}) + if skip_tag is not None: + #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) + url = 'http://www.nytimes.com' + skip_tag.parent['href'] + #url += '?pagewanted=all' + self.log.warn("Skipping ad to article at '%s'" % url) + sleep(5) + soup = self.handle_tags(self.article_to_soup(url)) + + # check if the article is from one of the tech blogs + blog=soup.find('div',attrs={'id':['pogue','bits','gadgetwise','open']}) + + if blog is not None: + old_body = soup.find('body') + new_body=Tag(soup,'body') + new_body.append(soup.find('div',attrs={'id':'content'})) + new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html + old_body.replaceWith(new_body) + for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}): + if divr.find(text=re.compile('Sign up')): + divr.extract() + divr = soup.find('div',attrs={'id':re.compile('related-content')}) + if divr is not None: + # handle related articles + rlist = [] + ul = divr.find('ul') + if ul is not None: + for li in ul.findAll('li'): + atag = li.find('a') + if atag is not None: + if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \ + atag['href'].startswith('http://open'): + atag.find(text=True).replaceWith(self.massageNCXText(self.tag_to_string(atag,False))) + rlist.append(atag) + divr.extract() + if rlist != []: + asidediv = Tag(soup,'div',[('class','aside')]) + if soup.find('hr') is None: + asidediv.append(Tag(soup,'hr')) + h4 = Tag(soup,'h4',[('class','asidenote')]) + h4.insert(0,"Related Posts") + asidediv.append(h4) + ul = Tag(soup,'ul') + for r in rlist: + li = Tag(soup,'li',[('class','aside')]) + r['class'] = 'aside' + li.append(r) + ul.append(li) + asidediv.append(ul) + asidediv.append(Tag(soup,'hr')) + smain = soup.find('body') + smain.append(asidediv) + for atag in soup.findAll('a'): + img = atag.find('img') + if img is not None: + atag.replaceWith(img) + elif not atag.has_key('href'): + atag.replaceWith(atag.renderContents().decode('cp1252','replace')) + elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \ + atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')): + atag.replaceWith(atag.renderContents().decode('cp1252','replace')) + hdr = soup.find('address') + if hdr is not None: + hdr.name='span' + for span_credit in soup.findAll('span','credit'): + sp = 
Tag(soup,'span') + span_credit.replaceWith(sp) + sp.append(Tag(soup,'br')) + sp.append(span_credit) + sp.append(Tag(soup,'br')) + + else: # nytimes article + + related = [] # these will be the related articles + first_outer = None # first related outer tag + first_related = None # first related tag + for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): + for rdiv in soup.findAll('div','columnGroup doubleRule'): + if rdiv.find('h3') is not None: + if self.tag_to_string(rdiv.h3,False).startswith('Related'): + rdiv.h3.find(text=True).replaceWith("Related articles") + rdiv.h3['class'] = 'asidenote' + for litag in rdiv.findAll('li'): + if litag.find('a') is not None: + if litag.find('a')['href'].startswith('http://www.nytimes.com'): + url = re.sub(r'\?.*', '', litag.find('a')['href']) + litag.find('a')['href'] = url+'?pagewanted=all' + litag.extract() + related.append(litag) + if first_related is None: + first_related = rdiv + first_outer = outerdiv + else: + litag.extract() + if related != []: + for r in related: + if r.h6: # don't want the anchor inside a h6 tag + r.h6.replaceWith(r.h6.a) + first_related.ul.append(r) + first_related.insert(0,Tag(soup,'hr')) + first_related.append(Tag(soup,'hr')) + first_related['class'] = 'aside' + first_outer.replaceWith(first_related) # replace the outer tag with the related tag + + for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): + rdiv.extract() + + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: # remove Op_Ed author head shots + tagline = self.tag_to_string(kicker_tag) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + if img_div: + img_div.extract() + + if self.useHighResImages: try: - datestring = date_items[0]+' '+date_items[1] - article_date = self.decode_us_date(datestring) - except: - article_date = date.today() - if article_date < self.earliest_date: - self.log("Skipping article dated %s" % date_str) - return None + #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupreflink = popupref.find('a') + if popupreflink: + reflinkstring = str(popupreflink['href']) + refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('") + refend = reflinkstring.find(".html", refstart) + len(".html") + reflinkstring = reflinkstring[refstart:refend] - #all articles are from today, no need to print the date on every page - try: - if not self.webEdition: - date_tag = soup.find(True,attrs={'class': ['dateline','date']}) - if date_tag: - date_tag.extract() - except: - self.log("Error removing the published date") - - if self.useHighResImages: - try: - #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs - enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupreflink = popupref.find('a') - if popupreflink: - reflinkstring = str(popupreflink['href']) - refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('") - refend = reflinkstring.find(".html", refstart) + len(".html") - reflinkstring = reflinkstring[refstart:refend] - - popuppage = self.browser.open(reflinkstring) - popuphtml = popuppage.read() - popuppage.close() - if popuphtml: - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = 
"%.2d" % st.tm_mday - imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') - highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4] - popupSoup = BeautifulSoup(popuphtml) - highResTag = popupSoup.find('img', {'src':highResImageLink}) - if highResTag: - try: - newWidth = highResTag['width'] - newHeight = highResTag['height'] - imageTag = popupref.parent.find("img") - except: - self.log("Error: finding width and height of img") - popupref.extract() - if imageTag: + popuppage = self.browser.open(reflinkstring) + popuphtml = popuppage.read() + popuppage.close() + if popuphtml: + st = time.localtime() + year = str(st.tm_year) + month = "%.2d" % st.tm_mon + day = "%.2d" % st.tm_mday + imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4] + popupSoup = BeautifulSoup(popuphtml) + highResTag = popupSoup.find('img', {'src':highResImageLink}) + if highResTag: try: - imageTag['src'] = highResImageLink - imageTag['width'] = newWidth - imageTag['height'] = newHeight + newWidth = highResTag['width'] + newHeight = highResTag['height'] + imageTag = popupref.parent.find("img") except: - self.log("Error setting the src width and height parameters") - except Exception: - self.log("Error pulling high resolution images") + self.log("Error: finding width and height of img") + popupref.extract() + if imageTag: + try: + imageTag['src'] = highResImageLink + imageTag['width'] = newWidth + imageTag['height'] = newHeight + except: + self.log("Error setting the src width and height parameters") + except Exception: + self.log("Error pulling high resolution images") + + try: + #in case pulling images failed, delete the enlarge this text + enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) + if enlargeThisList: + for popupref in enlargeThisList: + popupref.extract() + except: + self.log("Error removing Enlarge this text") + + + return self.strip_anchors(soup,False) + + def postprocess_html(self,soup,first_fetch): + if not first_fetch: # remove Related links + for aside in soup.findAll('div','aside'): + aside.extract() + soup = self.strip_anchors(soup,True) + #print("RECURSIVE: "+self.tag_to_string(soup.title)) + + if soup.find('div',attrs={'id':'blogcontent'}) is None: + if first_fetch: + aside = soup.find('div','aside') + if aside is not None: # move the related list to the end of the article + art = soup.find('div',attrs={'id':'article'}) + if art is None: + art = soup.find('div',attrs={'class':'article'}) + if art is not None: + art.append(aside) + try: + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg before article body + cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) + if cgFirst: + # Strip all sibling NavigableStrings: 
noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + except: + self.log("ERROR: One picture per article in postprocess_html") try: - #remove "Related content" bar - runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft ','articleInline runaroundLeft lastArticleInline']}) - if runAroundsFound: - for runAround in runAroundsFound: - #find all section headers - hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']}) - if hlines: - for hline in hlines: - hline.extract() - - #find all section headers - hlines = runAround.findAll('h6') - if hlines: - for hline in hlines: - hline.extract() - except: - self.log("Error removing related content bar") - - - try: - #in case pulling images failed, delete the enlarge this text - enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupref.extract() - except: - self.log("Error removing Enlarge this text") - - return self.strip_anchors(soup) - - def postprocess_html(self,soup, True): - try: - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg before article body - cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc,firstImg) - else: - self.log(">>> No class:'columnGroup first' found <<<") - except: - self.log("ERROR: One picture per article in postprocess_html") - - try: # Change captions to italic for caption in soup.findAll(True, {'class':'caption'}) : if caption and len(caption) > 0: @@ -748,10 +1047,10 @@ class NYTimes(BasicNewsRecipe): c = c[:mp_off] cTag.insert(0, c) caption.replaceWith(cTag) - except: + except: self.log("ERROR: Problem in change captions to italic") - try: + try: # Change to
<h2>
h1 = soup.find('h1') blogheadline = str(h1) #added for dealbook @@ -774,38 +1073,38 @@ if headline: tag = Tag(soup, "h2") tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.renderContents())) + tag.insert(0, self.fixChars(self.tag_to_string(headline,False))) soup.insert(0, tag) hrs = soup.findAll('hr') for hr in hrs: hr.extract() - except: + except: self.log("ERROR: Problem in Change <h1> to <h2>
") - try: + try: #if this is from a blog (dealbook, fix the byline format bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) if bylineauthor: tag = Tag(soup, "h6") tag['class'] = "byline" - tag.insert(0, self.fixChars(bylineauthor.renderContents())) + tag.insert(0, self.fixChars(self.tag_to_string(bylineauthor,False))) bylineauthor.replaceWith(tag) - except: - self.log("ERROR: fixing byline author format") + except: + self.log("ERROR: fixing byline author format") - try: + try: #if this is a blog (dealbook) fix the credit style for the pictures blogcredit = soup.find('div',attrs={'class':'credit'}) if blogcredit: tag = Tag(soup, "h6") tag['class'] = "credit" - tag.insert(0, self.fixChars(blogcredit.renderContents())) + tag.insert(0, self.fixChars(self.tag_to_string(blogcredit,False))) blogcredit.replaceWith(tag) - except: - self.log("ERROR: fixing credit format") + except: + self.log("ERROR: fixing credit format") - try: + try: # Change
<h1> to <h3>
- used in editorial blogs masthead = soup.find("h1") if masthead: @@ -815,34 +1114,34 @@ class NYTimes(BasicNewsRecipe): tag = Tag(soup, "h3") tag.insert(0, self.fixChars(masthead.contents[0])) masthead.replaceWith(tag) - except: + except: self.log("ERROR: Problem in Change
<h1> to <h3>
- used in editorial blogs") - try: + try: # Change <span class="bold"> to <b> for subhead in soup.findAll(True, {'class':'bold'}) : if subhead.contents: bTag = Tag(soup, "b") bTag.insert(0, subhead.contents[0]) subhead.replaceWith(bTag) - except: + except: self.log("ERROR: Problem in Change to
- used in editorial blogs") - try: + try: #remove the update tag blogupdated = soup.find('span', {'class':'update'}) if blogupdated: blogupdated.replaceWith("") - except: + except: self.log("ERROR: Removing strong tag") - try: + try: divTag = soup.find('div',attrs={'id':'articleBody'}) if divTag: divTag['class'] = divTag['id'] - except: + except: self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") - try: + try: # Add class="authorId" to
<div> so we can format with CSS divTag = soup.find('div',attrs={'id':'authorId'}) if divTag and divTag.contents[0]: @@ -851,27 +1150,26 @@ tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], use_alt=False))) divTag.replaceWith(tag) - except: + except: self.log("ERROR: Problem in Add class=authorId to <div>
so we can format with CSS") - + #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title)) return soup - def populate_article_metadata(self, article, soup, first): - if first and hasattr(self, 'add_toc_thumbnail'): - idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) - if idxdiv is not None: - if idxdiv.img: - self.add_toc_thumbnail(article, idxdiv.img['src']) - else: - img = soup.find('img') - if img is not None: - self.add_toc_thumbnail(article, img['src']) + def populate_article_metadata(self, article, soup, first): + if not first: + return + idxdiv = soup.find('div',attrs={'class':'articleSpanImage'}) + if idxdiv is not None: + if idxdiv.img: + self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',idxdiv.img['src'])) + else: + img = soup.find('body').find('img') + if img is not None: + self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\','',img['src'])) shortparagraph = "" try: if len(article.text_summary.strip()) == 0: articlebodies = soup.findAll('div',attrs={'class':'articleBody'}) - if not articlebodies: #added to account for blog formats - articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats if articlebodies: for articlebody in articlebodies: if articlebody: @@ -880,15 +1178,23 @@ class NYTimes(BasicNewsRecipe): refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() #account for blank paragraphs and short paragraphs by appending them to longer ones if len(refparagraph) > 0: - if len(refparagraph) > 140: #approximately two lines of text - article.summary = article.text_summary = shortparagraph + refparagraph + if len(refparagraph) > 70: #approximately one line of text + newpara = shortparagraph + refparagraph + newparaDateline,newparaEm,newparaDesc = newpara.partition('—') + if newparaEm == '': + newparaDateline,newparaEm,newparaDesc = newpara.partition('—') + if newparaEm == '': + newparaDesc = newparaDateline + article.summary = article.text_summary = newparaDesc.strip() return else: shortparagraph = refparagraph + " " if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): shortparagraph = shortparagraph + "- " - + else: + article.summary = article.text_summary = self.massageNCXText(article.text_summary) except: self.log("Error creating article descriptions") return + diff --git a/recipes/nzherald.recipe b/recipes/nzherald.recipe index b73fd8366e..46242b630a 100644 --- a/recipes/nzherald.recipe +++ b/recipes/nzherald.recipe @@ -25,7 +25,7 @@ class NewZealandHerald(BasicNewsRecipe): 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'), ('Life & Style', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'), - ('Technology' + ('Technology', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'), ('Sport', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'), diff --git a/recipes/nzz_webpaper.recipe b/recipes/nzz_webpaper.recipe index 202cfeadab..1491518afc 100644 --- a/recipes/nzz_webpaper.recipe +++ b/recipes/nzz_webpaper.recipe @@ -78,7 +78,7 @@ class Nzz(BasicNewsRecipe): return ans def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://webpaper.nzz.ch/login') br.select_form(nr=0) diff --git a/recipes/omgubuntu.recipe b/recipes/omgubuntu.recipe index e2a619c281..efd778955a 100644 --- a/recipes/omgubuntu.recipe +++ b/recipes/omgubuntu.recipe @@ -11,7 +11,7 @@ class 
BasicUserRecipe1318619832(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://cdn.omgubuntu.co.uk/wp-content/themes/omgubuntu/images/logo.png' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe index 17b8f241ff..0088426503 100644 --- a/recipes/oreilly_premium.recipe +++ b/recipes/oreilly_premium.recipe @@ -64,7 +64,7 @@ class OReillyPremium(BasicNewsRecipe): def get_browser(self): print("In get_browser") - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp') br.select_form(name='login') diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe new file mode 100644 index 0000000000..7784a271e0 --- /dev/null +++ b/recipes/osworld_pl.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OSWorld(BasicNewsRecipe): + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' + category = 'OS, IT, open source, Linux' + language = 'pl' + cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + use_embedded_content = False + keep_only_tags = [dict(id=['dzial', 'posts'])] + remove_tags = [dict(attrs={'class':'post-comments'})] + remove_tags_after = dict(attrs={'class':'entry clr'}) + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + + def append_page(self, soup, appendtag): + tag = appendtag.find(attrs={'id':'paginacja'}) + if tag: + for nexturl in tag.findAll('a'): + soup2 = self.index_to_soup(nexturl['href']) + pagetext = soup2.find(attrs={'class':'entry clr'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'id':'paginacja'}): + r.extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe index 0245b65231..99052343ca 100644 --- a/recipes/ottawa_citizen.recipe +++ b/recipes/ottawa_citizen.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/outside_magazine.recipe b/recipes/outside_magazine.recipe new file mode 100644 index 0000000000..15eaf3221e --- /dev/null +++ b/recipes/outside_magazine.recipe @@ -0,0 +1,65 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +class NYTimes(BasicNewsRecipe): + + title = 
'Outside Magazine' + __author__ = 'Krittika Goyal' + description = 'Outside Magazine - Free 1 Month Old Issue' + timefmt = ' [%d %b, %Y]' + needs_subscription = False + language = 'en' + + no_stylesheets = True + #auto_cleanup = True + #auto_cleanup_keep = '//div[@class="thumbnail"]' + + keep_only_tags = dict(name='div', attrs={'class':'masonry-box width-four'}) + remove_tags = [ + dict(name='div', attrs={'id':['share-bar', 'outbrain_widget_0', 'outbrain_widget_1', 'livefyre']}), + #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}), + #dict(name='form', attrs={'onsubmit':''}), + dict(name='section', attrs={'id':['article-quote', 'article-navigation']}), + ] + #TO GET ARTICLE TOC + def out_get_index(self): + super_url = 'http://www.outsideonline.com/magazine/' + super_soup = self.index_to_soup(super_url) + div = super_soup.find(attrs={'class':'masonry-box width-four'}) + issue = div.findAll(name='article')[1] + super_a = issue.find('a', href=True) + return super_a.get('href') + + + # To parse artice toc + def parse_index(self): + parse_soup = self.index_to_soup(self.out_get_index()) + + feeds = [] + feed_title = 'Articles' + + articles = [] + self.log('Found section:', feed_title) + div = parse_soup.find(attrs={'class':'print clearfix'}) + for art in div.findAll(name='p'): + art_info = art.find(name = 'a') + if art_info is None: + continue + art_title = self.tag_to_string(art_info) + url = art_info.get('href') + '?page=all' + self.log.info('\tFound article:', art_title, 'at', url) + article = {'title':art_title, 'url':url, 'date':''} + #au = art.find(attrs={'class':'articleAuthors'}) + #if au is not None: + #article['author'] = self.tag_to_string(au) + #desc = art.find(attrs={'class':'hover_text'}) + #if desc is not None: + #desc = self.tag_to_string(desc) + #if 'author' in article: + #desc = ' by ' + article['author'] + ' ' +desc + #article['description'] = desc + articles.append(article) + if articles: + feeds.append((feed_title, articles)) + + return feeds + diff --git a/recipes/oxford_mail.recipe b/recipes/oxford_mail.recipe new file mode 100644 index 0000000000..3096b867f4 --- /dev/null +++ b/recipes/oxford_mail.recipe @@ -0,0 +1,22 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class HindustanTimes(BasicNewsRecipe): + title = u'Oxford Mail' + language = 'en_GB' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + #encoding = 'cp1252' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True + + + feeds = [ +('News', + 'http://www.oxfordmail.co.uk/news/rss/'), +('Sports', + 'http://www.oxfordmail.co.uk/sport/rss/'), +] + diff --git a/recipes/pajama.recipe b/recipes/pajama.recipe index 8c5ba74317..9b474b6e65 100644 --- a/recipes/pajama.recipe +++ b/recipes/pajama.recipe @@ -1,27 +1,27 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup class PajamasMedia(BasicNewsRecipe): title = u'Pajamas Media' description = u'Provides exclusive news and opinion for forty countries.' 
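The Outside Magazine recipe added above builds its article list by scraping the issue's table-of-contents page rather than reading an RSS feed. As a rough sketch of that parse_index idiom — the class name, URL, and selectors below are placeholders for illustration, not part of any recipe in this changeset — a recipe returns a list of (section, articles) pairs, where each article is a plain dict:

from calibre.web.feeds.recipes import BasicNewsRecipe

class MagazineIndexSketch(BasicNewsRecipe):
    # Hypothetical recipe illustrating the parse_index idiom used by
    # outside_magazine.recipe: scrape a table-of-contents page and return
    # [(section_title, [article_dict, ...]), ...] instead of relying on RSS.
    title = 'Magazine Index Sketch'
    INDEX = 'http://www.example.com/magazine/'  # placeholder issue URL

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        articles = []
        for p in soup.findAll('p'):  # each linked paragraph becomes one article
            a = p.find('a', href=True)
            if a is None:
                continue
            articles.append({
                'title': self.tag_to_string(a),
                'url': a['href'],
                'date': '',
                'description': '',
            })
        return [('Articles', articles)] if articles else []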
language = 'en' __author__ = 'Krittika Goyal' - oldest_article = 1 #days + oldest_article = 2 #days max_articles_per_feed = 25 recursions = 1 match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$'] #encoding = 'latin1' remove_stylesheets = True - #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) - remove_tags_after = dict(name='div', attrs={'class':'paged-nav'}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':['pages']}), - #dict(name='div', attrs={'id':['bookmark']}), - #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}), - #dict(name='ul', attrs={'class':'articleTools'}), - ] + auto_cleanup = True + ##remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + #remove_tags_after = dict(name='div', attrs={'class':'paged-nav'}) + #remove_tags = [ + #dict(name='iframe'), + #dict(name='div', attrs={'class':['pages']}), + ##dict(name='div', attrs={'id':['bookmark']}), + ##dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}), + ##dict(name='ul', attrs={'class':'articleTools'}), + #] feeds = [ ('pajamas Media', @@ -29,20 +29,20 @@ class PajamasMedia(BasicNewsRecipe): ] - def preprocess_html(self, soup): - story = soup.find(name='div', attrs={'id':'innerpage-content'}) - #td = heading.findParent(name='td') - #td.extract() + #def preprocess_html(self, soup): + #story = soup.find(name='div', attrs={'id':'innerpage-content'}) + ##td = heading.findParent(name='td') + ##td.extract() - soup = BeautifulSoup('t') - body = soup.find(name='body') - body.insert(0, story) - return soup + #soup = BeautifulSoup('t') + #body = soup.find(name='body') + #body.insert(0, story) + #return soup - def postprocess_html(self, soup, first): - if not first: - h = soup.find(attrs={'class':'innerpage-header'}) - if h: h.extract() - auth = soup.find(attrs={'class':'author'}) - if auth: auth.extract() - return soup + #def postprocess_html(self, soup, first): + #if not first: + #h = soup.find(attrs={'class':'innerpage-header'}) + #if h: h.extract() + #auth = soup.find(attrs={'class':'author'}) + #if auth: auth.extract() + #return soup diff --git a/recipes/pc_lab.recipe b/recipes/pc_lab.recipe index c4b33b8416..7a6038bd65 100644 --- a/recipes/pc_lab.recipe +++ b/recipes/pc_lab.recipe @@ -1,5 +1,4 @@ #!/usr/bin/env python - from calibre.web.feeds.recipes import BasicNewsRecipe class PCLab(BasicNewsRecipe): @@ -8,12 +7,13 @@ class PCLab(BasicNewsRecipe): __author__ = 'ravcio - rlelusz[at]gmail.com' description = u"Articles from PC Lab website" language = 'pl' - oldest_article = 30.0 + oldest_article = 30 max_articles_per_feed = 100 recursions = 0 encoding = 'iso-8859-2' no_stylesheets = True remove_javascript = True + remove_empty_feeds = True use_embedded_content = False keep_only_tags = [ @@ -21,50 +21,54 @@ class PCLab(BasicNewsRecipe): ] remove_tags = [ - dict(name='div', attrs={'class':['chapters']}) - ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']}) + dict(name='div', attrs={'class':['toc first', 'toc', 'tags', 'recommendedarticles', 'name', 'zumi', 'chapters']}) ] - remove_tags_after = [ - dict(name='div', attrs={'class':['navigation']}) - ] - #links to RSS feeds - feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ] + feeds = [ + (u'Aktualności', 'http://pclab.pl/xml/aktualnosci.xml'), + (u'Artykuły', u'http://pclab.pl/xml/artykuly.xml'), + (u'Poradniki', 'http://pclab.pl/xml/poradniki.xml') + ] #load second and subsequent page content # in: soup - full page with 'next' button # out: appendtag - tag 
to which new page is to be added def append_page(self, soup, appendtag): # find the 'Next' button - pager = soup.find('div', attrs={'class':'next'}) - + pager = soup.find('div', attrs={'class':'navigation'}) if pager: + a = pager.find('a') + if 'news' in a['href']: + pager = None + else: + pager = pager.find('div', attrs={'class':'next'}) + + while pager: #search for 'a' element with link to next page (exit if not found) a = pager.find('a') - if a: - nexturl = a['href'] + nexturl = a['href'] + soup2 = self.index_to_soup('http://pclab.pl' + nexturl) + pager = soup2.find('div', attrs={'class':'next'}) + pagetext = soup2.find('div', attrs={'class':'substance'}) + pagetext = pagetext.find('div', attrs={'class':'data'}) - soup2 = self.index_to_soup('http://pclab.pl/' + nexturl) - - pagetext_substance = soup2.find('div', attrs={'class':'substance'}) - pagetext = pagetext_substance.find('div', attrs={'class':'data'}) - pagetext.extract() - - pos = len(appendtag.contents) - appendtag.insert(pos, pagetext) - pos = len(appendtag.contents) - - self.append_page(soup2, appendtag) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pos = len(appendtag.contents) + pager = soup.find('div', attrs={'class':'navigation'}) + if pager: + pager.extract() def preprocess_html(self, soup): - # soup.body contains no title and no navigator, they are in soup self.append_page(soup, soup.body) - + for link in soup.findAll('a'): + href = link.get('href', None) + if href and href.startswith('/'): + link['href'] = 'http://pclab.pl' + href # finally remove some tags - tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) - [tag.extract() for tag in tags] + #for r in soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']}) return soup diff --git a/recipes/people_daily.recipe b/recipes/people_daily.recipe index 76ee599e39..d783421bc4 100644 --- a/recipes/people_daily.recipe +++ b/recipes/people_daily.recipe @@ -115,7 +115,7 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe): month = time.strftime('%m') day = time.strftime('%d') cover = 'http://paper.people.com.cn/rmrb/page/'+year+'-'+month+'/'+day+'/01/RMRB'+year+month+day+'B001_b.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/phillosophy_now.recipe b/recipes/phillosophy_now.recipe index 7c12832c70..370456eb52 100644 --- a/recipes/phillosophy_now.recipe +++ b/recipes/phillosophy_now.recipe @@ -6,7 +6,6 @@ class PhilosophyNow(BasicNewsRecipe): title = 'Philosophy Now' __author__ = 'Rick Shang' - description = '''Philosophy Now is a lively magazine for everyone interested in ideas. It isn't afraid to tackle all the major questions of life, the universe and everything. 
Published every two months, it tries to @@ -25,9 +24,9 @@ class PhilosophyNow(BasicNewsRecipe): needs_subscription = True def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('https://philosophynow.org/auth/login') - br.select_form(nr = 1) + br.select_form(name="loginForm") br['username'] = self.username br['password'] = self.password br.submit() @@ -50,19 +49,20 @@ class PhilosophyNow(BasicNewsRecipe): #Go to the main body current_issue_url = 'http://philosophynow.org/issues/' + issuenum soup = self.index_to_soup(current_issue_url) - div = soup.find ('div', attrs={'class':'articlesColumn'}) + div = soup.find ('div', attrs={'class':'contentsColumn'}) feeds = OrderedDict() - for post in div.findAll('h3'): + + for post in div.findAll('h1'): articles = [] a=post.find('a',href=True) if a is not None: url="http://philosophynow.org" + a['href'] title=self.tag_to_string(a).strip() - s=post.findPrevious('h4') + s=post.findPrevious('h3') section_title = self.tag_to_string(s).strip() - d=post.findNext('p') + d=post.findNext('h2') desc = self.tag_to_string(d).strip() articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) @@ -73,3 +73,5 @@ class PhilosophyNow(BasicNewsRecipe): ans = [(key, val) for key, val in feeds.iteritems()] return ans + def cleanup(self): + self.browser.open('http://philosophynow.org/auth/logout') diff --git a/recipes/physics_today.recipe b/recipes/physics_today.recipe index d1ce17cf32..87c3889517 100644 --- a/recipes/physics_today.recipe +++ b/recipes/physics_today.recipe @@ -27,7 +27,7 @@ class Physicstoday(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f') br.select_form(name='login_form') diff --git a/recipes/pnn.recipe b/recipes/pnn.recipe new file mode 100644 index 0000000000..cb36afe88b --- /dev/null +++ b/recipes/pnn.recipe @@ -0,0 +1,55 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +'''Calibre recipe to convert the RSS feeds of the PNN to an ebook.''' + +class SportsIllustratedRecipe(BasicNewsRecipe) : + __author__ = 'n.kucklaender' + __copyright__ = 'a.peter' + __license__ = 'GPL v3' + language = 'de' + description = 'PNN RSS' + version = 1 + title = u'PNN' + timefmt = ' [%d.%m.%Y]' + + oldest_article = 7.0 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + publication_type = 'newspaper' + + remove_empty_feeds = True + remove_tags = [dict(attrs={'class':['um-weather um-header-weather','um-has-sub um-mainnav','um-box','ts-products','um-meta-nav','um-box um-last','um-footer','um-footer-links','share hidden','um-buttons']}),dict(id=['dinsContainer'])] + # remove_tags_before = [dict(name='div', attrs={'class':'um-first'})] + # remove_tags_after = [dict(name='div', attrs={'class':'um-metabar'})] + + feeds = [(u'Titelseite', u'http://www.pnn.de/rss.xml'), + (u'Dritte Seite', u'http://www.pnn.de/dritte-seite/rss.xml'), + (u'Politik', u'http://www.pnn.de/politik/rss.xml'), + (u'Meinung', u'http://www.pnn.de/meinung/rss.xml'), + (u'Potsdam', u'http://www.pnn.de/potsdam/rss.xml'), + (u'Havel-Spree', u'http://www.pnn.de/havel-spree/rss.xml'), + (u'Potsdam-Mittelmark', u'http://www.pnn.de/pm/rss.xml'), + (u'Berlin-Brandenburg', u'http://www.pnn.de/brandenburg-berlin/rss.xml'), + (u'Wirtschaft', u'http://www.pnn.de/wirtschaft/rss.xml'), + (u'Sport', 
u'http://www.pnn.de/sport/rss.xml'), + (u'Regionalsport', u'http://www.pnn.de/regionalsport/rss.xml'), + (u'Kultur', u'http://www.pnn.de/kultur/rss.xml'), + (u'Potsdam-Kultur', u'http://www.pnn.de/potsdam-kultur/rss.xml'), + (u'Wissen', u'http://www.pnn.de/wissen/rss.xml'), + (u'Medien', u'http://www.pnn.de/medien/rss.xml'), + (u'Weltspiegel', u'http://www.pnn.de/weltspiegel/rss.xml'), + (u'Wissenschaft', u'http://www.pnn.de/campus/rss.xml'), + (u'Mobil', u'http://www.pnn.de/mobil/rss.xml'), + (u'Reise', u'http://www.pnn.de/reise/rss.xml'), + (u'Ratgeber', u'http://www.pnn.de/ratgeber/rss.xml'), + (u'Fragen des Tages', u'http://www.pnn.de/fragen-des-tages/rss.xml'), + # (u'Potsdam bin ich', u'http://www.pnn.de/potsdam-bin-ich/rss.xml'), + (u'Leserbriefe', u'http://www.pnn.de/leserbriefe/rss.xml')] + + def get_masthead_url(self): + return 'http://www.pnn.de/app/base/img/pnn_logo.png' + + def print_version(self, url): + return url.replace('.html', ',view,printVersion.html') + diff --git a/recipes/poradnia_pwn.recipe b/recipes/poradnia_pwn.recipe new file mode 100644 index 0000000000..b3e2825618 --- /dev/null +++ b/recipes/poradnia_pwn.recipe @@ -0,0 +1,63 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from calibre.web.feeds.news import BasicNewsRecipe +class PoradniaPWN(BasicNewsRecipe): + title = u'Poradnia Językowa PWN' + __author__ = 'fenuks' + description = u'Internetowa poradnia językowa Wydawnictwa Naukowego PWN. Poradnię prowadzi Redaktor Naczelny Słowników Języka Polskiego, prof. Mirosław Bańko. Pomagają mu eksperci - znani polscy językoznawcy. Współpracuje z nami m.in. prof. Jerzy Bralczyk oraz dr Jan Grzenia.' + category = 'language' + language = 'pl' + #cover_url = '' + oldest_article = 14 + max_articles_per_feed = 100000 + INDEX = "http://poradnia.pwn.pl/" + no_stylesheets = True + remove_attributes = ['style'] + remove_javascript = True + use_embedded_content = False + #preprocess_regexps = [(re.compile('', re.IGNORECASE), lambda m: '
'), (re.compile('', re.IGNORECASE), lambda m: '
')] + keep_only_tags = [dict(name="div", attrs={"class":"searchhi"})] + feeds = [(u'Poradnia', u'http://rss.pwn.pl/poradnia.rss')] + + '''def find_articles(self, url): + articles = [] + soup=self.index_to_soup(url) + counter = int(soup.find(name='p', attrs={'class':'count'}).findAll('b')[-1].string) + counter = 500 + pos = 0 + next = url + while next: + soup=self.index_to_soup(next) + tag=soup.find(id="listapytan") + art=tag.findAll(name='li') + for i in art: + if i.h4: + title=i.h4.a.string + url=self.INDEX+i.h4.a['href'] + #date=soup.find(id='footer').ul.li.string[41:-1] + articles.append({'title' : title, + 'url' : url, + 'date' : '', + 'description' : '' + }) + pos += 10 + if not pos >=counter: + next = 'http://poradnia.pwn.pl/lista.php?kat=18&od=' + str(pos) + print u'Tworzenie listy artykułów dla', next + else: + next = None + print articles + return articles + + def parse_index(self): + feeds = [] + feeds.append((u"Poradnia", self.find_articles('http://poradnia.pwn.pl/lista.php'))) + + return feeds''' + + def preprocess_html(self, soup): + for i in soup.findAll(name=['ul', 'li']): + i.name="div" + for z in soup.findAll(name='a'): + if not z['href'].startswith('http'): + z['href'] = 'http://poradnia.pwn.pl/' + z['href'] + return soup diff --git a/recipes/pravda_rs.recipe b/recipes/pravda_rs.recipe new file mode 100644 index 0000000000..742527ac2b --- /dev/null +++ b/recipes/pravda_rs.recipe @@ -0,0 +1,85 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2013, Darko Miletic ' + +''' +www.pravda.rs +''' + +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Pravda_rs(BasicNewsRecipe): + title = 'Dnevne novine Pravda' + __author__ = 'Darko Miletic' + description = '24 sata portal vesti iz Srbije' + publisher = 'Dnevne novine Pravda' + category = 'news, politics, entertainment, Serbia' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'sr' + publication_type = 'newspaper' + remove_empty_feeds = True + PREFIX = 'http://www.pravda.rs' + FEEDPR = PREFIX + '/category/' + LANGLAT = '?lng=lat' + FEEDSU = '/feed/' + LANGLAT + INDEX = PREFIX + LANGLAT + masthead_url = 'http://www.pravda.rs/wp-content/uploads/2012/09/logoof.png' + extra_css = """ + @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} + body{font-family: Georgia,"Times New Roman",Times,serif1,serif;} + img{display: block} + """ + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] + + keep_only_tags = [dict(name='div', attrs={'class':'post'})] + remove_tags = [dict(name='h3')] + remove_tags_after = dict(name='h3') + + feeds = [ + (u'Politika' , FEEDPR + 'politika/' + FEEDSU), + (u'Tema Dana', FEEDPR + 'tema-dana/' + FEEDSU), + (u'Hronika' , FEEDPR + 'hronika/' + FEEDSU), + (u'Društvo' , FEEDPR + 'drustvo/' + FEEDSU), + (u'Ekonomija', FEEDPR + 'ekonomija/' + FEEDSU), + (u'Srbija' , FEEDPR + 'srbija/' + FEEDSU), + (u'Beograd' , FEEDPR + 'beograd/' + FEEDSU), + (u'Kultura' , FEEDPR + 'kultura/' + FEEDSU), + (u'Zabava' , FEEDPR + 'zabava/' + FEEDSU), + (u'Sport' , FEEDPR + 'sport/' + FEEDSU), + (u'Svet' , FEEDPR + 'svet/' + FEEDSU), + (u'Porodica' , FEEDPR + 'porodica/' + FEEDSU), + (u'Vremeplov', FEEDPR + 'vremeplov/' + FEEDSU), + (u'IT' , FEEDPR + 'it/' + FEEDSU), + (u'Republika Srpska', 
FEEDPR + 'republika-srpska/' + FEEDSU), + (u'Crna Gora', FEEDPR + 'crna-gora/' + FEEDSU), + (u'EX YU' , FEEDPR + 'eks-ju/' + FEEDSU), + (u'Dijaspora', FEEDPR + 'dijaspora/' + FEEDSU), + (u'Kolumna' , FEEDPR + 'kolumna/' + FEEDSU), + (u'Afere' , FEEDPR + 'afere/' + FEEDSU), + (u'Feljton' , FEEDPR + 'feljton/' + FEEDSU), + (u'Intervju' , FEEDPR + 'intervju/' + FEEDSU), + (u'Reportaža', FEEDPR + 'reportaza/' + FEEDSU), + (u'Zanimljivosti', FEEDPR + 'zanimljivosti/' + FEEDSU), + (u'Sa trga' , FEEDPR + 'sa-trga/' + FEEDSU) + ] + + def print_version(self, url): + return url + self.LANGLAT + + def preprocess_raw_html(self, raw, url): + return 'title'+raw[raw.find(''):] + \ No newline at end of file diff --git a/recipes/pro_physik.recipe b/recipes/pro_physik.recipe index eca10e96f3..3cd4f54361 100644 --- a/recipes/pro_physik.recipe +++ b/recipes/pro_physik.recipe @@ -2,21 +2,46 @@ from calibre.web.feeds.recipes import BasicNewsRecipe class AdvancedUserRecipe1303841067(BasicNewsRecipe): title = u'Pro Physik' - __author__ = 'schuster' - oldest_article = 4 + __author__ = 'schuster, Armin Geller' # AGE Upd. 2012-11-28 + oldest_article = 4 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'de' - remove_javascript = True - cover_url = 'http://www.pro-physik.de/Phy/images/site/prophysik_logo1.jpg' + + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + language = 'de' + + cover_url = 'http://www.pro-physik.de/prophy/images/bg_logo_prophy.gif' - def print_version(self, url): - return url.replace('leadArticle.do', 'print.do') + keep_only_tags = [ + dict(name='div', attrs={'class':['leftColRight']}) + ] + remove_tags = [ + dict(name='div', attrs={'class':["withMargin socialWrapper addthis_toolbox addthis_default_style"]}), + # AGe: If you don't want to see further information for the article + # and additional links, remove the # in the next line below +# dict(name='div', attrs={'class':["insideBox"]}), + ] - feeds = [(u'Hightech', u'http://www.pro-physik.de/Phy/hightechfeed.xml'), - (u'Forschung', u'http://www.pro-physik.de/Phy/forschungfeed.xml'), - (u'Magazin', u'http://www.pro-physik.de/Phy/magazinfeed.xml')] + feeds = [ + (u'Nachrichten', u'http://www.pro-physik.de/graphicalrss/prophy/newsFeed.xml'), + (u'Forschung', u'http://www.pro-physik.de/graphicalrss/prophy/newsforschungFeed.xml'), + (u'Technologie', u'http://www.pro-physik.de/graphicalrss/prophy/newstechnologieFeed.xml'), + (u'Industrie', u'http://www.pro-physik.de/graphicalrss/prophy/newsindustrieFeed.xml'), + (u'Hochschule', u'http://www.pro-physik.de/graphicalrss/prophy/newshochschuleFeed.xml'), + (u'Panorama', u'http://www.pro-physik.de/graphicalrss/prophy/newspanoramaFeed.xml'), + (u'DPG', u'http://www.pro-physik.de/graphicalrss/prophy/newsdpgFeed.xml'), + (u'Physik Journal', u'http://www.pro-physik.de/graphicalrss/prophy/pjnewsFeed.xml'), + (u'Veranstaltungen', u'http://www.pro-physik.de/rss/prophy/eventsFeed.xml'), + # AGe: if you want to see job offers, remove the # on the lines below + +# (u'Stellenmarkt', u'http://www.pro-physik.de/rss/prophy/jobsFeed.xml'), +# (u'Industrie Stellenanzeigen', u'http://www.pro-physik.de/rss/prophy/jobsindustrieFeed.xml'), +# (u'PhD Stellenanzeigen', u'http://www.pro-physik.de/rss/prophy/jobsphdFeed.xml'), +# (u'PostDoc Stellenanzeigen', u'http://www.pro-physik.de/rss/prophy/jobspostdocFeed.xml'), +# (u'Öffentlicher Dienst Stellenanzeigen', u'http://www.pro-physik.de/rss/prophy/jobsdienstFeed.xml'), +# (u'Hochschule Stellenanzeigen',
u'http://www.pro-physik.de/rss/prophy/jobshochschuleFeed.xml'), + ] diff --git a/recipes/prospectmaguk.recipe b/recipes/prospectmaguk.recipe index 4ea725dded..8dd73ab8d9 100644 --- a/recipes/prospectmaguk.recipe +++ b/recipes/prospectmaguk.recipe @@ -30,7 +30,7 @@ class ProspectMagUK(BasicNewsRecipe): INDEX = 'http://www.prospectmagazine.co.uk/issue/' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.prospectmagazine.co.uk/wp-login.php') br.select_form(name='loginform') diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe index e1d90b22c2..8344d82826 100644 --- a/recipes/readitlater.recipe +++ b/recipes/readitlater.recipe @@ -34,7 +34,7 @@ class Pocket(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None: br.open(self.LOGIN) br.select_form(nr=0) diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe index cbf5a2f8e4..7c420a21b9 100644 --- a/recipes/real_clear.recipe +++ b/recipes/real_clear.recipe @@ -128,7 +128,7 @@ class RealClear(BasicNewsRecipe): def get_browser(self): if self.debugMessages == True : print("In get_browser") - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) return br def parseRSS(self, index) : diff --git a/recipes/regina_leader_post.recipe b/recipes/regina_leader_post.recipe index 77842b96b6..0ca00da268 100644 --- a/recipes/regina_leader_post.recipe +++ b/recipes/regina_leader_post.recipe @@ -101,14 +101,14 @@ class CanWestPaper(BasicNewsRecipe): if self.fp_tag=='': return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/saskatoon_star_phoenix.recipe b/recipes/saskatoon_star_phoenix.recipe index b17e1ee038..2a2e90e28f 100644 --- a/recipes/saskatoon_star_phoenix.recipe +++ b/recipes/saskatoon_star_phoenix.recipe @@ -101,14 +101,14 @@ class CanWestPaper(BasicNewsRecipe): if self.fp_tag=='': return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/schattenblick.recipe b/recipes/schattenblick.recipe new file mode 100644 index 0000000000..d02ec1d272 --- /dev/null +++ b/recipes/schattenblick.recipe @@ -0,0 +1,13 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1345802300(BasicNewsRecipe): + title = u'Online-Zeitung Schattenblick' + language = 'de' + __author__ = 'ThB' + publisher = u'MA-Verlag' + category = u'Nachrichten' + oldest_article = 7 + max_articles_per_feed = 100 + cover_url = 'http://www.schattenblick.de/mobi/rss/cover.jpg' + 
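The change repeated across the recipe updates in this patch is calling the base class browser factory with an explicit instance, BasicNewsRecipe.get_browser(self), instead of BasicNewsRecipe.get_browser(). A minimal sketch of a login-capable recipe using that pattern; the class name, login URL and form field names are placeholders, not taken from any recipe here:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleLoginRecipe(BasicNewsRecipe):
    # Hypothetical recipe, shown only to illustrate the get_browser(self) call pattern.
    title = 'Example site'
    needs_subscription = True
    LOGIN = 'http://example.com/login'  # placeholder login URL

    def get_browser(self):
        # Call the base implementation with an explicit instance, as the updated recipes do.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)  # assume the login form is the first form on the page
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br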
feeds = [(u'Schattenblick Tagesausgabe', u'http://www.schattenblick.de/mobi/rss/rss.xml')] + diff --git a/recipes/science_aas.recipe b/recipes/science_aas.recipe index 2d486e4458..f2810fdeaa 100644 --- a/recipes/science_aas.recipe +++ b/recipes/science_aas.recipe @@ -24,7 +24,7 @@ class ScienceAAS(BasicNewsRecipe): LOGIN = 'http://www.sciencemag.org/cgi/login?uri=%2Findex.dtl' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.LOGIN) br.select_form(nr=0) diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe index a4f4bf497c..6c66231da7 100644 --- a/recipes/scmp.recipe +++ b/recipes/scmp.recipe @@ -4,7 +4,6 @@ __copyright__ = '2010, Darko Miletic ' scmp.com ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class SCMP(BasicNewsRecipe): @@ -18,10 +17,11 @@ class SCMP(BasicNewsRecipe): max_articles_per_feed = 200 no_stylesheets = True encoding = 'utf-8' + auto_cleanup = True use_embedded_content = False language = 'en_CN' remove_empty_feeds = True - needs_subscription = True + needs_subscription = 'optional' publication_type = 'newspaper' masthead_url = 'http://www.scmp.com/images/logo_scmp_home.gif' extra_css = ' body{font-family: Arial,Helvetica,sans-serif } ' @@ -34,7 +34,7 @@ class SCMP(BasicNewsRecipe): } def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) #br.set_debug_http(True) #br.set_debug_responses(True) #br.set_debug_redirects(True) @@ -46,17 +46,17 @@ class SCMP(BasicNewsRecipe): br.submit() return br - remove_attributes=['width','height','border'] + #remove_attributes=['width','height','border'] - keep_only_tags = [ - dict(attrs={'id':['ART','photoBox']}) - ,dict(attrs={'class':['article_label','article_byline','article_body']}) - ] + #keep_only_tags = [ + #dict(attrs={'id':['ART','photoBox']}) + #,dict(attrs={'class':['article_label','article_byline','article_body']}) + #] - preprocess_regexps = [ - (re.compile(r'

).)*

', re.DOTALL|re.IGNORECASE), - lambda match: ''), - ] + #preprocess_regexps = [ + #(re.compile(r'

).)*', re.DOTALL|re.IGNORECASE), + #lambda match: ''), + #] feeds = [ (u'Business' , u'http://www.scmp.com/rss/business.xml' ) @@ -68,13 +68,13 @@ class SCMP(BasicNewsRecipe): ,(u'Sport' , u'http://www.scmp.com/rss/sport.xml' ) ] - def print_version(self, url): - rpart, sep, rest = url.rpartition('&') - return rpart #+ sep + urllib.quote_plus(rest) + #def print_version(self, url): + #rpart, sep, rest = url.rpartition('&') + #return rpart #+ sep + urllib.quote_plus(rest) - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - items = soup.findAll(src="/images/label_icon.gif") - [item.extract() for item in items] - return self.adeify_images(soup) + #def preprocess_html(self, soup): + #for item in soup.findAll(style=True): + #del item['style'] + #items = soup.findAll(src="/images/label_icon.gif") + #[item.extract() for item in items] + #return self.adeify_images(soup) diff --git a/recipes/seattle_times.recipe b/recipes/seattle_times.recipe index 631dfa58f1..9ae40d1f20 100644 --- a/recipes/seattle_times.recipe +++ b/recipes/seattle_times.recipe @@ -23,6 +23,7 @@ class SeattleTimes(BasicNewsRecipe): language = 'en' auto_cleanup = True auto_cleanup_keep = '//div[@id="PhotoContainer"]' + cover_url = 'http://seattletimes.com/PDF/frontpage.pdf' feeds = [ (u'Top Stories', diff --git a/recipes/singtaohk.recipe b/recipes/singtaohk.recipe index a038372693..d830381731 100644 --- a/recipes/singtaohk.recipe +++ b/recipes/singtaohk.recipe @@ -102,7 +102,7 @@ class STHKRecipe(BasicNewsRecipe): diff = todaydate - date(2011, 12, 29) base = base + int(diff.total_seconds()/(3600*24)) cover = 'http://singtao.com/media/a/a(' + str(base) +').jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/sivil_dusunce.recipe b/recipes/sivil_dusunce.recipe index 66bb895f0f..3ae665771a 100644 --- a/recipes/sivil_dusunce.recipe +++ b/recipes/sivil_dusunce.recipe @@ -1,12 +1,13 @@ -from calibre.web.feeds.news import BasicNewsRecipe +# -*- coding: utf-8 -*- -class BasicUserRecipe1324913680(BasicNewsRecipe): +from calibre.web.feeds.news import BasicNewsRecipe +class AdvancedUserRecipe1355341662(BasicNewsRecipe): title = u'Sivil Dusunce' language = 'tr' __author__ = 'asalet_r' oldest_article = 7 - max_articles_per_feed = 20 + max_articles_per_feed = 50 auto_cleanup = True - feeds = [(u'Sivil Dusunce', u'http://www.sivildusunce.com/feed/')] + feeds = [(u'Sivil Dusunce', u'http://www.sivildusunce.com/?t=rss&xml=1')] diff --git a/recipes/slate.recipe b/recipes/slate.recipe index 36560cdf33..28d35a415e 100644 --- a/recipes/slate.recipe +++ b/recipes/slate.recipe @@ -85,7 +85,7 @@ class Slate(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/smilezilla.recipe b/recipes/smilezilla.recipe index 242ee8c42a..3f5534785a 100644 --- a/recipes/smilezilla.recipe +++ b/recipes/smilezilla.recipe @@ -34,7 +34,7 @@ class SmileZilla(BasicNewsRecipe): f.close() return BeautifulSoup(html, fromEncoding=self.encoding) - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) response = br.open(url) html = response.read() soup = BeautifulSoup(html, fromEncoding=self.encoding) diff --git a/recipes/smith.recipe b/recipes/smith.recipe index 3d6a95c494..cd0c94ab35 100644 --- a/recipes/smith.recipe +++ b/recipes/smith.recipe @@ 
-48,10 +48,14 @@ class Smithsonian(BasicNewsRecipe): link=post.find('a',href=True) url=link['href']+'?c=y&story=fullstory' if subsection is not None: - subsection_title = self.tag_to_string(subsection) + subsection_title = self.tag_to_string(subsection).strip() prefix = (subsection_title+': ') description=self.tag_to_string(post('p', limit=2)[1]).strip() else: + if post.find('img') is not None: + subsection_title = self.tag_to_string(post.findPrevious('div', attrs={'class':'departments plainModule'}).find('p', attrs={'class':'article-cat'})).strip() + prefix = (subsection_title+': ') + description=self.tag_to_string(post.find('p')).strip() desc=re.sub('\sBy\s.*', '', description, re.DOTALL) author=re.sub('.*By\s', '', description, re.DOTALL) @@ -64,4 +68,3 @@ class Smithsonian(BasicNewsRecipe): feeds[section_title] += articles ans = [(key, val) for key, val in feeds.iteritems()] return ans - diff --git a/recipes/spectator_magazine.recipe b/recipes/spectator_magazine.recipe new file mode 100644 index 0000000000..eb61a8babd --- /dev/null +++ b/recipes/spectator_magazine.recipe @@ -0,0 +1,60 @@ +from calibre.web.feeds.recipes import BasicNewsRecipe + +class NYTimes(BasicNewsRecipe): + + title = 'Spectator Magazine' + __author__ = 'Krittika Goyal' + description = 'Magazine' + timefmt = ' [%d %b, %Y]' + needs_subscription = False + language = 'en' + + no_stylesheets = True + #auto_cleanup = True + #auto_cleanup_keep = '//div[@class="thumbnail"]' + + keep_only_tags = dict(name='div', attrs={'id':'content'}) + remove_tags = [ + dict(name='div', attrs={'id':['disqus_thread']}), + ##dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}), + ##dict(name='form', attrs={'onsubmit':''}), + #dict(name='section', attrs={'id':['article-quote', 'article-navigation']}), + ] + + #TO GET ARTICLE TOC + def spec_get_index(self): + return self.index_to_soup('http://www.spectator.co.uk/') + + # To parse artice toc + def parse_index(self): + parse_soup = self.index_to_soup('http://www.spectator.co.uk/') + + feeds = [] + feed_title = 'Spectator Magazine Articles' + + articles = [] + self.log('Found section:', feed_title) + div = parse_soup.find(attrs={'class':'one-col-tax-widget magazine-list columns-1 post-8 taxonomy-category full-width widget section-widget icit-taxonomical-listings'}) + for art in div.findAll(name='h2'): + art_info = art.find(name = 'a') + if art_info is None: + continue + art_title = self.tag_to_string(art_info) + url = art_info.get('href') + self.log.info('\tFound article:', art_title, 'at', url) + article = {'title':art_title, 'url':url, 'date':''} + #au = art.find(attrs={'class':'articleAuthors'}) + #if au is not None: + #article['author'] = self.tag_to_string(au) + #desc = art.find(attrs={'class':'hover_text'}) + #if desc is not None: + #desc = self.tag_to_string(desc) + #if 'author' in article: + #desc = ' by ' + article['author'] + ' ' +desc + #article['description'] = desc + articles.append(article) + if articles: + feeds.append((feed_title, articles)) + + return feeds + diff --git a/recipes/spiders_web_pl.recipe b/recipes/spiders_web_pl.recipe index 678ee5c640..b593d6b837 100644 --- a/recipes/spiders_web_pl.recipe +++ b/recipes/spiders_web_pl.recipe @@ -5,11 +5,14 @@ class SpidersWeb(BasicNewsRecipe): oldest_article = 7 __author__ = 'fenuks' description = u'' - cover_url = 'http://www.spidersweb.pl/wp-content/themes/spiderweb/img/Logo.jpg' + cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png' category = 'IT, WEB' language = 'pl' 
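Several recipes in this patch (e.g. spiders_web_pl, tidbits, tvp_info, ubuntu_pomoc_org) do their cleanup declaratively through keep_only_tags, remove_tags_after and remove_tags rather than custom parsing code. An illustrative sketch of that style; the id and class values are placeholders, not taken from any real site:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleCleanupRecipe(BasicNewsRecipe):
    # Hypothetical recipe showing declarative tag-based cleanup.
    title = 'Example cleanup'
    no_stylesheets = True
    use_embedded_content = False
    # Keep only the article container...
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    # ...cut everything after the footer marker...
    remove_tags_after = dict(attrs={'class': 'article-footer'})
    # ...and drop sharing/comment widgets inside the kept block.
    remove_tags = [dict(name='div', attrs={'class': ['social', 'comments']})]

    feeds = [(u'News', u'http://example.com/feed')]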
no_stylesheers=True + remove_javascript = True + use_embedded_content = False max_articles_per_feed = 100 - keep_only_tags=[dict(id='Post')] - remove_tags=[dict(name='div', attrs={'class':['Comments', 'Shows', 'Post-Tags']}), dict(id='Author-Column')] + keep_only_tags=[dict(id='start')] + remove_tags_after = dict(attrs={'class':'padding20'}) + remove_tags=[dict(name='div', attrs={'class':['padding border-bottom', 'padding20', 'padding border-top']})] feeds = [(u'Wpisy', u'http://www.spidersweb.pl/feed')] diff --git a/recipes/st_louis_post_dispatch.recipe b/recipes/st_louis_post_dispatch.recipe index 3b7701cedc..6d22a327ab 100644 --- a/recipes/st_louis_post_dispatch.recipe +++ b/recipes/st_louis_post_dispatch.recipe @@ -7,12 +7,16 @@ class AdvancedUserRecipe1282093204(BasicNewsRecipe): oldest_article = 1 max_articles_per_feed = 15 + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True masthead_url = 'http://farm5.static.flickr.com/4118/4929686950_0e22e2c88a.jpg' feeds = [ (u'News-Bill McClellan', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fcolumns%2Fbill-mclellan&f=rss&t=article'), (u'News-Columns', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcolumns*&l=50&f=rss&t=article'), - (u'News-Crime & Courtshttp://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcrime-and-courts&l=50&f=rss&t=article'), + (u'News-Crime & Courts', 'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcrime-and-courts&l=50&f=rss&t=article'), (u'News-Deb Peterson', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fcolumns%2Fdeb-peterson&f=rss&t=article'), (u'News-Education', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2feducation&f=rss&t=article'), (u'News-Government & Politics', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fgovt-and-politics&f=rss&t=article'), @@ -62,9 +66,9 @@ class AdvancedUserRecipe1282093204(BasicNewsRecipe): (u'Entertainment-House-O-Fun', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=entertainment%2Fhouse-o-fun&l=100&f=rss&t=article'), (u'Entertainment-Kevin C. 
Johnson', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=entertainment%2Fmusic%2Fkevin-johnson&l=100&f=rss&t=article') ] - remove_empty_feeds = True - remove_tags = [dict(name='div', attrs={'id':'blox-logo'}),dict(name='a')] - keep_only_tags = [dict(name='h1'), dict(name='p', attrs={'class':'byline'}), dict(name="div", attrs={'id':'blox-story-text'})] + #remove_empty_feeds = True + #remove_tags = [dict(name='div', attrs={'id':'blox-logo'}),dict(name='a')] + #keep_only_tags = [dict(name='h1'), dict(name='p', attrs={'class':'byline'}), dict(name="div", attrs={'id':'blox-story-text'})] extra_css = 'p {text-align: left;}' diff --git a/recipes/staradvertiser.recipe b/recipes/staradvertiser.recipe index bf9c6460a1..283f2bba3c 100644 --- a/recipes/staradvertiser.recipe +++ b/recipes/staradvertiser.recipe @@ -52,7 +52,7 @@ class Starbulletin(BasicNewsRecipe): ] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://www.staradvertiser.com/manage/Login/') br.select_form(name='loginForm') diff --git a/recipes/sueddeutsche.recipe b/recipes/sueddeutsche.recipe index 624321e730..33750b024c 100644 --- a/recipes/sueddeutsche.recipe +++ b/recipes/sueddeutsche.recipe @@ -8,19 +8,19 @@ Fetch sueddeutsche.de from calibre.web.feeds.news import BasicNewsRecipe class Sueddeutsche(BasicNewsRecipe): - title = u'Süddeutsche.de' # 2012-01-26 AGe Correct Title - description = 'News from Germany, Access to online content' # 2012-01-26 AGe - __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-01-26 - publisher = u'Süddeutsche Zeitung' # 2012-01-26 AGe add - category = 'news, politics, Germany' # 2012-01-26 AGe add - timefmt = ' [%a, %d %b %Y]' # 2012-01-26 AGe add %a + title = u'Süddeutsche.de' + description = 'News from Germany, Access to online content' + __author__ = 'Oliver Niesner and Armin Geller' #Update AGe 2012-12-05 + publisher = u'Süddeutsche Zeitung' + category = 'news, politics, Germany' + timefmt = ' [%a, %d %b %Y]' oldest_article = 7 max_articles_per_feed = 100 language = 'de' encoding = 'utf-8' - publication_type = 'newspaper' # 2012-01-26 add + publication_type = 'newspaper' cover_source = 'http://www.sueddeutsche.de/verlag' # 2012-01-26 AGe add from Darko Miletic paid content source - masthead_url = 'http://www.sueddeutsche.de/static_assets/build/img/sdesiteheader/logo_homepage.441d531c.png' # 2012-01-26 AGe add + masthead_url = 'http://www.sueddeutsche.de/static_assets/img/sdesiteheader/logo_standard.a152b0df.png' # 2012-12-05 AGe add use_embedded_content = False no_stylesheets = True @@ -40,9 +40,9 @@ class Sueddeutsche(BasicNewsRecipe): (u'Sport', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ESport%24?output=rss'), (u'Leben', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5ELeben%24?output=rss'), (u'Karriere', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EKarriere%24?output=rss'), - (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), #2012-01-26 AGe New - (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), #2012-01-26 AGe New - (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), #2012-01-26 AGe New + (u'Bildung', u'http://rss.sueddeutsche.de/rss/bildung'), + (u'Gesundheit', u'http://rss.sueddeutsche.de/rss/gesundheit'), + (u'Stil', u'http://rss.sueddeutsche.de/rss/stil'), (u'München & Region', 
u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMünchen&Region%24?output=rss'), (u'Bayern', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EBayern%24?output=rss'), (u'Medien', u'http://suche.sueddeutsche.de/query/%23/sort/-docdatetime/drilldown/%C2%A7ressort%3A%5EMedien%24?output=rss'), diff --git a/recipes/sueddeutsche_mobil.recipe b/recipes/sueddeutsche_mobil.recipe index d1b08cbcba..7b1a9c4d8d 100644 --- a/recipes/sueddeutsche_mobil.recipe +++ b/recipes/sueddeutsche_mobil.recipe @@ -1,13 +1,16 @@ -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai __license__ = 'GPL v3' -__copyright__ = '2012, Andreas Zeiser ' +__copyright__ = '2012, 2013 Andreas Zeiser ' ''' szmobil.sueddeutsche.de/ ''' +# History +# 2013.01.09 Fixed bugs in article titles containing "strong" and +# other small changes +# 2012.08.04 Initial release from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -import re +import re class SZmobil(BasicNewsRecipe): title = u'Süddeutsche Zeitung mobil' @@ -26,6 +29,8 @@ class SZmobil(BasicNewsRecipe): delay = 1 cover_source = 'http://www.sueddeutsche.de/verlag' + # if you want to get rid of the date on the title page use + # timefmt = '' timefmt = ' [%a, %d %b, %Y]' root_url ='http://szmobil.sueddeutsche.de/' @@ -50,7 +55,7 @@ class SZmobil(BasicNewsRecipe): return browser - def parse_index(self): + def parse_index(self): # find all sections src = self.index_to_soup('http://szmobil.sueddeutsche.de') feeds = [] @@ -76,10 +81,10 @@ class SZmobil(BasicNewsRecipe): # first check if link is a special article in section "Meinungsseite" if itt.find('strong')!= None: article_name = itt.strong.string - article_shorttitle = itt.contents[1] + if len(itt.contents)>1: + shorttitles[article_id] = itt.contents[1] articles.append( (article_name, article_url, article_id) ) - shorttitles[article_id] = article_shorttitle continue @@ -89,7 +94,7 @@ class SZmobil(BasicNewsRecipe): else: article_name = itt.string - if (article_name[0:10] == " mehr"): + if (article_name.find(" mehr") == 0): # just another link ("mehr") to an article continue @@ -102,7 +107,9 @@ class SZmobil(BasicNewsRecipe): for article_name, article_url, article_id in articles: url = self.root_url + article_url title = article_name - pubdate = strftime('%a, %d %b') + # if you want to get rid of date for each article use + # pubdate = strftime('') + pubdate = strftime('[%a, %d %b]') description = '' if shorttitles.has_key(article_id): description = shorttitles[article_id] @@ -115,3 +122,4 @@ class SZmobil(BasicNewsRecipe): return all_articles + diff --git a/recipes/sueddeutschezeitung.recipe b/recipes/sueddeutschezeitung.recipe index f38f80dd45..281f66ef59 100644 --- a/recipes/sueddeutschezeitung.recipe +++ b/recipes/sueddeutschezeitung.recipe @@ -40,7 +40,7 @@ class SueddeutcheZeitung(BasicNewsRecipe): remove_attributes = ['height','width','style'] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open(self.INDEX) br.select_form(name='lbox') diff --git a/recipes/sunday_times.recipe b/recipes/sunday_times.recipe index 973f1792c7..2ffb65423d 100644 --- a/recipes/sunday_times.recipe +++ b/recipes/sunday_times.recipe @@ -42,7 +42,7 @@ class TimesOnline(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.thesundaytimes.co.uk/sto/') if 
self.username is not None and self.password is not None: data = urllib.urlencode({ diff --git a/recipes/techtarget.recipe b/recipes/techtarget.recipe index 888ec7fac0..d5f4972a92 100644 --- a/recipes/techtarget.recipe +++ b/recipes/techtarget.recipe @@ -17,7 +17,7 @@ class TechTarget(BasicNewsRecipe): LOGIN = u'http://searchservervirtualization.techtarget.com/login' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None: br.open(self.LOGIN) br.select_form(nr=1) diff --git a/recipes/the_age.recipe b/recipes/the_age.recipe index 415ff0a25d..b9e59527b6 100644 --- a/recipes/the_age.recipe +++ b/recipes/the_age.recipe @@ -24,7 +24,7 @@ class TheAge(BasicNewsRecipe): remove_tags = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})] def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.set_handle_refresh(False) return br diff --git a/recipes/the_nation.recipe b/recipes/the_nation.recipe index a830817762..073beb067c 100644 --- a/recipes/the_nation.recipe +++ b/recipes/the_nation.recipe @@ -45,7 +45,7 @@ class Thenation(BasicNewsRecipe): return url.replace('.thenation.com/','.thenation.com/print/') def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.thenation.com/') if self.username is not None and self.password is not None: br.open(self.login_url) diff --git a/recipes/thestar.recipe b/recipes/thestar.recipe index f667b86472..59c3b43c6b 100644 --- a/recipes/thestar.recipe +++ b/recipes/thestar.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2013, Darko Miletic ' ''' www.thestar.com ''' @@ -11,18 +9,17 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheTorontoStar(BasicNewsRecipe): title = 'The Toronto Star' __author__ = 'Darko Miletic' - description = "Canada's largest daily newspaper" + description = "Thestar.com is Canada's largest online news site. 
Stay current with our sports, business entertainment news and more from the Toronto Star and thestar.com" oldest_article = 2 language = 'en_CA' max_articles_per_feed = 100 no_stylesheets = True - #auto_cleanup = True - #auto_cleanup_keep = '//div[@class="topsContent topsContentActive"]' use_embedded_content = False delay = 2 publisher = 'The Toronto Star' category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson" encoding = 'utf-8' + masthead_url = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png' conversion_options = { 'comments' : description @@ -30,23 +27,18 @@ class TheTorontoStar(BasicNewsRecipe): ,'publisher' : publisher } - #keep_only_tags = [dict(name='div', attrs={'class':'ts-article'})] - #remove_tags_before = dict(name='div',attrs={'id':'ts-article_header'}) + remove_tags_before = dict(name='div',attrs={'class':'article-headline'}) feeds = [ - (u'News' , u'http://www.thestar.com/rss/?categories=293' ) - ,(u'Opinion' , u'http://www.thestar.com/rss/?categories=303' ) - ,(u'Business' , u'http://www.thestar.com/rss/?categories=294' ) - ,(u'Sports' , u'http://www.thestar.com/rss/?categories=295' ) - ,(u'Entertainment', u'http://www.toronto.com/rss?categories=6298' ) - ,(u'Living' , u'http://www.thestar.com/rss/?categories=297' ) - ,(u'Travel' , u'http://www.thestar.com/rss/list/1042246?' ) - ,(u'Science' , u'http://www.thestar.com/rss?categories=6481') + (u'News' , u'http://www.thestar.com/feeds.articles.news.rss' ) + ,(u'Opinion' , u'http://www.thestar.com/feeds.articles.opinion.rss' ) + ,(u'Business' , u'http://www.thestar.com/feeds.articles.business.rss' ) + ,(u'Sports' , u'http://www.thestar.com/feeds.articles.sports.rss' ) + ,(u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss' ) + ,(u'Living' , u'http://www.thestar.com/feeds.articles.life.rss' ) + ,(u'Travel' , u'http://www.thestar.com/feeds.articles.life.travel.rss' ) + ,(u'Technology' , u'http://www.thestar.com/feeds.articles.life.technology.rss') ] def print_version(self, url): - artl = url.rpartition('--')[0] - artid = artl.rpartition('/')[2] - return 'http://www.thestar.com/printarticle/' + artid - - + return url.replace('.html', '.print.html') diff --git a/recipes/tidbits.recipe b/recipes/tidbits.recipe index 702c65e9e4..a053dfb91f 100644 --- a/recipes/tidbits.recipe +++ b/recipes/tidbits.recipe @@ -16,8 +16,9 @@ class TidBITS(BasicNewsRecipe): oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True + #auto_cleanup = True encoding = 'utf-8' - use_embedded_content = True + use_embedded_content = False language = 'en' remove_empty_feeds = True masthead_url = 'http://db.tidbits.com/images/tblogo9.gif' @@ -30,9 +31,11 @@ class TidBITS(BasicNewsRecipe): , 'language' : language } - remove_attributes = ['width','height'] - remove_tags = [dict(name='small')] - remove_tags_after = dict(name='small') + #remove_attributes = ['width','height'] + #remove_tags = [dict(name='small')] + #remove_tags_after = dict(name='small') + keep_only_tags = [dict(name='div', attrs={'id':'center_ajax_sub'})] + remove_tags = [dict(name='div', attrs={'id':'social-media'})] feeds = [ (u'Business Apps' , u'http://db.tidbits.com/feeds/business.rss' ) diff --git a/recipes/times_online.recipe b/recipes/times_online.recipe index 1299c92fa3..2213c3a116 100644 --- 
a/recipes/times_online.recipe +++ b/recipes/times_online.recipe @@ -41,7 +41,7 @@ class TimesOnline(BasicNewsRecipe): def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open('http://www.thetimes.co.uk/tto/news/') if self.username is not None and self.password is not None: data = urllib.urlencode({ diff --git a/recipes/titanic_de.recipe b/recipes/titanic_de.recipe new file mode 100644 index 0000000000..edc9580602 --- /dev/null +++ b/recipes/titanic_de.recipe @@ -0,0 +1,20 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Titanic(BasicNewsRecipe): + title = u'Titanic' + language = 'de' + __author__ = 'Krittika Goyal' + oldest_article = 14 #days + max_articles_per_feed = 25 + #encoding = 'cp1252' + use_embedded_content = False + + no_stylesheets = True + auto_cleanup = True + + + feeds = [ +('News', + 'http://www.titanic-magazin.de/ich.war.bei.der.waffen.rss'), +] + diff --git a/recipes/todays_zaman.recipe b/recipes/todays_zaman.recipe index 5f3b85131a..13d82e31fb 100644 --- a/recipes/todays_zaman.recipe +++ b/recipes/todays_zaman.recipe @@ -26,28 +26,33 @@ class TodaysZaman_en(BasicNewsRecipe): # remove_attributes = ['width','height'] feeds = [ - ( u'Home', u'http://www.todayszaman.com/rss?sectionId=0'), - ( u'News', u'http://www.todayszaman.com/rss?sectionId=100'), - ( u'Business', u'http://www.todayszaman.com/rss?sectionId=105'), - ( u'Interviews', u'http://www.todayszaman.com/rss?sectionId=8'), - ( u'Columnists', u'http://www.todayszaman.com/rss?sectionId=6'), - ( u'Op-Ed', u'http://www.todayszaman.com/rss?sectionId=109'), - ( u'Arts & Culture', u'http://www.todayszaman.com/rss?sectionId=110'), - ( u'Expat Zone', u'http://www.todayszaman.com/rss?sectionId=132'), - ( u'Sports', u'http://www.todayszaman.com/rss?sectionId=5'), - ( u'Features', u'http://www.todayszaman.com/rss?sectionId=116'), - ( u'Travel', u'http://www.todayszaman.com/rss?sectionId=117'), - ( u'Leisure', u'http://www.todayszaman.com/rss?sectionId=118'), - ( u'Weird But True', u'http://www.todayszaman.com/rss?sectionId=134'), - ( u'Life', u'http://www.todayszaman.com/rss?sectionId=133'), - ( u'Health', u'http://www.todayszaman.com/rss?sectionId=126'), - ( u'Press Review', u'http://www.todayszaman.com/rss?sectionId=130'), - ( u'Todays think tanks', u'http://www.todayszaman.com/rss?sectionId=159'), - - ] + ( u'Home', u'http://www.todayszaman.com/0.rss'), + ( u'Sports', u'http://www.todayszaman.com/5.rss'), + ( u'Columnists', u'http://www.todayszaman.com/6.rss'), + ( u'Interviews', u'http://www.todayszaman.com/9.rss'), + ( u'News', u'http://www.todayszaman.com/100.rss'), + ( u'National', u'http://www.todayszaman.com/101.rss'), + ( u'Diplomacy', u'http://www.todayszaman.com/102.rss'), + ( u'World', u'http://www.todayszaman.com/104.rss'), + ( u'Business', u'http://www.todayszaman.com/105.rss'), + ( u'Op-Ed', u'http://www.todayszaman.com/109.rss'), + ( u'Arts & Culture', u'http://www.todayszaman.com/110.rss'), + ( u'Features', u'http://www.todayszaman.com/116.rss'), + ( u'Travel', u'http://www.todayszaman.com/117.rss'), + ( u'Food', u'http://www.todayszaman.com/124.rss'), + ( u'Press Review', u'http://www.todayszaman.com/130.rss'), + ( u'Expat Zone', u'http://www.todayszaman.com/132.rss'), + ( u'Life', u'http://www.todayszaman.com/133.rss'), + ( u'Think Tanks', u'http://www.todayszaman.com/159.rss'), + ( u'Almanac', u'http://www.todayszaman.com/161.rss'), + ( u'Health', u'http://www.todayszaman.com/162.rss'), + ( u'Fashion & Beauty', 
u'http://www.todayszaman.com/163.rss'), + ( u'Science & Technology', u'http://www.todayszaman.com/349.rss'), + ] #def preprocess_html(self, soup): # return self.adeify_images(soup) #def print_version(self, url): #there is a probem caused by table format #return url.replace('http://www.todayszaman.com/newsDetail_getNewsById.action?load=detay&', 'http://www.todayszaman.com/newsDetail_openPrintPage.action?') + diff --git a/recipes/tomshardware.recipe b/recipes/tomshardware.recipe index bcf154c8fb..c75b19189d 100644 --- a/recipes/tomshardware.recipe +++ b/recipes/tomshardware.recipe @@ -33,7 +33,7 @@ class Tomshardware(BasicNewsRecipe): html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX+'/us/') if self.username is not None and self.password is not None: data = urllib.urlencode({ 'action':'login_action' diff --git a/recipes/toyokeizai.recipe b/recipes/toyokeizai.recipe index a50558f8e8..c8e57eba53 100644 --- a/recipes/toyokeizai.recipe +++ b/recipes/toyokeizai.recipe @@ -58,7 +58,7 @@ class Toyokeizai(BasicNewsRecipe): return feeds def get_browser(self): - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://member.toyokeizai.net/norights/form/') br.select_form(nr=0) diff --git a/recipes/tsn.recipe b/recipes/tsn.recipe index e822ebc633..6c3dbe5159 100644 --- a/recipes/tsn.recipe +++ b/recipes/tsn.recipe @@ -7,28 +7,15 @@ class AdvancedUserRecipe1289990851(BasicNewsRecipe): language = 'en_CA' __author__ = 'Nexus' no_stylesheets = True + auto_cleanup = True + use_embedded_content = False INDEX = 'http://tsn.ca/nhl/story/?id=nhl' - keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}), - dict(name='div', attrs={'id':['tsnStory']})] - remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}), - dict(name='div', attrs={'class':'textSize'})] - - def parse_index(self): - feeds = [] - soup = self.index_to_soup(self.INDEX) - feed_parts = soup.findAll('div', attrs={'class': 'feature'}) - for feed_part in feed_parts: - articles = [] - if not feed_part.h2: - continue - feed_title = feed_part.h2.string - article_parts = feed_part.findAll('a') - for article_part in article_parts: - article_title = article_part.string - article_date = '' - article_url = 'http://tsn.ca/' + article_part['href'] - articles.append({'title': article_title, 'url': article_url, 'description':'', 'date':article_date}) - if articles: - feeds.append((feed_title, articles)) - return feeds + #keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}), + #dict(name='div', attrs={'id':['tsnStory']})] + #remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}), + #dict(name='div', attrs={'class':'textSize'})] + feeds = [ +('News', + 'http://www.tsn.ca/datafiles/rss/Stories.xml'), +] diff --git a/recipes/tvp_info.recipe b/recipes/tvp_info.recipe new file mode 100644 index 0000000000..64528d4194 --- /dev/null +++ b/recipes/tvp_info.recipe @@ -0,0 +1,20 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from calibre.web.feeds.news import BasicNewsRecipe +class TVPINFO(BasicNewsRecipe): + title = u'TVP.INFO' + __author__ = 'fenuks' + description = u'Serwis informacyjny TVP.INFO' + category = 'news' + language = 'pl' + cover_url = 'http://s.v3.tvp.pl/files/tvp-info/gfx/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + 
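Where hand-written parse_index or keep_only_tags code is removed (as in the tsn and titanic_de changes above), the recipes fall back to calibre's generic article extraction. A minimal sketch of that auto_cleanup style; the title and feed URL are placeholders:

from calibre.web.feeds.news import BasicNewsRecipe

class ExampleAutoCleanupRecipe(BasicNewsRecipe):
    # Hypothetical recipe illustrating the auto_cleanup style.
    title = 'Example auto-cleanup'
    language = 'en'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    auto_cleanup = True  # let calibre locate the article body instead of listing tags

    feeds = [('News', 'http://example.com/rss.xml')]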
no_stylesheets = True + remove_empty_feeds = True + remove_javascript = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + keep_only_tags = [dict(id='contentNews')] + remove_tags = [dict(attrs={'class':['toolbox', 'modulBox read', 'modulBox social', 'videoPlayerBox']}), dict(id='belka')] + feeds = [(u'Wiadomo\u015bci', u'http://tvp.info/informacje?xslt=tvp-info/news/rss.xslt&src_id=191865'), + (u'\u015awiat', u'http://tvp.info/informacje/swiat?xslt=tvp-info/news/rss.xslt&src_id=191867'), (u'Biznes', u'http://tvp.info/informacje/biznes?xslt=tvp-info/news/rss.xslt&src_id=191868'), (u'Nauka', u'http://tvp.info/informacje/nauka?xslt=tvp-info/news/rss.xslt&src_id=191870'), (u'Kultura', u'http://tvp.info/informacje/kultura?xslt=tvp-info/news/rss.xslt&src_id=191869'), (u'Rozmaito\u015bci', u'http://tvp.info/informacje/rozmaitosci?xslt=tvp-info/news/rss.xslt&src_id=191872'), (u'Opinie', u'http://tvp.info/opinie?xslt=tvp-info/news/rss.xslt&src_id=191875'), (u'Komentarze', u'http://tvp.info/opinie/komentarze?xslt=tvp-info/news/rss.xslt&src_id=238200'), (u'Wywiady', u'http://tvp.info/opinie/wywiady?xslt=tvp-info/news/rss.xslt&src_id=236644')] diff --git a/recipes/ubuntu_pomoc_org.recipe b/recipes/ubuntu_pomoc_org.recipe new file mode 100644 index 0000000000..1a78649dfc --- /dev/null +++ b/recipes/ubuntu_pomoc_org.recipe @@ -0,0 +1,22 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class UbuntuPomoc(BasicNewsRecipe): + title = u'Ubuntu-pomoc.org' + __author__ = 'fenuks' + description = u'Strona poświęcona systemowi Ubuntu Linux. Znajdziesz tutaj przydatne i sprawdzone poradniki oraz sposoby rozwiązywania wielu popularnych problemów. Ten blog rozwiąże każdy Twój problem - jeśli nie teraz, to wkrótce! :)' + category = 'Linux, Ubuntu, open source' + language = 'pl' + cover_url = 'http://www.ubuntu-pomoc.org/grafika/ubuntupomoc.png' + preprocess_regexps = [(re.compile(r'

.+', re.IGNORECASE|re.DOTALL), lambda m: '')] + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + use_embedded_content = False + remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'post'})] + remove_tags_after = dict(attrs={'class':'underEntry'}) + remove_tags = [dict(attrs={'class':['underPostTitle', 'yarpp-related', 'underEntry', 'social', 'tags', 'commentlist', 'youtube_sc']}), dict(id=['wp_rp_first', 'commentReply'])] + feeds = [(u'Ca\u0142o\u015b\u0107', u'http://feeds.feedburner.com/Ubuntu-Pomoc'), + (u'Gry', u'http://feeds.feedburner.com/GryUbuntu-pomoc')] diff --git a/recipes/ukraiyns_kii_tizhdien.recipe b/recipes/ukraiyns_kii_tizhdien.recipe new file mode 100644 index 0000000000..3064ebfe55 --- /dev/null +++ b/recipes/ukraiyns_kii_tizhdien.recipe @@ -0,0 +1,13 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1356283265(BasicNewsRecipe): + title = u'\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0438\u0439 \u0422\u0438\u0436\u0434\u0435\u043d\u044c' + __author__ = 'rpalyvoda' + oldest_article = 7 + max_articles_per_feed = 100 + language = 'uk' + cover_url = 'http://tyzhden.ua/Images/Style1/tyzhden.ua-logo2.gif' + masthead_url = 'http://tyzhden.ua/Images/Style1/tyzhden.ua-logo2.gif' + auto_cleanup = True + + feeds = [(u'\u041d\u043e\u0432\u0438\u043d\u0438', u'http://tyzhden.ua/RSS/News/'), (u'\u041e\u0440\u0438\u0433\u0456\u043d\u0430\u043b\u044c\u043d\u0456 \u043d\u043e\u0432\u0438\u043d\u0438', u'http://tyzhden.ua/RSS/News.Original/'), (u'\u041f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u0457', u'http://tyzhden.ua/RSS/Publications/')] diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index 8fdb6eef30..12ffec38f0 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -52,7 +52,7 @@ class USAToday(BasicNewsRecipe): def get_masthead_url(self): masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(masthead) except: diff --git a/recipes/vancouver_provice.recipe b/recipes/vancouver_provice.recipe index 690daefbc2..9bb8085fdc 100644 --- a/recipes/vancouver_provice.recipe +++ b/recipes/vancouver_provice.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/vancouver_province.recipe b/recipes/vancouver_province.recipe index 5687c7c858..0648f29f41 100644 --- a/recipes/vancouver_province.recipe +++ b/recipes/vancouver_province.recipe @@ -101,14 +101,14 @@ class CanWestPaper(BasicNewsRecipe): if self.fp_tag=='': return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 
'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/vancouver_sun.recipe b/recipes/vancouver_sun.recipe index 1cfacb9639..faea272558 100644 --- a/recipes/vancouver_sun.recipe +++ b/recipes/vancouver_sun.recipe @@ -132,14 +132,14 @@ class CanWestPaper(BasicNewsRecipe): def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: diff --git a/recipes/vic_times.recipe b/recipes/vic_times.recipe index 076f4fa8a9..48fb9038aa 100644 --- a/recipes/vic_times.recipe +++ b/recipes/vic_times.recipe @@ -1,114 +1,55 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - import re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup -class CanWestPaper(BasicNewsRecipe): +class TimesColonist(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist title = u'Victoria Times Colonist' url_prefix = 'http://www.timescolonist.com' description = u'News from Victoria, BC' fp_tag = 'CAN_TC' - # un-comment the following four lines for the Vancouver Province -## title = u'Vancouver Province' -## url_prefix = 'http://www.theprovince.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' - - # un-comment the following four lines for the Vancouver Sun -## title = u'Vancouver Sun' -## url_prefix = 'http://www.vancouversun.com' -## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VS' - - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald -## title = u'Calgary Herald' -## url_prefix = 'http://www.calgaryherald.com' -## description = u'News from Calgary, AB' -## fp_tag = 'CAN_CH' - - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' - - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen -## title = u'Ottawa Citizen' -## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' -## fp_tag = 'CAN_OC' - - # un-comment the following 
four lines for the Montreal Gazette -## title = u'Montreal Gazette' -## url_prefix = 'http://www.montrealgazette.com' -## description = u'News from Montreal, QC' -## fp_tag = 'CAN_MG' - - + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' - .timestamp { font-size:xx-small; display: block; } - #storyheader { font-size: medium; } - #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } - .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + .byline { font-size:xx-small; font-weight: bold;} + h3 { margin-bottom: 6px; } + .caption { font-size: xx-small; font-style: italic; font-weight: normal; } + ''' + keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})] remove_tags = [{'class':'comments'}, - dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), - dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), - dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), - dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), - dict(name='div', attrs={'class':'rule_grey_solid'}), - dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + {'id':'photocredit'}, + dict(name='div', attrs={'class':re.compile('top.controls')}), + dict(name='div', attrs={'class':re.compile('social')}), + dict(name='div', attrs={'class':re.compile('tools')}), + dict(name='div', attrs={'class':re.compile('bottom.tools')}), + dict(name='div', attrs={'class':re.compile('window')}), + dict(name='div', attrs={'class':re.compile('related.news.element')})] + def get_cover_url(self): from datetime import timedelta, date - if self.fp_tag=='': - return None cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) daysback=1 try: br.open(cover) except: while daysback<7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' - br = BasicNewsRecipe.get_browser() + br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: @@ -120,6 +61,18 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +119,107 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + def preprocess_html(self,soup): + byline = soup.find('p',attrs={'class':re.compile('ancillary')}) + if byline is not None: + byline.find('a') + authstr = self.tag_to_string(byline,False) + authstr = re.sub('/ *Times 
Colonist','/',authstr, flags=re.IGNORECASE) + authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE) + newdiv = Tag(soup,'div') + newdiv.insert(0,authstr) + newdiv['class']='byline' + byline.replaceWith(newdiv) + for caption in soup.findAll('p',attrs={'class':re.compile('caption')}): + capstr = self.tag_to_string(caption,False) + capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE) + newdiv = Tag(soup,'div') + newdiv.insert(0,capstr) + newdiv['class']='caption' + caption.replaceWith(newdiv) + for ptag in soup.findAll('p'): + ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True) + ptext = re.sub(r'\s+','', ptext) + if (ptext=='') or (ptext==' '): + ptag.extract() return self.strip_anchors(soup) + raeside = False + def handle_articles(self,htag,article_list,sectitle): + atag = htag.a + if atag is not None: + url = atag['href'] + #print("Checking "+url) + if atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + if 'RAESIDE' in title.upper(): + if self.raeside: + return + self.raeside = True + dtag = htag.findNext('p') + description='' + if dtag is not None: + description = self.tag_to_string(dtag,False) + article_list.append(dict(title=title,url=url,date='',description=description,author='',content='')) + #print(sectitle+title+": description = "+description+" URL="+url) + def add_section_index(self,ans,securl,sectitle): + print("Add section url="+self.url_prefix+'/'+securl) + try: + soup = self.index_to_soup(self.url_prefix+'/'+securl) + except: + return ans + mainsoup = soup.find('div',attrs={'class':re.compile('main.content')}) + article_list = [] + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}): + for htag in wdiv.findAll('h3'): + self.handle_articles(htag,article_list,sectitle) + for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}): + for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}): + for htag in wdiv.findAll('h2'): + self.handle_articles(htag,article_list,sectitle) + ans.append((sectitle,article_list)) + return ans def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') - - articles = {} - key = 'News' - ans = ['News'] - - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not 
articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + ans = [] + ans = self.add_section_index(ans,'','Web Front Page') + ans = self.add_section_index(ans,'news/','News Headlines') + ans = self.add_section_index(ans,'news/b-c/','BC News') + ans = self.add_section_index(ans,'news/national/','Natioanl News') + ans = self.add_section_index(ans,'news/world/','World News') + ans = self.add_section_index(ans,'opinion/','Opinion') + ans = self.add_section_index(ans,'opinion/letters/','Letters') + ans = self.add_section_index(ans,'business/','Business') + ans = self.add_section_index(ans,'business/money/','Money') + ans = self.add_section_index(ans,'business/technology/','Technology') + ans = self.add_section_index(ans,'business/working/','Working') + ans = self.add_section_index(ans,'sports/','Sports') + ans = self.add_section_index(ans,'sports/hockey/','Hockey') + ans = self.add_section_index(ans,'sports/football/','Football') + ans = self.add_section_index(ans,'sports/basketball/','Basketball') + ans = self.add_section_index(ans,'sports/golf/','Golf') + ans = self.add_section_index(ans,'entertainment/','entertainment') + ans = self.add_section_index(ans,'entertainment/go/','Go!') + ans = self.add_section_index(ans,'entertainment/music/','Music') + ans = self.add_section_index(ans,'entertainment/books/','Books') + ans = self.add_section_index(ans,'entertainment/Movies/','movies') + ans = self.add_section_index(ans,'entertainment/television/','Television') + ans = self.add_section_index(ans,'life/','Life') + ans = self.add_section_index(ans,'life/health/','Health') + ans = self.add_section_index(ans,'life/travel/','Travel') + ans = self.add_section_index(ans,'life/driving/','Driving') + ans = self.add_section_index(ans,'life/homes/','Homes') + ans = self.add_section_index(ans,'life/food-drink/','Food & Drink') return ans + diff --git a/recipes/vice_magazine_de.recipe b/recipes/vice_magazine_de.recipe new file mode 100644 index 0000000000..c3e1aa8f7d --- /dev/null +++ b/recipes/vice_magazine_de.recipe @@ -0,0 +1,40 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class ViceDERecipe(BasicNewsRecipe): + title = u'Vice Magazin Deutschland' + __author__ = 'atordo;alex' + description = u'Die offizielle Website des Vice Magazins Deutschland' + category = u'Nachrichten, Fotografie, Blogs, Mode, Kunst, Film, Musik, Literatur, Technik' + cover_url = 'http://www.seeklogo.com/images/V/Vice-logo-668578AC94-seeklogo.com.gif' + oldest_article = 14 + max_articles_per_feed = 100 + auto_cleanup = False + no_stylesheets = True + language = 'de' + use_embedded_content = False + remove_javascript = True + publication_type = 'magazine' + + recursions=10 + match_regexps = [r'/read/.*\?Contentpage=[2-9]$'] + + keep_only_tags = [ + dict(attrs={'class':['article_title','article_content','next']}) + ] + remove_tags = [ + dict(attrs={'class':['social_buttons','search','tweet','like','inline_socials' + ,'stumblebadge','plusone']}) + ] + + extra_css = ''' + .author{font-size:small} + img{margin-bottom: 0.4em; display:block; margin-left:auto; margin-right: auto} + ''' + + preprocess_regexps = [ + (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda m: '') ] + # Para sustituir el video incrustado de YouTube por una imagen def 
@@ -108,14 +121,16 @@ class weblogssl(BasicNewsRecipe):
     # To get the original article url from the "feedsportal" one
     # The following code is courtesy of user "bosplans" of www.mobileread.com
-    # http://www.mobileread.com/forums/sho...d.php?t=130297
+    # http://www.mobileread.com/forums/showthread.php?t=130297
     def get_article_url(self, article):
         link = article.get('link', None)
         if link is None:
             return article
+        # if link.split('/')[-4]=="xataka2":
+        #     return article.get('feedburner_origlink', article.get('link', article.get('guid')))
         if link.split('/')[-4]=="xataka2":
-            return article.get('feedburner_origlink', article.get('link', article.get('guid')))
+            return article.get('guid', None)
         if link.split('/')[-1]=="story01.htm":
             link=link.split('/')[-2]
             a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
diff --git a/recipes/windsor_star.recipe b/recipes/windsor_star.recipe
index 8f10cff462..f294674219 100644
--- a/recipes/windsor_star.recipe
+++ b/recipes/windsor_star.recipe
@@ -102,14 +102,14 @@ class CanWestPaper(BasicNewsRecipe):
         if self.fp_tag=='':
             return None
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
-        br = BasicNewsRecipe.get_browser()
+        br = BasicNewsRecipe.get_browser(self)
         daysback=1
         try:
             br.open(cover)
         except:
             while daysback<7:
                 cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
-                br = BasicNewsRecipe.get_browser()
+                br = BasicNewsRecipe.get_browser(self)
                 try:
                     br.open(cover)
                 except:
diff --git a/recipes/winsupersite.recipe b/recipes/winsupersite.recipe
index 910dfd9125..8670ae5928 100644
--- a/recipes/winsupersite.recipe
+++ b/recipes/winsupersite.recipe
@@ -21,7 +21,7 @@ class Winsupersite(BasicNewsRecipe):
                     lambda match: ''),
             ]
     def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
+        br = BasicNewsRecipe.get_browser(self)
         br.open('http://www.winsupersite.com')
         return br
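A note on the recurring get_browser change in the Windsor Star and Winsupersite hunks above: BasicNewsRecipe.get_browser is called through the class here, so the recipe instance now has to be passed in explicitly as self; once get_browser is an ordinary instance method, calling it without the instance no longer works and the browser is not set up from the recipe's own settings. A rough sketch of the usual override pattern, with a placeholder recipe name and URL that are not part of the patch:

    # Sketch: reuse the base class browser, then customise it for this recipe.
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'

        def get_browser(self):
            br = BasicNewsRecipe.get_browser(self)  # call on the instance, not the bare class
            br.open('http://example.com/')          # e.g. pick up cookies before downloads start
            return br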
diff --git a/recipes/wprost.recipe b/recipes/wprost.recipe
index 2adac1e113..90dde251ca 100644
--- a/recipes/wprost.recipe
+++ b/recipes/wprost.recipe
@@ -10,89 +10,89 @@ from calibre.web.feeds.news import BasicNewsRecipe
 import re
 
 class Wprost(BasicNewsRecipe):
-    EDITION = 0
-    FIND_LAST_FULL_ISSUE = True
-    EXCLUDE_LOCKED = True
-    ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
+    EDITION = 0
+    FIND_LAST_FULL_ISSUE = True
+    EXCLUDE_LOCKED = True
+    ICO_BLOCKED = 'http://www.wprost.pl/G/layout2/ico_blocked.png'
+    title = u'Wprost'
+    __author__ = 'matek09'
+    description = 'Weekly magazine'
+    encoding = 'ISO-8859-2'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    recursions = 0
+    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+    remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+    '''
+    keep_only_tags =[]
+    keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))
+    '''
-    title = u'Wprost'
-    __author__ = 'matek09'
-    description = 'Weekly magazine'
-    encoding = 'ISO-8859-2'
-    no_stylesheets = True
-    language = 'pl'
-    remove_javascript = True
-    recursions = 0
-
-    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
-    remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
-
-    '''keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
-
-    preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+    preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
        (re.compile(r'display: block;'), lambda match: ''),
        (re.compile(r'\\\<\/table\>'), lambda match: ''),
        (re.compile(r'\'), lambda match: ''),
        (re.compile(r'\'), lambda match: ''),
        (re.compile(r'\'), lambda match: ''),
-       (re.compile(r'\