Merge branch 'master' of git://github.com/kovidgoyal/calibre

fenuks 2013-10-31 23:46:24 +01:00
commit 0ac211b150
622 changed files with 90279 additions and 20385 deletions

.gitignore

@ -14,7 +14,6 @@ build
dist
docs
resources/localization
resources/images.qrc
resources/scripts.pickle
resources/ebook-convert-complete.pickle
resources/builtin_recipes.xml
@ -42,3 +41,4 @@ calibre_plugins/
recipes/*.mobi
recipes/*.epub
recipes/debug
/.metadata/


@ -20,6 +20,950 @@
# new recipes:
# - title:
- version: 1.8.0
date: 2013-10-25
new features:
- title: "DOCX Input: Support linked (as opposed to embedded) images, if the linked image is found on the local computer."
tickets: [1243597]
- title: 'FB2 Input: Add support for note and cite back references. Link pairs of type="note" and type="cite" now automatically generate the correct back reference.'
tickets: [1243714]
- title: "When automerging books during during an add, include the author as well as the title in the report of merged books."
- title: "OS X Mavericks (10.9) breaks connecting to iTunes and iBooks on iOS devices. For more details see: http://www.mobileread.com/forums/showthread.php?t=215624"
bug fixes:
- title: "OS X: Fix system tray notifications causing crashes on some OS X 10.9 (Mavericks) systems (those that had Growl installed at some point)."
tickets: [1224491]
- title: "OSX: Fix font size in completion popups too small on Mavericks (I hope)"
tickets: [1243761]
- title: "PDF Output: Fix rendering of some semi-transparent images. All semi-transparent images are now rendered using soft masks."
tickets: [1243829]
- title: "MOBI Output: Fix text marked with white-space:pre-wrap causing the Kindle to break lines at arbitrary points inside words."
tickets: [1240235]
- title: "FB2 Input: Fix a regression that broke conversion of FB2 files with paragraphs having both a style and an id attribute."
tickets: [1243709]
- title: "TXT Input: Ensure that <title> in the generated HTML has a meaningful value."
tickets: [1236923]
- title: "Book details panel: Fix HTML in author names and identifiers not being escaped"
tickets: [1243976]
- title: "HTML 5 parsing: Fix handling of xml:lang attributes on all elements xml:lang is now mapped to a plain lang on all elements, not just <html>"
- title: "Update HTML 5 parser used in calibre (html5lib-python) to fix a few corner cases"
- title: "When bulk deleting formats, use a single temporary directory for the deleted files. This makes restoring them from the recycle bin a little cleaner. Also might fix the reported issue with the windows recycle bin choking on creating a large number of folders."
- title: "DOCX Input: Add support for hyperlink fields that have only anchors and not URLs"
- title: "DOCX Input: Fix handling of multiple block level bookmarks at the same location."
tickets: [1241451]
- title: "HTMLZ Output: Fix Htmlz does not apply inline css from <body>."
tickets: [1242261]
- title: "Fix the restore database operation failing on windows installs with long usernames (this would cause the path to the temporary folder used to restore the database to become too long)."
- title: "ODT Input: Various workarounds for broken ODT files generated my mk4ht"
- title: "Fix a bug with non-ascii text in the create catalog dialog"
tickets: [1241515]
improved recipes:
- A List Apart
- version: 1.7.0
date: 2013-10-18
new features:
- title: "Cover grid: Allow using images as the background for the cover grid. To choose an image, go to Preferences->Look & Feel->Cover Grid."
tickets: [1239194]
- title: "An option to mark newly added books with a temporary mark. Option is in Preferences->Adding books."
tickets: [1238609]
- title: "Edit metadata dialog: Allow turning off the cover size displayed in the bottom right corner of the cover by right clicking the cover and choosing 'Hide cover size'. It can be restored the same way."
bug fixes:
- title: "Conversion: If both embed font family and the filter css option to remove fonts are set, do not remove the font specified by the embed font family option."
- title: "Fix a few remaining situations that could cause formats column to show an error message about SHLock"
- title: "Make deleting books to recycle bin more robust. Ensure that the temporary directory created during the move to recycle bin process is not left behind in case of errors."
- title: "Windows: Check if the books' files are in use before deleting"
- title: "Fix custom device driver swap main and card option not working. Also fix swapping not happening for a few devices on linux"
tickets: [1240504]
- title: "Edit metadata dialog: The Edit metadata dialog currently limits its max size based on the geometry of the smallest attached screen. Change that to use the geometry of the screen on which it will be shown."
tickets: [1239597]
- title: "HTMLZ Output: Fix <style> tag placed inside <body> instead of <head>."
tickets: [1239530]
- title: "HTMLZ Output: Fix inline styles not escaping quotes properly."
tickets: [1239527]
- title: "HTMLZ Output: Fix incorrect handling of some self closing tags like <br>."
tickets: [1239555]
- title: "Content server: Fix single item categories not working with reverse proxy setup."
tickets: [1238987]
- title: "Fix a bug that could cause calibre to crash when switching from a large library to a smaller library with marked books."
tickets: [1239210]
- title: "Get Books: Fix downloading of some books in formats that do not have metadata yielding nonsense titles"
- title: "Allow marked book button to be added to main toolbar when device is connected"
tickets: [1239163]
- title: "Fix error if a marked book is deleted/merged."
tickets: [1239161]
- title: "Template language: Fix formatter function days_between to compute the right value when the answer is negative."
- title: "Windows: Fix spurious file in use by other process error if the book's folder contained multiple hard links pointing to the same file"
tickets: [1240788, 1240194]
- title: "Windows: Fix duplicate files being created in very special circumstances when changing title and/or author. (the title or author had to be between 31 and 35 characters long and the book entry had to have been created by a pre 1.x version of calibre). You can check if your library has any such duplicates and remove them, by using the Check Library tool (Right click the calibre button on the toolbar and select Library Maintenance->Check Library)."
improved recipes:
- Wall Street Journal
- Newsweek Polska
- Wired Magazine
- cracked.com
- Television Without Pity
- Carta
- Diagonales
- version: 1.6.0
date: 2013-10-11
new features:
- title: "Temporary marking of books in the library"
description: "This allows you to select books from your calibre library manually and mark them. This 'mark' will remain until you restart calibre, or clear the marks. You can easily work with only the marked subset of books by right clicking the Mark Books button. To use this feature, go to Preferences->Toolbars and add the 'Mark Books' tool to the main toolbar."
type: major
- title: "Get Books: Add Wolne Lektury and Amazon (Canada) ebook stores"
- title: "DOCX Input: Handle hyperlinks in footnotes and endnotes"
tickets: [1232790]
- title: "Driver for Sunstech reader"
tickets: [1231590]
- title: "Allow using both uri: and url: identifiers to create two different arbitrary links instead of just one in the Book details panel"
- title: "E-book viewer: Make all keyboard shortcuts configurable"
tickets: [1232019]
- title: "Conversion: Add an option to not condense CSS rules for margin, padding, border, etc. Option is under the Look & Feel section of the conversion dialog."
tickets: [1233220]
- title: "calibredb: Allow setting of title sort field"
tickets: [1233711]
- title: "ebook-meta: Add an --identifier option to set identifiers."
bug fixes:
- title: "Fix a locking error when composite columns containing formats are used and formats are added/deleted."
tickets: [1233330]
- title: "EPUB Output: Do not strip <object> tags with type application/svg+xml in addition to those that use image/svg+xml."
tickets: [1236845]
- title: "Cover grid: Fix selecting all books with Ctrl+A causing subsequent deselects to not fully work."
tickets: [1236348]
- title: "HTMLZ Output: Fix long titles causing error when converting on windows."
tickets: [1235815]
- title: "Content server: Fix OPDS category links to composite columns"
- title: "E-book viewer: Fix regression that broke import/export of bookmarks"
tickets: [1231980]
- title: "E-book viewer: Use the default font size setting for the dictionary view as well."
tickets: [1232025]
- title: "DOCX Input: Avoid using the value attribute for simple numbered lists, to silence the asinine epubcheck"
- title: "HTML Input: Images linked by the poster attribute of the <video> tag are now recognized and processed."
- title: "DOCX Input: Fix erorr when converting docx files that have numbering defined with no associated character style."
tickets: [1232100]
- title: "EPUB Metadata: Implementing updating identifiers other than isbn in the epub file from calibre when polishing or exporting the epub"
- title: "Amazon metadata download: Fix parsing of some dates on amazon.de"
tickets: [1238125]
improved recipes:
- National Geographic Magazine
- New York Review of Books
- Focus (PL)
- Carta Capital
- AM 730
- Ming Pao (HK)
- Neu Osnabrucker Zeitung
new recipes:
- title: Various Uruguayan news sources
author: Carlos Alves
- version: 1.5.0
date: 2013-09-26
new features:
- title: "Driver for Woxter Scriba"
tickets: [1228690]
- title: "Bulk metadata edit: Allow setting the comments for all selected books and also allow cloning the covers. Cloning covers means that the cover of the first selected book will be set for all other selected books."
tickets: [1230040]
bug fixes:
- title: "Windows: Improved device ejection code. Eject individual drives before trying to eject the device. This fixes incomplete ejection with the Nook devices."
- title: "Catalogs: fix exclude tags rules not working in non-English locales when creating catalogs in EPUB/MOBI formats."
tickets: [1228949]
- title: "Kobo driver: Fix reading status being cleared when connecting to a Kobo with older firmware and metadata management set to automatic."
tickets: [1230018]
- title: "Content server: Sort virtual libraries by name"
tickets: [1229459]
- title: "DOCX Input: Convert tabs in the docx file into non-breaking spaces in the output document. Note that custom tab stops are not supported."
tickets: [1228893]
- title: "Conversion: Handle the style attribute on the <html> tag"
- title: "Handle databases with invalid ratings link tables"
tickets: [1228517]
- title: "DOCX Input: Handle DOCX files with missing referenced styles"
tickets: [1228669]
- title: "Update amazon metadata download plugin for changes to the Amazon website"
improved recipes:
- Slate
- El Universal (VE)
- GoComics
new recipes:
- title:
- version: 1.4.0
date: 2013-09-20
new features:
- title: "Column icons: Allow the use of multiple icons with column icon rules."
description: "You can now have column icon rules display multiple icons in a single column, side by side. There are two ways to do this, either specify multiple icons when creating the rule, or create multiple rules that match the same book and specify the icon type to be 'composed' for every rule. See Preferences->Look & Feel->Column icons for details."
- title: "Kobo driver: Add support for new cover handling in Kobo Aura with updated firmware"
- title: "Driver for Icarus Essence"
tickets: [1226304]
- title: "Show a warning when attempting to copy books between libraries that do not have the same set of custom columns."
tickets: [1225484]
- title: "EPUB/AZW3 Output: Use shorthand forms for margin, padding and border CSS properties, where possible"
- title: "Allow colons in identifier values, needed for using URIs as identifiers"
tickets: [1224885]
- title: "Comments editor: Allow treating arbitrary URLs as images"
- title: "Show full path of library under mouse in status bar when switching/renaming/deleting libraries via the calibre library button."
tickets: [1224925]
- title: "DOCX Input: Add support for embedded EMF images that are just wrappers around an actual raster image."
tickets: [1224849]
bug fixes:
- title: "Conversion: Fix font subsetting not working for large fonts with more than 4000 glyphs, such as CJK fonts"
- title: "HTML Input: Fix a regression that broke processing of HTML files that contain meta tags with dc: namespaced attribute values."
- title: "Fix switching to an empty virtual library not blanking the book details panel"
- title: "Keep position when deleting tags in the tag editor"
tickets: [1226093]
- title: "Book details panel: Render custom comments fields the same as the builtin comments field. In particular this fixes problems caused by wide text and images in custom comments fields."
tickets: [1226350]
- title: "Metadata jackets: Do not error out when using a custom template with some custom fields that are not present."
tickets: [1225357]
- title: "AZW3 Output: Dont choke on invalid (undecodable) links in the input document"
- title: "Cover grid: Respect the double click on library view tweak"
- title: "Fix covers set by drag and drop or pasting in the edit metadata dialog showing compression artifacts due to aggressive jpeg compression"
- title: "Conversion: Fix a bug that could cause incorrect border values to be used when cascading, shorthand border CSS is present."
- title: "Fix regression in 1.3 that caused the book list to not track the current book when using Next/Previous in the edit metadata dialog."
improved recipes:
- Liberation
- Politika
new recipes:
- title: Sage News
author: Brian Hahn
- title: Il Cambiamento
author: ghib9
- version: 1.3.0
date: 2013-09-13
new features:
- title: "When doing searches or switching between virtual libraries in the main book list, preserve the current book. The currently selected book will remain visible if it is present in the results of the search or the selected virtual library."
tickets: [1216713]
- title: "Drivers for Oppo Find 5 and PocketBook Mini 515"
tickets: [1223853]
bug fixes:
- title: "DOCX Input: Handle numbered paragraphs where the numbering is specified in the paragraph style, instead of on the paragraph directly. Also support the use of arbitrary, styled text for bullets."
- title: "DOCX Input: Fix a single line break at the end of a paragraph not being rendered as a blank line."
- title: "DOCX Input: Fix extra top/bottom margins around headings when the heading style in word does not specify any top/bottom margins."
- title: "DOCX Input: Handle images in footnotes and endnotes."
tickets: [1221686]
- title: "ODT Input: Only treat the first image as a cover if it is of suitable size, instead of any image in the document."
tickets: [1224157]
- title: "Book polishing: Do not leave behind the old comments when updating metadata if the comments have been deleted in calibre."
- title: "PDF Output: Fix non-breaking space characters incorrectly rendered in PDF outline."
tickets: [1223862]
- title: "Content server: Fix error in opds feed after using virtual libraries in the main server."
tickets: [1222108]
- title: "Do not scroll the book list horizontally after editing metadata."
tickets: [1221552]
- title: "New database backend: Handle databases that contain multiple tags/series/publishers/etc. that differ only in case."
tickets: [1221545]
improved recipes:
- Harvard Business Review
- Jakarta Post
- Jakarta Globe
- Dilema Veche
- Daily Express
- Anandtech
- High Country News
new recipes:
- title: Caravan Magazine
author: Kovid Goyal
- title: Phys Org
author: Krittika Goyal
- version: 1.2.0
date: 2013-09-06
new features:
- title: "Conversion: Add support for the CSS3 rem font size unit"
- title: "MTP devices, such as Android tablets/phones: Allow ignoring any folder on the device, not just top level folders. For newly connected devices, also scan /Android/data/com.amazon.kindle for books by default (newer versions of the Kindle app place downloaded files there)."
- title: "Speed up sorting when the book list is showing a restricted set of books, such as when the results of a search are displayed or a virtual library is used."
tickets: [1217622]
- title: "Edit metadata dialog: Add an undo option for the Trim cover button."
tickets: [1219227]
bug fixes:
- title: "Amazon metadata download: Update to handle website changes at amazon.com"
- title: "PDF Output: Workaround a bug in the library calibre uses to render HTML to PDF that caused text in some documents that used small-caps fonts to not render correctly."
tickets: [1216354]
- title: "Kobo driver: When a sideloaded kepub is added from a Kobo device to the calibre library, it is added as an epub, but the file copied is corrupt."
tickets: [1221035]
- title: "Fix changing the user interface language in the welcome wizard causing some parts of the interface to remain in the old language until calibre is restarted."
tickets: [1220767]
- title: "Fix regression in 1.0 that broke setting author names with the | character in them."
tickets: [1220348]
- title: "Content server: When running from inside the main calibre program, do not restrict the books shown based on the current virtual library in the main program. If you wish to restrict the books shown in the content server, use Preferences->Sharing over the net."
- title: "Output dates in the local timezone instead of UTC when generating CSV catalogs"
- title: "Library maintenance: When doing a check library instead of dumping the database to SQL and restoring it, run a VACUUM. This works around various bugs in the dump and restore capabilities of apsw."
tickets: [1217988]
- title: "Edit metadata dialog: Fix adding an image to an empty comments block not working"
- title: "Conversion: Fix font declarations with invalid font-family values causing conversion to abort when subsetting is enabled."
- title: "MOBI Output: Fix conversion of some super/sub scripts failling if they are the first or last element in a paragraph."
tickets: [1218858]
- title: "New database backend: Various improvements to make the backend more robust against databases with invalid/corrupt data in them."
tickets: [1218465, 1218783]
improved recipes:
- Countryfile
- version: 1.1.0
date: 2013-08-30
new features:
- title: "Rewrite the HTML metadata parser to make it faster and more robust."
tickets: [1217751]
- title: "Book list: When sorting on a currently unsorted column, use the last applied sort for that column, instead of always sorting in ascending order."
tickets: [1216714]
- title: "PocketBook driver: Scan for books files in the entire device not just in the 'books' folder"
bug fixes:
- title: "Fix a regression in 1.0 that could cause the dates in custom date-type columns to change in some timezones when using the edit metadata dialog to make unrelated changes."
tickets: [1217096]
- title: "When replacing formats in a book with a very long title+authors on windows, calibre could leave behind the old format file, because the filename shortening algorithm has changed. Handle that case."
- title: "Fix content server giving an error if you return to the top level page after using the virtual libraries."
tickets: [1216838]
- title: "Fix calibredb not updating the running calibre instance properly in 1.0"
tickets: [1218177]
- title: "Fix a regression in 1.0 that broke splitting of multiple valued field like tags into many items during a rename."
tickets: [1216699]
- title: "Fix a regression in 1.0 that caused an error when trying to set values for tags with the same item repeated, with different case."
tickets: [1216398]
- title: "Fix a regression that broke downloading news when output format is set to PDF"
- title: "Creating a catalog with an already existing catalog in the library would cause a temporary duplicate entry in the book list. Also fix the author sort for catalogs generated in the AZW3 format not being correct."
- title: "EPUB metadata: When changing the title in an EPUB 2.0 file that has multiple titles, remove the extra titles."
tickets: [1211949]
- title: "Fix a regression in 1.0 that caused Search and Replace in the bulk metadata edit dialog to be much slower than before"
- title: "Fix a regression in 1.0 that caused incorrect sorting and searching on some composite columns (columns built from other columns)."
- title: "Fix a regression in 1.0 that prevented the moving of libraries inside calibre"
tickets: [1216401]
- title: "Virtual Library tabs: If the user activates a hidden tab via the Virtual Library button, change the name of the All Books tab to reflect the hidden virtual library."
tickets: [1216174]
- title: "Ignore text records in the database that are damaged, instead of erroring out. Lets the rest of the data be used."
tickets: [1215981]
- title: "Fix regression that broke calibredb catalog when sorting on the id field."
tickets: [1216090]
- title: "HTML Input: Handle malformed OPF files when converting. "
tickets: [1215924]
- title: "Ensure that the Formats custom column (if present) is updated when a new format is created as a result of a conversion."
tickets: [1215885]
- title: "Fix a bug in 1.0 that broke the Check Library function on computers with non-English locales."
tickets: [1215819]
- title: "Content server: Fix blank username causing error on startup."
tickets: [1215893]
- title: "Fix sorting of book list by multi-valued fields like tags not correct in the new backend."
tickets: [1215820]
improved recipes:
- Daily Mirror
new recipes:
- title: VFR Magazine
author: Krittika Goyal
- version: 1.0.0
date: 2013-08-23
new features:
- title: "A new 'cover grid' view of the books in your calibre library"
description: "Excellent for judging your books by their covers :) To use click the button with the icon of a grid in the bottom right corner of the main window. It can be configured via Preferences->Look & Feel->Cover Grid"
type: major
- title: "A new, faster database backend"
description: "The database backend in calibre has been re-written from scratch. The new code is smaller, more robust and much faster than the old code. The exact speedup will depend on the number of books and number and type of custom columns in your library. Users have reported calibre startup times decreasing by a factor of 2-3 times."
type: major
- title: "For a summary of the major changes in calibre between 0.9 and 1.0, see http://calibre-ebook.com/new-in/ten"
type: major
- title: "RTF Input: Add option to ignore WMF images iinstead of replacing them with a placeholder."
tickets: [1213599]
- title: "Content server: Make virtual libraries available as searches from the start page. They work just like saved searches, clicking on a virtual library will show you all the books in that virtual library."
bug fixes:
- title: "Remove extra, useless 'language' entry in metadata download configuration dialog"
- title: "Kobo driver: Display device collections even if the driver is configured to not manage shelves on the device."
tickets: [1214233]
- title: "Fix typo in calibre.desktop file on linux"
tickets: [1213664]
- title: "Edit metadata dialog: Disable OK button while results are being downloaded."
tickets: [1213397]
- title: "In OS X 10.8 Apple stopped redirecting stdout/stderr to Console.app for applications launched by launch services. Re-enable the redirection, useful for debugging."
- title: "Fix virtual library tabs not being updated when using VL button"
improved recipes:
- Consumerist
- jeuxvideo
- Metro UK
- El Tribuno
- High Country News
- Daily Express
- Providence Journal
- mediapart
new recipes:
- title: News24 and Nuus24
author: Nicki de Wet
- version: 0.9.44
date: 2013-08-16
new features:
- title: "Add an option to display all virtual libraries as tabs above the book list."
description: "Convenient to quickly switch between virtual libraries. To enable, click the Virtual library button and select 'Show virtual libraries as tabs'. You can re-arrange the tabs by drag and drop and close tabs you do not want. Right click on the tabs to restore closed tabs."
- title: "An improved cover trimming algorithm to automatically detect and remove borders and extra space from the edge of cover images. To try it use the 'Trim' button in the edit metadata dialog. This can sometimes remove too much so if you dont like the result, just click cancel. You can make the algorithm more or less aggressive via Preferences->Tweaks"
- title: "Allow customizing the comic metadata reader plugin via Preferences->Plugins to read the series index from either the volume or the issue number of the comic."
tickets: [1211433]
- title: "Linux MTP driver: Add ids for some newer devices."
tickets: [1212458]
- title: "Add a trim cover option to the bulk metadata edit dialog"
- title: "Make the book information dialog user resizable, with a splitter between the cover and the info panel. Also change the background of the cover panel for books that have been marked using the Temp marker plugin."
tickets: [1209057]
- title: "Driver for Samsung Galaxy Young Android phone"
tickets: [1212918]
bug fixes:
- title: "PDF Output: Do not abort conversion if the document being converted has an HTML cover (found in some broken EPUB files)."
- title: "RTF Input: When converting RTF files with no codepage, use the input encoding setting as the codepage."
tickets: [1163572]
improved recipes:
- The Independent
- El Periodica de Aragon
- El Correo
new recipes:
- title: Daily Express
author: Dave Asbury
- version: 0.9.43
date: 2013-08-09
new features:
- title: "TXT Input: Allow using various markdown extensions for more features when converting markdown formatted txt files. See http://pythonhosted.org/Markdown/extensions/index.html for details."
- title: "Sending by email: Allow sending by email to an arbitrary combination of email address. Access it via the 'Select recipients' menu entry in the Email To menu."
tickets: [1207818]
- title: "A new 'Sort By' action for the right click menu. This allows sorting on all columns in the library, not just the visible columns. To use it go to Preferences->Toolbars and add it to 'The context menu for books in the calibre library'"
- title: "Allow adding images into the comments field, by clicking on the insert link button in the comments editor in the edit metadata dialog."
- title: "Allow skipping the confirm bulk reconvert dialog"
- title: "EPUB Input: If the EPUB file identifies an actual cover image in addition to the titlepage html file, use the cover image instead of rendering the titlepage. This is faster and has the advantage that an EPUB to EPUB conversion preserves internal cover structure."
- title: "Get Books: Improve searching by removing punctuation from title/authors before matching."
bug fixes:
- title: "Conversion: Fix empty inline tags that are the second child of a paragraph causing text change location."
tickets: [1207735]
- title: "Fix book count in tooltip of choose library button not updating"
tickets: [1208217]
- title: "Kobo driver: When deleting shelves that have been synced, the Activity entry for the shelf was not being deleted. This left a tile for the shelf on the home screen of the Glo and AuraHD."
tickets: [1208159]
- title: "Comments editor: The Insert Link button has no effect until the user clicks inside the comments box, therefore disable it until it is ready, to prevent confusion."
tickets: [1208073]
- title: "Get Books: Update various Polish store plugins"
improved recipes:
- The Sunday Times UK and The Times Online
- Telegraph UK
- "Le Monde: Edition abonnés"
- The Scotsman
new recipes:
- title: Various French news sources
author: Malah
- title: La Capital de Rosario
author: Darko Miletic
- title: Jot Down
author: desUbiKado
- title: Private Eye
author: Martyn Pritchard
- version: 0.9.42
date: 2013-08-02
new features:
- title: "When downloading metadata from Amazon, convert the amazon categories into tags. You can turn this off by going to Preferences->Metadata download and configuring the Amazon source."
tickets: [1206763]
- title: "Kobo driver: Add an option to modify the styling in books being sent to the device, based on a template on the device."
tickets: [1207151]
- title: "Get Books: Add support for two more Polish ebook stores: cdp.pl and ebooki.allegro.pl"
- title: "calibredb: Add a new clone command to create clones of libraries with the same custom columns, virtual libraries, etc. as the current library."
bug fixes:
- title: "MOBI metadata: Do not fail to set metadata in MOBI files if they have EXTH fields with NULL pointers to a cover or thumbnail."
tickets: [1205757]
- title: "Fix editing of book metadata failing when its timestamp is out of range for the system."
tickets: [1191599]
- title: "Fix renaming a user category to the same name it already has erases the user category."
tickets: [1207131]
- title: "Fix drag 'n drop of cover onto conversion dialog not working"
- title: "Device drivers: Explicitly fsync() all files when writing to devices, to reduce chances of file corruption if the device is disconnected while jobs are running"
- title: "Fix calibre not appearing in Ubuntu's 'Open with..' menu"
tickets: [1207518]
improved recipes:
- PC World
- version: 0.9.41
date: 2013-07-27
new features:
- title: "Add a button to clear the current virtual library easily"
- title: "Driver for Surftab Ventos"
tickets: [1204885]
- title: "Ebook-viewer: Allow re-ordering bookmarks in the bookmarks manager by drag and drop."
bug fixes:
- title: "DOCX Input: Fix conversion breaking for files that use heading style paragraphs to insert line rules"
- title: "Content server: Fix last search query not being fully sanitized in results page"
tickets: [1205385]
- title: "Book polishing: Fix page margins being removed if an unused font was found during subsetting of embedded fonts."
- title: "PDF Output: Do not error out when the input document uses a font that cannot be subset, such as the Symbol font. Instead print a warning and embed the full font."
tickets: [1203449]
- title: "Conversion: Fix a regression in the last release that broke conversion of a few files with comments just before a chapter start."
tickets: [1188635]
improved recipes:
- Something Awful
- Spektrum der Wissenschaft
- mediapart.fr
- Dilbert
- Antyweb
- Scientific American
- taz.de (RSS)
new recipes:
- title: Blindbuch and No names, No jackets
author: Armin Geller
- title: El Tribuno Salta and Jujuy
author: Darko Miletic
- version: 0.9.40
date: 2013-07-19
new features:
- title: "EPUB Output: Add an option to insert an inline Table of Contents into the main text."
tickets: [1201006]
- title: "Driver for LG Android phone"
tickets: [1202013]
- title: "When matching books in the library against the device manually, pre-fill the search field with the book title"
tickets: [1200826]
bug fixes:
- title: "PDF Input: Fix a regression that caused some images to be flipped when converting PDF files that use image rotation operators."
tickets: [1201083]
- title: "Fix regression that caused incorrect font size in dropcaps generated by the DOCX input plugin"
- title: "Get Books: Fix searching for title and author returning some extra matches, if the title starts with an article like the, a or an."
tickets: [1200012]
- title: "PDF Output: Fix extra blank page being inserted at the start of the chapter when converting some epub files from feedbooks"
- title: "PDF Output: Workaround bug in WebKit's getBoundingClientRect() method that could cause links to occasionally point to incorrect locations."
tickets: [1202390]
- title: "E-book viewer: Fix a bug that could cause the reported position to be incorrect immediately after opening a previously opened book. This also fixes the Back button not working if a link is clicked on the page immediately after opening the book."
- title: "Fix memory card not being detected for Elonex 621 on Windows"
- title: "Fix regression in last release that broke auto-conversion of ebooks when sending to device/sending by email."
tickets: [1200864]
- title: "Get Books: Update amazon plugins for website changes"
- title: "Allow using non-ascii chars in email passwords."
tickets: [1202825]
improved recipes:
- Galaxy's Edge
new recipes:
- title: Il Foglio
author: faber1971
- title: Le Monde Diplomatique and Acrimed
author: Gaetan Lehmann
- version: 0.9.39
date: 2013-07-12
new features:
- title: "Bulk metadata edit: Add a checkbox to prevent the refreshing of the book list after the bulk edit. This means that the book list will not be resorted and any existing search/virtual library will not be refreshed. Useful if you have a large library as the refresh can be slow."
- title: "Allow manually marking a book in the calibre library as being on the device. To do so click the device icon in calibre, then right click on the book you want marked and choose 'Match book to library'. Once you are done marking all the books, right click the device icon and choose 'Update cached metadata'"
- title: "Driver for Coby Kyros MID1126"
tickets: [1199410]
- title: "When adding formats to an existing book, by right clicking the add books button, ask for confirmation if some formats will be overwritten."
- title: "Add a tweak to restrict the list of output formats available in the conversion dialog. Go to Preferences->Tweaks to change it."
bug fixes:
- title: "Amazon metadata download: Update plugin to deal with the new amazon.com website"
- title: "Edelweiss metadata download plugin: Workaround for advanced search being broken at the Edelweiss website."
- title: "Invalid data in the device database on sony readers could cause errors when sorting device collections, ignore those errors."
- title: "DOCX Input: Fix no page break being inserted before the last section."
tickets: [1198414]
- title: "Metadata download dialog: Have the OK button enabled in the results screen as well."
tickets: [1198288]
- title: "Get Books: Update empik store plugin"
improved recipes:
- Houston Chronicle
- cracked.com
- mediapart.fr
new recipes:
- title: Glenn Brenwald and Ludwig von Mises Institute
author: anywho
- version: 0.9.38
date: 2013-07-05
new features:
- title: "Book polishing: Add option to embed all referenced fonts when polishing books using the 'Polish Books' tool."
tickets: [1196038]
- title: "DOCX Input: Add support for clickable (hyperlinked) images"
tickets: [1196728]
- title: "DOCX Input: Insert page breaks at the start of every new section"
tickets: [1196728]
- title: "Drivers for Trekstor Pyrus Maxi and PocketBook Surfpad 2"
tickets: [1196931, 1182850]
- title: "DOCX Input: Add support for horizontal rules created by typing three hyphens and pressing enter."
bug fixes:
- title: "Fix detection of SD Card in some PRS-T2N devices"
tickets: [1197970]
- title: "MOBI Input: Fix a regression that broke parsing of MOBI files with malformed markup that also used entities for apostrophes."
tickets: [1197585]
- title: "Get Books: Update Woblink store plugin"
- title: "Metadata download dialog: Prevent the buttons from being re-ordered when the Next button is clicked."
- title: "PDF Output: Fix links that point to URLs with query parameters being mangled by the conversion process."
tickets: [1197006]
- title: "DOCX Input: Fix links pointing to locations in the same document that contain multiple, redundant bookmarks not working."
- title: "EPUB/AZW3 Output: Fix splitting on page-break-after with plain text immediately following the split point causing the text to be added before rather than after the split point."
tickets: [1196728]
- title: "DOCX Input: handle bookmarks defined at the paragraph level"
tickets: [1196728]
- title: "DOCX Input: Handle hyperlinks created as fields"
tickets: [1196728]
improved recipes:
- iprofessional
new recipes:
- title: Democracy Now
author: Antoine Beaupre
- version: 0.9.37
date: 2013-06-28
new features:
- title: "Conversion: Add option to embed all referenced fonts"
type: major
description: "Add an option to embed all fonts that are referenced in the input document but are not already embedded. This will search your system for the referenced font, and if found, the font will be embedded. Only works if the output format supports font embedding (for example: EPUB or AZW3). The option is under the Look & Feel section of the conversion dialog."
- title: "ToC Editor: When generating a ToC from files, if the file has no text, do not skip it. Instead create an entry using the filename of the file."
- title: "AZW3 Input: Add support for the page-progression-direction that is used to indicate page turns should happen from right to left. The attribute is passed into EPUB when converting."
tickets: [1194766]
- title: "ebook-convert: Add a --from-opf option to read metadata from OPF files directly, instead of having to run ebook-meta --from-opf after conversion"
bug fixes:
- title: "PDF Output: Fix Table of Contents being added to the end of the PDF even without the Add Table of Contents option being enabled."
tickets: [1194836]
- title: "When auto-merging books on add, also merge identifiers."
- title: "Fix an error when using the Template Editor to create a template that uses custom columns."
tickets: [1193763]
- title: "LRF Output: Fix &quot; entities in attribute values causing problems"
- title: "News download: Apply the default page margin conversion settings. Also, when converting to PDF, apply the pdf conversion defaults."
tickets: [1193912]
- title: "Fix a regression that broke scanning for books on all devices that used the Aluratek Color driver."
tickets: [1192940]
- title: "fetch-ebbok-metadata: Fix --opf argument erroneously requiring a value"
- title: "When waiting before sending email, log the wait."
tickets: [1195173]
improved recipes:
- taz.de (RSS)
- Miradas al sur
- Frontline
- La Nacion (Costa Rica)
- version: 0.9.36
date: 2013-06-21
new features:
- title: "DOCX Input: Support for Table of Contents created using the Word Table of Contents tool. calibre now first looks for such a Table of Contents and only if one is not found does it generate a ToC from headings."
- title: "DOCX Input: Add support for images used as bullets in lists"
- title: "DOCX Input: If a large image that looks like a cover is present at the start of the document, remove it and use it as the cover of the output ebook. This can be turned off under the DOCX Input section of the conversion dialog."
- title: "When dropping files onto the Book Details panel, ask for confirmation before adding the files to the book. The confirmation can be disabled."
- title: "News download: Add the 'downloaded from' link at the bottom of every article when using a touchscreen output profile (like the Tablet profile)."
- title: "E-book viewer: Change the bookmark button to always popup a menu when clicked, makes accessing existing bookmarks easier."
- title: "After a bulk metadata download, focus the review button on the popup notification, instead of the OK button."
tickets: [1190931]
bug fixes:
- title: "DOCX Input: Hide text that has been marked as not being visible in the web view in Word."
- title: "DOCX Input: When converting docx files with large numbers of unnamed images, do not crash on windows."
tickets: [1191354]
- title: "DOCX Input: Add support for the Word setting 'No space between paragraphs with the same style'."
tickets: [119100]
- title: "MOBI Output: Fix rendering of SVG images that embed large raster images in 64bit calibre installs."
tickets: [1191020]
- title: "HTMLZ Output: Fix handling of images with URL unsafe filenames."
tickets: [1192687]
- title: "Fix unable to change the case of a previously used search because of the search history."
- title: "When searching allow use of uppercase location names, such as AUTHOR instead of author, automatically lowercasing them."
tickets: [1192785]
- title: "DOCX metadata: When reading covers from DOCX files use the first image as specified in the actual markup instead of just the first image in the container."
- title: "Kobo driver: Fix a regression when deleting empty shelves on Kobo devices with older firmware."
tickets: [1192441]
- title: "Do not show builtin plugins in the get new plugins dialog If a builtin plugin with the same name as a third party plugin exists, then the builtin plagin was displayed in the get new plugins dialog as installed (happened with the new DOCX Input plugin)."
- title: "Apple driver: When in synchronous mode (direct to iBooks), disable PDF transfers, as we can't update metadata in iTunes. Not sure when this started, but as of iTunes 11.0.4 it's broken."
- title: "Get Books: Fix error when using internal browser on some systems"
tickets: [1191199]
improved recipes:
- The Walrus Mag
- Various Polish news sources
new recipes:
- title: Various Polish news sources
author: fenuks
- version: 0.9.35
date: 2013-06-14


@ -24,3 +24,10 @@ Development
A [tarball of the source code](http://status.calibre-ebook.com/dist/src) for the
current calibre release.
Bugs
------
Bug reports and feature requests should be made in the calibre bug tracker at [launchpad](https://bugs.launchpad.net/calibre).
The GitHub bug tracker is only for people contributing code to calibre.

imgsrc/marked.svg (new file)

@ -0,0 +1,162 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="128"
height="128"
id="svg2"
version="1.1"
inkscape:version="0.48.4 r9939"
sodipodi:docname="marked.svg"
inkscape:export-filename="/home/kovid/work/calibre/resources/images/marked.png"
inkscape:export-xdpi="90"
inkscape:export-ydpi="90">
<title
id="title3847">Pushpin Icon</title>
<defs
id="defs4">
<linearGradient
id="linearGradient3782">
<stop
style="stop-color:#000000;stop-opacity:1;"
offset="0"
id="stop3784" />
<stop
style="stop-color:#c3c3c0;stop-opacity:1;"
offset="1"
id="stop3786" />
</linearGradient>
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient3782"
id="linearGradient3813"
gradientUnits="userSpaceOnUse"
gradientTransform="matrix(0.70710678,-0.70710678,0.70710678,0.70710678,-18.805519,996.21376)"
x1="58"
y1="91"
x2="73"
y2="91" />
<filter
id="filter3014"
inkscape:label="Ridged border"
inkscape:menu="Bevels"
inkscape:menu-tooltip="Ridged border with inner bevel"
color-interpolation-filters="sRGB">
<feMorphology
id="feMorphology3016"
radius="4.3"
in="SourceAlpha"
result="result91" />
<feComposite
id="feComposite3018"
in2="result91"
operator="out"
in="SourceGraphic" />
<feGaussianBlur
id="feGaussianBlur3020"
result="result0"
stdDeviation="1.2" />
<feDiffuseLighting
id="feDiffuseLighting3022"
diffuseConstant="1"
result="result92">
<feDistantLight
id="feDistantLight3024"
elevation="66"
azimuth="225" />
</feDiffuseLighting>
<feBlend
id="feBlend3026"
in2="SourceGraphic"
mode="multiply"
result="result93" />
<feComposite
id="feComposite3028"
in2="SourceAlpha"
operator="in" />
</filter>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="5.6568542"
inkscape:cx="30.580486"
inkscape:cy="63.624717"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="true"
inkscape:snap-smooth-nodes="false"
inkscape:window-width="1920"
inkscape:window-height="1058"
inkscape:window-x="0"
inkscape:window-y="22"
inkscape:window-maximized="0"
inkscape:snap-bbox="false"
inkscape:object-paths="true"
inkscape:snap-midpoints="false"
inkscape:snap-global="true">
<inkscape:grid
empspacing="5"
visible="true"
enabled="true"
snapvisiblegridlinesonly="true"
type="xygrid"
id="grid2985" />
</sodipodi:namedview>
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title>Pushpin Icon</dc:title>
<dc:creator>
<cc:Agent>
<dc:title>Kovid Goyal</dc:title>
</cc:Agent>
</dc:creator>
<dc:rights>
<cc:Agent>
<dc:title>Public domain</dc:title>
</cc:Agent>
</dc:rights>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(0,-924.36218)">
<path
style="fill:#f39509;fill-opacity:1;stroke:#7a6822;stroke-opacity:1;stroke-width:0;stroke-miterlimit:4;stroke-dasharray:none;filter:url(#filter3014)"
d="m 1.9128912,974.70018 49.4974748,-49.49747 -7.071068,21.2132 31.819805,17.67767 24.433067,-3.85121 -63.639613,63.63963 3.851207,-24.43308 -17.677669,-31.81981 z"
id="path3088"
inkscape:connector-curvature="0"
sodipodi:nodetypes="ccccccccc"
inkscape:export-xdpi="90"
inkscape:export-ydpi="90" />
<path
style="fill:url(#linearGradient3813);fill-opacity:1;stroke:none"
d="M 63.925974,996.92087 120,1042.5389 74.532576,986.31427"
id="path3097"
inkscape:connector-curvature="0"
sodipodi:nodetypes="ccc"
inkscape:export-xdpi="90"
inkscape:export-ydpi="90" />
</g>
</svg>

imgsrc/tweak.svg (new file; diff suppressed because it is too large)


@ -537,25 +537,38 @@ Set the :guilabel:`Level 1 TOC` setting to ``//h:h2``. Then, for chapter two, |a
How options are set/saved for Conversion
-------------------------------------------
There are two places where conversion options can be set in |app|. The first is
in Preferences->Conversion. These settings are the defaults for the conversion
options. Whenever you try to convert a new book, the settings set here will be
used by default.
You can also change settings in the conversion dialog for each book conversion.
When you convert a book, |app| remembers the settings you used for that book,
so that if you convert it again, the saved settings for the individual book
will take precedence over the defaults set in Preferences. You can restore the
individual settings to defaults by using the Restore to defaults button in the
individual book conversion dialog. You can remove the saved settings for a
group of books by selecting all the books and then clicking the edit metadata
button to bring up the bulk metadata edit dialog; near the bottom of the dialog
is an option to remove stored conversion settings.
When you Bulk Convert a set of books, settings are taken in the following order (last one wins):
* From the defaults set in Preferences->Conversion
* From the saved conversion settings for each book being converted (if any). This can be turned off by the option in the top left corner of the Bulk Conversion dialog.
* From the settings set in the Bulk conversion dialog
Note that the final settings for each book in a Bulk Conversion will be saved
and re-used if the book is converted again. Since the highest priority in Bulk
Conversion is given to the settings in the Bulk Conversion dialog, these will
override any book specific settings. So you should only bulk convert books
together that need similar settings. The exceptions are metadata and input
format specific settings. Since the Bulk Conversion dialog does not have
settings for these two categories, they will be taken from book specific
settings (if any) or the defaults.
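
The "last one wins" ordering above can be pictured as a series of dictionary
updates. The following is only an illustrative sketch (the setting names and
values are invented), not |app|'s actual conversion code::

    # Invented setting names, purely to illustrate the precedence order.
    defaults = {'base_font_size': 0, 'margin_top': 5.0}   # Preferences->Conversion
    per_book = {'base_font_size': 12}                     # saved settings for this book, if any
    bulk_dialog = {'margin_top': 0.0}                     # choices made in the Bulk Conversion dialog

    effective = {}
    for source in (defaults, per_book, bulk_dialog):      # later sources override earlier ones
        effective.update(source)

    print(effective)  # {'base_font_size': 12, 'margin_top': 0.0}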
.. note::
@ -772,9 +785,11 @@ size. By default, |app| uses a page size defined by the current
:guilabel:`Output profile`. So if your output profile is set to Kindle, |app|
will create a PDF with page size suitable for viewing on the small kindle
screen. However, if you view this PDF file on a computer screen, then it will
appear to have too large fonts. To create "normal" sized PDFs, use the
:guilabel:`Override page size` option under :guilabel:`PDF Output` in the conversion dialog.
Headers and Footers
^^^^^^^^^^^^^^^^^^^^
You can insert arbitrary headers and footers on each page of the PDF by
specifying header and footer templates. Templates are just snippets of HTML
code that get rendered in the header and footer locations. For example, to
@ -813,6 +828,9 @@ the page will be used.
bottom margins to large enough values, under the Page Setup section of the
conversion dialog.
Printable Table of Contents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can also insert a printable Table of Contents at the end of the PDF that
lists the page numbers for every section. This is very useful if you intend to
print out the PDF to paper. If you wish to use the PDF on an electronic device,


@ -92,6 +92,11 @@ The first thing to note is that this zip file has a lot more files in it, explai
**about.txt**
A text file with information about the plugin
**translations**
A folder containing .mo files with the translations of the user
interface of your plugin into different languages. See below for
details.
Now let's look at the code.
__init__.py
@ -175,6 +180,42 @@ You can see the ``prefs`` object being used in main.py:
.. literalinclude:: plugin_examples/interface_demo/main.py
:pyobject: DemoDialog.config
Adding translations to your plugin
--------------------------------------
You can have all the user interface strings in your plugin translated and
displayed in whatever language is set for the main calibre user interface.
The first step is to go through your plugin's source code and mark all user
visible strings as translatable, by surrounding them in _(). For example::
action_spec = (_('My plugin'), None, _('My plugin is cool'), None)
Then use some program to generate .po files from your plugin source code. There
should be one .po file for every language you want to translate into. For
example: de.po for German, fr.po for French and so on. You can use the
`poedit <http://www.poedit.net/>`_ program for this.
Send these .po files to your translators. Once you get them back, compile them
into .mo files. You can again use poedit for that, or just do::
calibre-debug -c "from calibre.translations.msgfmt import main; main()" filename.po
Put the .mo files into the ``translations`` folder in your plugin.
The last step is to simply call the function ``load_translations()`` at the top
of your plugin's .py files. For performance reasons you should only call this
function in those .py files that actually have translatable strings. So in a
typical User Interface plugin you would call it at the top of ``ui.py`` but not
``__init__.py``.
You can test the translations of your plugins by changing the user interface
language in calibre under Preferences->Look & Feel or by running calibre like
this::
CALIBRE_OVERRIDE_LANG=de calibre
Replace ``de`` with the language code of the language you want to test.
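
Putting the steps above together, a minimal ``ui.py`` for a User Interface
plugin could look like the sketch below. The class name and the displayed
strings are invented for illustration; ``load_translations()`` and ``_()`` are
simply called at the top level of the file, as described above::

    from calibre.gui2.actions import InterfaceAction

    # Load the compiled .mo catalogs from this plugin's translations/ folder,
    # so that _() returns strings in the current calibre interface language.
    load_translations()

    class DemoAction(InterfaceAction):
        name = 'Demo Plugin'
        # Strings wrapped in _() are looked up in the .mo catalogs at runtime.
        action_spec = (_('Demo Plugin'), None, _('A short description of the demo plugin'), None)

        def genesis(self):
            self.qaction.triggered.connect(self.greet)

        def greet(self):
            from calibre.gui2 import info_dialog
            info_dialog(self.gui, _('Hello'), _('Hello from the demo plugin'), show=True)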
The plugin API
--------------------------------


@ -83,7 +83,6 @@ def generate_calibredb_help(preamble, info):
global_options = '\n'.join(render_options('calibredb', groups, False, False))
lines, toc = [], []
for cmd in COMMANDS:
args = []
@ -99,7 +98,7 @@ def generate_calibredb_help(preamble, info):
usage = [i for i in usage.replace('%prog', 'calibredb').splitlines()]
cmdline = ' '+usage[0]
usage = usage[1:]
usage = [i.replace(cmd, ':command:`%s`'%cmd) for i in usage]
usage = [re.sub(r'(%s)([^a-zA-Z0-9])'%cmd, r':command:`\1`\2', i) for i in usage]
lines += ['.. code-block:: none', '', cmdline, '']
lines += usage
groups = [(None, None, parser.option_list)]
@ -152,7 +151,6 @@ def generate_ebook_convert_help(preamble, info):
prog = 'ebook-convert-'+(pl.name.lower().replace(' ', '-'))
raw += '\n\n' + '\n'.join(render_options(prog, groups, False, True))
update_cli_doc(os.path.join('cli', 'ebook-convert.rst'), raw, info)
def update_cli_doc(path, raw, info):
@ -200,7 +198,8 @@ def cli_docs(app):
for script in entry_points['console_scripts'] + entry_points['gui_scripts']:
module = script[script.index('=')+1:script.index(':')].strip()
cmd = script[:script.index('=')].strip()
if cmd in ('calibre-complete', 'calibre-parallel'): continue
if cmd in ('calibre-complete', 'calibre-parallel'):
continue
module = __import__(module, fromlist=[module.split('.')[-1]])
if hasattr(module, 'option_parser'):
documented_cmds.append((cmd, getattr(module, 'option_parser')()))
@ -260,3 +259,4 @@ def setup(app):
def finished(app, exception):
pass


@ -30,10 +30,13 @@ Environment variables
* ``CALIBRE_OVERRIDE_DATABASE_PATH`` - allows you to specify the full path to metadata.db. Using this variable you can have metadata.db be in a location other than the library folder. Useful if your library folder is on a networked drive that does not support file locking.
* ``CALIBRE_DEVELOP_FROM`` - Used to run from a calibre development environment. See :ref:`develop`.
* ``CALIBRE_OVERRIDE_LANG`` - Used to force the language used by the interface (ISO 639 language code)
* ``CALIBRE_NO_NATIVE_FILEDIALOGS`` - Causes calibre to not use native file dialogs for selecting files/directories. Set it to 1 to enable.
* ``SYSFS_PATH`` - Use if sysfs is mounted somewhere other than /sys
* ``http_proxy`` - Used on linux to specify an HTTP proxy
See `How to set environment variables in windows <http://www.computerhope.com/issues/ch000549.htm>`_ or
`How to set environment variables in OS X <http://blog.dowdandassociates.com/content/howto-set-an-environment-variable-in-mac-os-x-home-slash-dot-macosx-slash-environment-dot-plist/>`_.
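
For example, to run |app| with its metadata.db kept outside the library folder
and with a German interface, you could set the relevant variables before
launching it. A small Python sketch (the paths are examples only, and the
``calibre`` launcher is assumed to be on your PATH)::

    import os
    import subprocess

    env = dict(os.environ)
    # Example path only -- point this at wherever your metadata.db actually lives.
    env['CALIBRE_OVERRIDE_DATABASE_PATH'] = '/mnt/nas/calibre/metadata.db'
    # Force the interface language for this run.
    env['CALIBRE_OVERRIDE_LANG'] = 'de'

    subprocess.Popen(['calibre'], env=env)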
Tweaks
------------
@ -46,17 +49,31 @@ The default values for the tweaks are reproduced below
Overriding icons, templates, et cetera
----------------------------------------
|app| allows you to override the static resources, like icons, javascript and
templates for the metadata jacket, catalogs, etc. with customized versions that
you like. All static resources are stored in the resources sub-folder of the
calibre install location. On Windows, this is usually :file:`C:/Program Files/Calibre2/resources`.
On OS X, :file:`/Applications/calibre.app/Contents/Resources/resources/`. On linux, if
you are using the binary installer from the calibre website it will be
:file:`/opt/calibre/resources`. These paths can change depending on where you
choose to install |app|.
You should not change the files in this resources folder, as your changes will
get overwritten the next time you update |app|. Instead, go to
:guilabel:`Preferences->Advanced->Miscellaneous` and click
:guilabel:`Open calibre configuration directory`. In this configuration directory, create a
sub-folder called resources and place the files you want to override in it.
Place the files in the appropriate sub folders, for example place images in
:file:`resources/images`, etc. |app| will automatically use your custom file
in preference to the built-in one the next time it is started.
For example, if you wanted to change the icon for the :guilabel:`Remove books` action, you would first look in the built-in resources folder and see that the relevant file is
:file:`resources/images/trash.png`. Assuming you have an alternate icon in PNG format called :file:`mytrash.png` you would save it in the configuration directory as :file:`resources/images/trash.png`. All the icons used by the calibre user interface are in :file:`resources/images` and its sub-folders.
For example, if you wanted to change the icon for the :guilabel:`Remove books`
action, you would first look in the built-in resources folder and see that the
relevant file is :file:`resources/images/trash.png`. Assuming you have an
alternate icon in PNG format called :file:`mytrash.png` you would save it in
the configuration directory as :file:`resources/images/trash.png`. All the
icons used by the calibre user interface are in :file:`resources/images` and
its sub-folders.
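If you override more than one or two files, a tiny script can do the copying for you. The snippet below is only a sketch: it assumes a Windows system where the |app| configuration directory is :file:`%APPDATA%\\calibre` (use :guilabel:`Open calibre configuration directory` to confirm yours) and that :file:`mytrash.png` sits in the folder you run the script from::

    import os, shutil

    config_dir = os.path.join(os.environ['APPDATA'], 'calibre')  # assumed location
    dest = os.path.join(config_dir, 'resources', 'images')
    if not os.path.isdir(dest):
        os.makedirs(dest)
    shutil.copyfile('mytrash.png', os.path.join(dest, 'trash.png'))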
Customizing |app| with plugins
--------------------------------

View File

@ -49,7 +49,7 @@ All the |app| python code is in the ``calibre`` package. This package contains t
* Metadata reading, writing, and downloading is all in ``ebooks.metadata``
* Conversion happens in a pipeline, for the structure of the pipeline,
see :ref:`conversion-introduction`. The pipeline consists of an input
plugin, various transforms and an output plugin. The that code constructs
plugin, various transforms and an output plugin. The code that constructs
and drives the pipeline is in :file:`plumber.py`. The pipeline works on a
representation of an ebook that is like an unzipped epub, with
manifest, spine, toc, guide, html content, etc. The

View File

@ -499,11 +499,17 @@ that allows you to create collections on your Kindle from the |app| metadata. It
I am getting an error when I try to use |app| with my Kobo Touch/Glo/etc.?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Kobo has very buggy firmware. Connecting to it has been known to fail at random. Certain combinations of motherboard, USB ports/cables/hubs can exacerbate this tendency to fail. If you are getting an error when connecting to your touch with |app| try the following, each of which has solved the problem for *some* |app| users.
The Kobo has very buggy firmware. Connecting to it has been known to fail at
random. Certain combinations of motherboard, USB ports/cables/hubs can
exacerbate this tendency to fail. If you are getting an error when connecting
to your touch with |app| try the following, each of which has solved the
problem for *some* |app| users.
* Connect the Kobo directly to your computer, not via USB Hub
* Try a different USB cable and a different USB port on your computer
* Try a different computer (preferably an older model)
* Try a different computer, in particular the Kobo does not work well with
some Windows XP machines. If you are on Windows XP, try a computer with a
newer version of windows.
* Try upgrading the firmware on your Kobo Touch to the latest
* Try resetting the Kobo (sometimes this cures the problem for a little while, but then it re-appears, in which case you have to reset again and again)
* Try only putting one or two books onto the Kobo at a time and do not keep large collections on the Kobo
@ -622,13 +628,29 @@ should fix by hand.
The list of books in |app| is blank!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order to understand why that happened, you have to understand what a |app| library is. At the most basic level, a |app| library is just a folder. Whenever you add a book to |app|, that book's files are copied into this folder (arranged into sub folders by author and title). Inside the |app| library folder, at the top level, you will see a file called metadata.db. This file is where |app| stores the metadata like title/author/rating/tags etc. for *every* book in your |app| library. The list of books that |app| displays is created by reading the contents of this metadata.db file.
In order to understand why that happened, you have to understand what a |app|
library is. At the most basic level, a |app| library is just a folder. Whenever
you add a book to |app|, that book's files are copied into this folder
(arranged into sub folders by author and title). Inside the |app| library
folder, at the top level, you will see a file called metadata.db. This file is
where |app| stores the metadata like title/author/rating/tags etc. for *every*
book in your |app| library. The list of books that |app| displays is created by
reading the contents of this metadata.db file.
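Incidentally, metadata.db is a plain SQLite database, so you can verify for yourself that the metadata lives there. The snippet below is only an illustration -- the library path is made up, and you should only ever inspect a *copy* of the file, never the one inside a live library::

    import sqlite3

    db = sqlite3.connect('/path/to/copy/of/metadata.db')  # adjust to your copy
    for title, author_sort in db.execute(
            'SELECT title, author_sort FROM books LIMIT 5'):
        print(title, '-', author_sort)
    db.close()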
There can be two reasons why |app| is showing an empty list of books:
* Your |app| library folder changed its location. This can happen if it was on an external disk and the drive letter for that disk changed. Or if you accidentally moved the folder. In this case, |app| cannot find its library and so starts up with an empty library instead. To remedy this, do a right-click on the |app| icon in the |app| toolbar (it will say 0 books underneath it) and select Switch/create library. Click the little blue icon to select the new location of your |app| library and click OK.
* Your |app| library folder changed its location. This can happen if it was
on an external disk and the drive letter for that disk changed. Or if you
accidentally moved the folder. In this case, |app| cannot find its library
and so starts up with an empty library instead. To remedy this, do a
right-click on the |app| icon in the |app| toolbar and select Switch/create
library. Click the little blue icon to select the new location of your
|app| library and click OK.
* Your metadata.db file was deleted/corrupted. In this case, you can ask |app| to rebuild the metadata.db from its backups. Right click the |app| icon in the |app| toolbar (it will say 0 books underneath it) and select Library maintenance->Restore database. |app| will automatically rebuild metadata.db.
* Your metadata.db file was deleted/corrupted. In this case, you can ask
|app| to rebuild the metadata.db from its backups. Right click the |app|
icon in the |app| toolbar and select Library maintenance->Restore database.
|app| will automatically rebuild metadata.db.
I am getting errors with my calibre library on a networked drive/NAS?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -700,8 +722,14 @@ Take your pick:
|app| is pronounced as cal-i-ber *not* ca-li-bre. If you're wondering, |app| is the British/commonwealth spelling for caliber. Being Indian, that's the natural spelling for me.
Why does |app| show only some of my fonts on OS X?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|app| embeds fonts in ebook files it creates. Ebook files support embedding only TrueType (.ttf) fonts. Most fonts on OS X systems are in .dfont format, thus they cannot be embedded. |app| shows only TrueType fonts found on your system. You can obtain many TrueType fonts on the web. Simply download the .ttf files and add them to the Library/Fonts directory in your home directory.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|app| embeds fonts in ebook files it creates. Ebook files support embedding
only TrueType and OpenType (.ttf and .otf) fonts. Most fonts on OS X systems
are in .dfont format, thus they cannot be embedded. |app| shows only TrueType
and OpenType fonts found on your system. You can obtain many such fonts on the
web. Simply download the .ttf/.otf files and add them to the Library/Fonts
directory in your home directory.
|app| is not starting on Windows?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -763,6 +791,13 @@ There are several possible things I know of, that can cause this:
that prevent 64-bit |app| from working properly. If you are using the 64-bit
version of |app| try switching to the 32-bit version.
* If the crashes happen specifically when you are using a file open dialog,
like clicking on the Add Books button or the Save to Disk button, then
you may have an issue with the windows file open dialogs on your
computer. You can tell calibre to use its own file open dialogs by
setting the environment variable ``CALIBRE_NO_NATIVE_FILEDIALOGS=1``.
See `How to set environment variables in windows <http://www.computerhope.com/issues/ch000549.htm>`_.
If none of the above apply to you, then there is some other program on your
computer that is interfering with |app|. First reboot your computer in safe
mode, to have as few running programs as possible, and see if the crashes still
@ -776,6 +811,31 @@ The only way to find the culprit is to eliminate the programs one by one and
see which one is causing the issue. Basically, stop a program, run calibre,
check for crashes. If they still happen, stop another program and repeat.
Using the viewer or doing any conversions results in a permission denied error on windows
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Something on your computer is preventing calibre from accessing its own
temporary files. Most likely the permissions on your Temp folder are incorrect.
Go to the folder :file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows
Explorer and then right click on the :file:`Temp` folder, select Properties and go to
the Security tab. Make sure that your user account has full control for this
folder.
Some users have reported that running the following command in an Administrator
Command Prompt fixed their permissions. To get an Administrator Command Prompt
search for cmd.exe in the start menu, then right click on the command prompt
entry and select Run as Administrator. At the command prompt type the following
command and press Enter::
icacls "%appdata%\..\Local\Temp" /reset /T
Alternately, you can run calibre as Administrator, but doing so will cause
some functionality, such as drag and drop, to not work.
Finally, some users have reported that disabling UAC fixes the problem.
|app| is not starting on OS X?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -816,9 +876,10 @@ My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The first thing to check is that you are downloading |app| from the official
website: `<http://calibre-ebook.com/download>`_. |app| is a very popular program
and unscrupulous people try to setup websites offering it for download to fool
the unwary.
website: `<http://calibre-ebook.com/download>`_. Make sure you are clicking the
download links on the left, not the advertisements on the right. |app| is a
very popular program and unscrupulous people try to set up websites offering it
for download to fool the unwary.
If you have the official download and your antivirus program is still claiming
|app| is a virus, then your antivirus program is wrong. Antivirus programs use
@ -880,10 +941,25 @@ Why doesn't |app| have an automatic update?
For many reasons:
* *There is no need to update every week*. If you are happy with how |app| works turn off the update notification and be on your merry way. Check back to see if you want to update once a year or so.
* Pre downloading the updates for all users in the background would require about 80TB of bandwidth *every week*. That costs thousands of dollars a month. And |app| is currently growing at 300,000 new users every month.
* If I implement a dialog that downloads the update and launches it, instead of going to the website as it does now, that would save the most ardent |app| updater, *at most five clicks a week*. There are far higher priority things to do in |app| development.
* If you really, really hate downloading |app| every week but still want to be up to the latest, I encourage you to run from source, which makes updating trivial. Instructions are :ref:`available here <develop>`.
* *There is no need to update every week*. If you are happy with how |app|
works turn off the update notification and be on your merry way. Check back
to see if you want to update once a year or so. There is a check box to
turn off the update notification, on the update notification itself.
* |app| downloads currently use `about 100TB of bandwidth a month
<http://status.calibre-ebook.com/downloads>`_. Implementing automatic
updates would greatly increase that and end up costing thousands of dollars
a month, which someone has to pay. And |app| is currently growing at `half
a million new installs a month <https://status.calibre-ebook.com>`_.
* If I implement a dialog that downloads the update and launches it, instead
of going to the website as it does now, that would save the most ardent
|app| updater, *at most five clicks a week*. There are far higher priority
things to do in |app| development.
* If you really, really hate downloading |app| every week but still want to
be up to the latest, I encourage you to run from source, which makes
updating trivial. Instructions are :ref:`available here <develop>`.
How is |app| licensed?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -62,7 +62,13 @@ Add books
The :guilabel:`Add books` action can read metadata from a wide variety of ebook formats. In addition, it tries to guess metadata from the filename.
See the :ref:`config_filename_metadata` section, to learn how to configure this.
To add an additional format for an existing book use the :ref:`edit_meta_information` action.
To add an additional format for an existing book you can do any of three things:
1. Drag and drop the file onto the book details panel on the right side of the main window
2. Right click the Add books button and choose :guilabel:`Add files to selected books`.
3. Click the red add books button in the top right area of the :guilabel:`Edit Metadata` dialog, accessed by the :ref:`edit_meta_information` action.
.. _edit_meta_information:
@ -593,6 +599,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
- Toggle Book Details panel
* - :kbd:`Alt+Shift+T`
- Toggle Tag Browser
* - :kbd:`Alt+Shift+G`
- Toggle Cover Grid
* - :kbd:`Alt+A`
- Show books by the same author as the current book
* - :kbd:`Alt+T`

View File

@ -38,6 +38,8 @@ Sections
glossary
.. REMOVE_IN_PDF
The main |app| user interface
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -1,4 +1,3 @@
.. include:: global.rst
.. _regexptutorial:
@ -26,7 +25,7 @@ There are a few places |app| uses regular expressions. There's the Search & Repl
What on earth *is* a regular expression?
------------------------------------------------
A regular expression is a way to describe sets of strings. A single regular expression cat *match* a number of different strings. This is what makes regular expression so powerful -- they are a concise way of describing a potentially large number of variations.
A regular expression is a way to describe sets of strings. A single regular expression can *match* a number of different strings. This is what makes regular expressions so powerful -- they are a concise way of describing a potentially large number of variations.
.. note:: I'm using string here in the sense it is used in programming languages: a string of one or more characters, characters including actual characters, numbers, punctuation and so-called whitespace (linebreaks, tabulators etc.). Please note that generally, uppercase and lowercase characters are not considered the same, thus "a" being a different character from "A" and so forth. In |app|, regular expressions are case insensitive in the search bar, but not in the conversion options. There's a way to make every regular expression case insensitive, but we'll discuss that later. It gets complicated because regular expressions allow for variations in the strings it matches, so one expression can match multiple strings, which is why people bother using them at all. More on that in a bit.
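Here is a tiny, self-contained Python sketch (the pattern and the strings are invented purely for illustration) showing how one expression matches several different strings::

    import re

    pattern = re.compile(r'gr[ae]y \d+')
    for text in ('gray 1', 'grey 42', 'green 3'):
        print(text, '->', bool(pattern.match(text)))
    # The single expression matches "gray 1" and "grey 42", but not "green 3".

We will unpack the syntax used above as we go along; the point here is simply that one concise pattern stands for a whole family of strings.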

Binary file not shown (image, 2.1 KiB)

View File

@ -104,7 +104,7 @@ Save this adapter as :file:`calibre-wsgi-adpater.py` somewhere your server will
Let's suppose that we want to use WSGI in Apache. First enable WSGI in Apache by adding the following to :file:`httpd.conf`::
LoadModule proxy_module modules/mod_wsgi.so
LoadModule wsgi_module modules/mod_wsgi.so
The exact technique for enabling the wsgi module will vary depending on your Apache installation. Once you have the proxy modules enabled, add the following rules to httpd.conf (or, if you are using virtual hosts, to the conf file for the virtual host in question)::

View File

@ -16,16 +16,13 @@
<div class="body">
{% if not embedded %}
<div id="ad-container" style="text-align:center">
<script type="text/javascript"><!--
google_ad_client = "ca-pub-5939552585043235";
/* User Manual horizontal */
google_ad_slot = "7580893187";
google_ad_width = 728;
google_ad_height = 90;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
<script async="async" src="http://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<ins class="adsbygoogle"
style="display:inline-block;width:728px;height:90px"
data-ad-client="ca-pub-5939552585043235"
data-ad-slot="7580893187"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
</div>
{% endif %}
@ -62,7 +59,7 @@
<form action="https://www.paypal.com/cgi-bin/webscr" method="post" title="Contribute to support calibre development">
<input type="hidden" name="cmd" value="_s-xclick" />
<input type="hidden" name="hosted_button_id" value="AF4H3B8QVDG6N" />
<input type="image" src="http://manual.calibre-ebook.com/simple_donate_button.gif" border="0" name="submit" alt="Contribute to support calibre development" style="border:0pt" />
<input type="image" src="_static/simple_donate_button.gif" border="0" name="submit" alt="Contribute to support calibre development" style="border:0pt" />
<img alt="" border="0" src="https://www.paypalobjects.com/en_GB/i/scr/pixel.gif" width="1" height="1" />
</form>
<hr/>

View File

@ -94,6 +94,13 @@ You can quickly use the current search as a temporary virtual library by
clicking the :guilabel:`Virtual Library` button and choosing the
:guilabel:`*current search` entry.
You can display all available virtual libraries as tabs above the book list.
This is particularly handy if you like switching between virtual libraries very
often. Click the :guilabel:`Virtual Library` button and select :guilabel:`Show
virtual libraries as tabs`. You can re-arrange the tabs by drag and drop and
close ones you do not want to see. Closed tabs can be restored by
right-clicking on the tab bar.
Using additional restrictions
-------------------------------

50
recipes/10minutos.recipe Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
10minutos.com.uy
'''
from calibre.web.feeds.news import BasicNewsRecipe
class General(BasicNewsRecipe):
title = '10minutos'
__author__ = 'Carlos Alves'
description = 'Noticias de Salto - Uruguay'
tags = 'news, sports'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = 'utf8'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class':'post-content'})]
remove_tags = [
dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}),
dict(name='p', attrs={'class':'post-meta'}),
dict(name=['object','link'])
]
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Articulos', u'http://10minutos.com.uy/feed/')
]
def get_cover_url(self):
return 'http://10minutos.com.uy/a/img/logo.png'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

30
recipes/acrimed.recipe Normal file
View File

@ -0,0 +1,30 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012'
'''
acrimed.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Acrimed(BasicNewsRecipe):
title = u'Acrimed'
__author__ = 'Gaëtan Lehmann'
oldest_article = 30
max_articles_per_feed = 100
auto_cleanup = True
auto_cleanup_keep = '//div[@class="crayon article-chapo-4112 chapo"]'
language = 'fr'
masthead_url = 'http://www.acrimed.org/IMG/siteon0.gif'
feeds = [(u'Acrimed', u'http://www.acrimed.org/spip.php?page=backend')]
preprocess_regexps = [
(re.compile(r'<title>(.*) - Acrimed \| Action Critique M.*dias</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
(re.compile(r'<h2>(.*) - Acrimed \| Action Critique M.*dias</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>')]
extra_css = """
.chapo{font-style:italic; margin: 1em 0 0.5em}
"""

View File

@ -3,10 +3,10 @@ from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
__HiResImg__ = True
'''
Change Log:
2013/09/28 -- update due to website redesign, add cover
2013/03/30 -- first version
'''
@ -15,7 +15,7 @@ from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
@ -32,18 +32,17 @@ class AppleDaily(BasicNewsRecipe):
encoding = 'utf-8'
auto_cleanup = False
remove_javascript = True
use_embedded_content = False
use_embedded_content = False
no_stylesheets = True
description = 'http://www.am730.com.hk'
category = 'Chinese, News, Hong Kong'
masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
dict(name='div', attrs={'class':'thecontent wordsnap'}),
dict(name='a', attrs={'class':'lightboximg'})]
remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
dict(name='img', attrs={'src':'/images/am_endmark.gif'})]
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'
keep_only_tags = [dict(name='h2', attrs={'class':'printTopic'}),
dict(name='div', attrs={'id':'article_content'}),
dict(name='div', attrs={'id':'slider'})]
remove_tags = [dict(name='img', attrs={'src':'images/am730_article_logo.jpg'}),
dict(name='img', attrs={'src':'images/am_endmark.gif'})]
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
@ -84,6 +83,16 @@ class AppleDaily(BasicNewsRecipe):
def get_weekday(self):
return self.get_dtlocal().weekday()
def get_cover_url(self):
soup = self.index_to_soup('http://www.am730.com.hk')
cover = 'http://www.am730.com.hk/' + soup.find(attrs={'id':'mini_news_img'}).find('img').get('src', False)
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
except:
cover = None
return cover
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
@ -93,48 +102,17 @@ class AppleDaily(BasicNewsRecipe):
def parse_index(self):
feeds = []
soup = self.index_to_soup('http://www.am730.com.hk/')
ul = soup.find(attrs={'class':'nav-section'})
sectionList = []
for li in ul.findAll('li'):
a = 'http://www.am730.com.hk/' + li.find('a', href=True).get('href', False)
title = li.find('a').get('title', False).strip()
sectionList.append((title, a))
for title, url in sectionList:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))
optgroups = soup.findAll('optgroup')
for optgroup in optgroups:
sectitle = optgroup.get('label')
articles = []
for option in optgroup.findAll('option'):
articlelink = "http://www.am730.com.hk/" + option.get('value')
title = option.string
articles.append({'title': title, 'url': articlelink})
feeds.append((sectitle, articles))
return feeds
def parse_section(self, url):
soup = self.index_to_soup(url)
items = soup.findAll(attrs={'style':'padding-bottom: 15px;'})
current_articles = []
for item in items:
a = item.find(attrs={'class':'t6 f14'}).find('a', href=True)
articlelink = 'http://www.am730.com.hk/' + a.get('href', True)
title = self.tag_to_string(a)
description = self.tag_to_string(item.find(attrs={'class':'t3 f14'}))
current_articles.append({'title': title, 'url': articlelink, 'description': description})
return current_articles
def preprocess_html(self, soup):
multia = soup.findAll('a')
for a in multia:
if not (a == None):
image = a.find('img')
if not (image == None):
if __HiResImg__:
image['src'] = image.get('src').replace('/thumbs/', '/')
caption = image.get('alt')
tag = Tag(soup, "photo", [])
tag2 = Tag(soup, "photocaption", [])
tag.insert(0, image)
if not caption == None:
tag2.insert(0, caption)
tag.insert(1, tag2)
a.replaceWith(tag)
return soup
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
@ -288,3 +266,4 @@ class AppleDaily(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)

View File

@ -12,26 +12,30 @@ class anan(BasicNewsRecipe):
title = 'Anandtech'
description = 'comprehensive Hardware Tests'
__author__ = 'Oliver Niesner' # 2012-09-20 AGE: update
__author__ = 'Oliver Niesner, Armin Geller' # 2013-09-07 AGE: update
use_embedded_content = False
language = 'en'
timefmt = ' [%d %b %Y]'
oldest_article = 7 # 2012-09-20 AGE: update
oldest_article = 7
max_articles_per_feed = 40
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
cover_url = 'http://www.anandtech.com/content/images/globals/header_logo.png' # 2012-09-20 AGE: new
masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png' # 2012-09-20 AGE: update
cover_url = 'http://www.anandtech.com/content/images/globals/header_logo.png'
masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png'
keep_only_tags = [
dict(name='section', attrs={'class':['main_cont']}),
]
remove_tags=[ # 2013-09-07 AGE: update
dict(name='div', attrs={'class':['print', # logo
'breadcrumb_area noprint',
'fl-rt noprint',
'blog_top_right',]})
]
remove_tags=[
dict(name='a', attrs={'class': 'bluebutton noprint'}),
dict(name='img', attrs={'alt': 'header'}),
] # 2012-09-20 AGE: update
feeds = [ ('Anandtech', 'http://www.anandtech.com/rss/')]
feeds = [('Anandtech', 'http://www.anandtech.com/rss/')]
def print_version(self,url):
return url.replace('0Cshow0C', '0Cprint0C') # 2012-09-20 AGE: update
return url.replace("0Cshow0C", "0Cprint0C") # 2013-09-07 AGE: update

View File

@ -21,21 +21,9 @@ class AntywebRecipe(BasicNewsRecipe):
simultaneous_downloads = 3
keep_only_tags =[]
keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'}))
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'}))
remove_tags =[]
remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'}))
remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'}))
remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
'''
keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'entry-title '}))
keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-content'}))
extra_css = '''body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}'''
feeds = [
(u'Artykuly', u'feed://feeds.feedburner.com/Antyweb?format=xml'),

View File

@ -0,0 +1,63 @@
#
# Written: July 2013
# Last Edited: 2013-07-11
# Version: 1.0
# Last update: 2013-07-25
#
__license__ = 'GPL v3'
__copyright__ = '2013, Armin Geller'
'''
Fetch blindenbuch.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
title = u'Blindbuch - Bücher neu entdecken'
__author__ = 'Armin Geller' # AGe 2013-07-11
description = u'Bücher blind präsentiert'
publisher = 'blindbuch.de'
publication_type = 'ebook news'
tags = 'Bücher, Literatur, E-Books, Germany'
timefmt = ' [%a, %d %b %Y]'
publication_type = 'Feed'
language = 'de-DE'
encoding = 'utf-8'
oldest_article = 14
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_javascript = True
conversion_options = {'title' : title,
'comments' : description,
'tags' : tags,
'language' : language,
'publisher' : publisher,
'authors' : publisher,
}
cover_url = 'http://blindbuch.de/img/blindbuch_calibre.png'
masthead_url = 'http://www.blindbuch.de/img/Masterhead.JPG'
extra_css = '''
h1{font-weight:bold;font-size:large;}
.post-meta {font-size: 1em;text-align: left; font-style: italic}
'''
keep_only_tags = [
dict(name='article')
]
remove_tags = [
dict(name='div', attrs={'class':['su-spoiler su-spoiler-style-1','post-comments comments',]}),
dict(name='span', attrs={'class':['post-comments comments',]}),
dict(name='div', attrs={'addthis':['title',]}),
]
feeds = [(u'Blindbuch', u'http://www.blindbuch.de/feed/')]

View File

@ -0,0 +1,92 @@
import html5lib
from lxml import etree
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_xml_chars
def is_title(tag):
return tag.name == 'h2' and tag.parent.name == 'div' and tag.parent['class'] == 'left-corner'
class CaravanMagazine(BasicNewsRecipe):
title = 'Caravan Magazine'
__author__ = 'Kovid Goyal'
description = 'An Indian Journal of politics and culture'
language = 'en_IN'
timefmt = ' [%b, %Y]'
no_stylesheets = True
keep_only_tags = [
dict(name=is_title),
dict(attrs={'class':['subhheading', 'authorndate', 'full-image-view', 'fullpage-body']}),
]
remove_tags = [
dict(attrs={'class':['share-with']}),
dict(attrs={'class':lambda x: x and 'thumb-image-view' in x}),
]
def preprocess_raw_html(self, raw_html, url):
root = html5lib.parse(
clean_xml_chars(raw_html), treebuilder='lxml',
namespaceHTMLElements=False)
for s in root.xpath('//script'):
s.getparent().remove(s)
return etree.tostring(root, encoding=unicode)
def preprocess_html(self, soup):
# Handle the image thumbnails
for div in soup.findAll('div', attrs={'class':lambda x: x and x.startswith('show-image')}):
if div['class'] == 'show-image':
div.extract()
else:
div['style'] = 'page-break-inside:avoid'
return soup
# To parse the article TOC
def parse_index(self):
raw = self.index_to_soup(
'http://caravanmagazine.in/current-issue', raw=True)
raw = raw.decode('utf-8')
raw = self.preprocess_raw_html(raw, None)
soup = self.index_to_soup(raw)
a = soup.find('a', rel=lambda x:x and '[field_c_issues_image]' in x)
if a is not None:
self.cover_url = a['href']
ci = soup.find(attrs={'class': 'current-issue-block'})
current_section = 'Section'
current_articles = []
feeds = []
for div in ci.findAll(
attrs={'class': ['view-header', 'view-content']}):
if div['class'] == 'view-header':
if current_articles:
feeds.append((current_section, current_articles))
current_section = self.tag_to_string(div).replace('paging_filter', '')
current_articles = []
self.log('Section:', current_section)
else:
for art in div.findAll('div', attrs={'class': lambda x: x and 'views-row' in x.split()}):
title = div.find(attrs={'class': 'views-field-title'})
if title is not None:
a = title.find('a', href=True)
if a is not None:
href = a['href']
if href.startswith('/'):
href = 'http://caravanmagazine.in' + href
article = {
'title': self.tag_to_string(title), 'url': href}
title.extract()
desc = self.tag_to_string(div).strip()
if desc:
article['description'] = desc
current_articles.append(article)
self.log('\t' + article['title'])
self.log('\t\t' + article['url'])
if current_articles:
feeds.append((current_section, current_articles))
return feeds

View File

@ -12,7 +12,7 @@ class Carta(BasicNewsRecipe):
title = u'Carta'
description = 'News about electronic publishing'
__author__ = 'Oliver Niesner'
__author__ = 'Oliver Niesner' # AGe Update 2013-10-13
use_embedded_content = False
timefmt = ' [%a %d %b %Y]'
oldest_article = 7
@ -25,7 +25,7 @@ class Carta(BasicNewsRecipe):
remove_tags_after = [dict(name='p', attrs={'class':'tags-blog'})]
remove_tags_after = [dict(name='div', attrs={'id':'BlogContent'})] # AGe
remove_tags = [dict(name='p', attrs={'class':'print'}),
dict(name='p', attrs={'class':'tags-blog'}),

View File

@ -1,23 +1,29 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1312361378(BasicNewsRecipe):
title = u'Carta capital'
__author__ = 'Pablo Aldama'
class AdvancedUserRecipe1380852962(BasicNewsRecipe):
title = u'Carta Capital'
__author__ = 'Erico Lisboa'
language = 'pt_BR'
oldest_article = 9
oldest_article = 15
max_articles_per_feed = 100
auto_cleanup = True
use_embedded_content = False
feeds = [(u'Politica', u'http://www.cartacapital.com.br/category/politica/feed')
,(u'Economia', u'http://www.cartacapital.com.br/category/economia/feed')
,(u'Cultura', u'http://www.cartacapital.com.br/category/cultura/feed')
,(u'Internacional', u'http://www.cartacapital.com.br/category/internacional/feed')
,(u'Saude', u'http://www.cartacapital.com.br/category/saude/feed')
,(u'Sociedade', u'http://www.cartacapital.com.br/category/sociedade/feed')
,(u'Tecnologia', u'http://www.cartacapital.com.br/category/tecnologia/feed')
,(u'Carta na escola', u'http://www.cartacapital.com.br/category/carta-na-escola/feed')
,(u'Carta fundamental', u'http://www.cartacapital.com.br/category/carta-fundamental/feed')
,(u'Carta verde', u'http://www.cartacapital.com.br/category/carta-verde/feed')
]
def print_version(self, url):
return url + '/print'
feeds = [(u'Pol\xedtica',
u'http://www.cartacapital.com.br/politica/politica/rss'), (u'Economia',
u'http://www.cartacapital.com.br/economia/economia/atom.xml'),
(u'Sociedade',
u'http://www.cartacapital.com.br/sociedade/sociedade/atom.xml'),
(u'Internacional',
u'http://www.cartacapital.com.br/internacional/internacional/atom.xml'),
(u'Tecnologia',
u'http://www.cartacapital.com.br/tecnologia/tecnologia/atom.xml'),
(u'Cultura',
u'http://www.cartacapital.com.br/cultura/cultura/atom.xml'),
(u'Sa\xfade', u'http://www.cartacapital.com.br/saude/saude/atom.xml'),
(u'Educa\xe7\xe3o',
u'http://www.cartacapital.com.br/educacao/educacao/atom.xml')]

View File

@ -1,3 +1,5 @@
## Last Edit: 2013-08-23
## From: Armin Geller
__license__ = 'GPL v3'
__copyright__ = '2010, NA'
'''
@ -18,33 +20,30 @@ class Consumerist(BasicNewsRecipe):
encoding = 'utf-8'
use_embedded_content = False
language = 'en'
masthead_url = 'http://consumerist.com/css/images/footer_man.gif'
masthead_url = 'http://consumermediallc.files.wordpress.com/2013/02/consumerist.png'# AGe 2013-08-23
extra_css = '''
body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif}
img{margin-bottom: 1em}
h1{font-family :Arial,Helvetica,sans-serif; font-size:x-large}
h2{font-family :Arial,Helvetica,sans-serif; font-size:large}
'''
body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif}
img{margin-bottom: 1em}
h1{font-family :Arial,Helvetica,sans-serif; font-size:x-large}
h2{font-family :Arial,Helvetica,sans-serif; font-size:large}
'''
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment' : description,
'tags' : category,
'publisher' : publisher,
'language' : language,
}
remove_attributes = ['width','height']
#keep_only_tags = [dict(attrs={'class':['', 'category-breadcrumb']}),]
remove_tags_before = dict(name='h2')
remove_tags = [
#dict(name='iframe'),
dict(name='div', attrs={'class':['e-comments', 'more-about', 'entry-tags']}),
#dict(name='div', attrs={'id':['IEContainer', 'clickIncludeBox']}),
#dict(name='ul', attrs={'class':'article-tools'}),
#dict(name='ul', attrs={'class':'articleTools'}),
]
keep_only_tags = dict(name='div', attrs={'class':['hfeed',]}) # AGe 2013-08-23
remove_tags_after = dict(attrs={'class':'e-body'})
remove_tags = [dict(name='div', attrs={'class':['navigation', # AGe 2013-08-23
'wpcom-related-posts widget widget_related_posts', # AGe 2013-08-23
'sharedaddy sd-like-enabled sd-sharing-enabled',]}), # AGe 2013-08-23
dict(name='div', attrs={'id':['comments',]}), # AGe 2013-08-23
]
feeds = [(u'Articles', u'http://consumerist.com/index.xml')]

View File

@ -20,27 +20,22 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#article_already_exists = False
#feed_hash = ''
def get_cover_url(self):
soup = self.index_to_soup('http://www.countryfile.com/magazine')
cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px_wide')})#'width' : '160',
print '&&&&&&&& ',cov,' ***'
cov=str(cov)
#cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
cov2 = re.findall('/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
soup = self.index_to_soup('http://www.countryfile.com/magazine')
cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px')}) # 'width' : '160',
cov2 = str(cov2)
cov2= "http://www.countryfile.com"+cov2[2:len(cov2)-8]
print '******** ',cov2,' ***'
# try to get cover - if can't get known cover
br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov2)
cover_url = cov2
except:
cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
return cover_url
cov=str(cov)
cov=cov[10:]
cov=cov[:-135]
br = browser()
br.set_handle_redirect(False)
try:
br.open_novisit(cov)
cover_url = cov
except:
cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
return cover_url
preprocess_regexps = [
(re.compile(r' \| Countryfile.com', re.IGNORECASE | re.DOTALL), lambda match: '')]
remove_tags = [
# dict(attrs={'class' : ['player']}),
@ -48,6 +43,5 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
feeds = [
(u'Homepage', u'http://www.countryfile.com/rss/home'),
(u'Country News', u'http://www.countryfile.com/rss/news'),
(u'Countryside', u'http://www.countryfile.com/rss/countryside'),
(u'Countryside', u'http://www.countryfile.com/rss/countryside'),
]

View File

@ -1,63 +1,51 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Cracked(BasicNewsRecipe):
title = u'Cracked.com'
__author__ = 'UnWeave'
language = 'en'
description = "America's Only HumorSite since 1958"
publisher = 'Cracked'
category = 'comedy, lists'
oldest_article = 3 #days
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'ascii'
remove_javascript = True
use_embedded_content = False
feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
class Cracked(BasicNewsRecipe):
title = u'Cracked.com'
__author__ = 'UnWeave'
language = 'en'
description = "America's Only HumorSite since 1958"
publisher = 'Cracked'
category = 'comedy, lists'
oldest_article = 3 # days
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'ascii'
remove_javascript = True
use_embedded_content = False
# auto_cleanup = True
feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags_before = dict(id='PrimaryContent')
keep_only_tags = [dict(name='article', attrs={'class': 'module article dropShadowBottomCurved'}),
dict(name='article', attrs={'class': 'module blog dropShadowBottomCurved'})]
remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
remove_tags = [ dict(name='div', attrs={'class':['social',
'FacebookLike',
'shareBar'
]}),
dict(name='div', attrs={'id':['inline-share-buttons',
]}),
dict(name='span', attrs={'class':['views',
'KonaFilter'
]}),
#dict(name='img'),
]
remove_tags = [
dict(name='section', attrs={'class': ['socialTools', 'quickFixModule']})]
def appendPage(self, soup, appendTag, position):
# Check if article has multiple pages
pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
if pageNav:
# Check not at last page
nextPage = pageNav.find('a', attrs={'class':'next'})
nextPage = pageNav.find('a', attrs={'class': 'next'})
if nextPage:
nextPageURL = nextPage['href']
nextPageSoup = self.index_to_soup(nextPageURL)
# 8th <section> tag contains article content
nextPageContent = nextPageSoup.findAll('section')[7]
nextPageContent = nextPageSoup.findAll('article')[0]
newPosition = len(nextPageContent.contents)
self.appendPage(nextPageSoup,nextPageContent,newPosition)
self.appendPage(nextPageSoup, nextPageContent, newPosition)
nextPageContent.extract()
pageNav.extract()
appendTag.insert(position,nextPageContent)
appendTag.insert(position, nextPageContent)
def preprocess_html(self, soup):
self.appendPage(soup, soup.body, 3)
return soup

View File

@ -0,0 +1,88 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
title = u'Daily Express'
__author__ = 'Dave Asbury'
# 9-9-13 added article author and now use (re.compile(r'>[\w].+? News<'
encoding = 'utf-8'
remove_empty_feeds = True
#remove_javascript = True
no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 10
#auto_cleanup = True
compress_news_images = True
compress_news_images_max_size = 30
ignore_duplicate_articles = {'title', 'url'}
masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png'
preprocess_regexps = [
(re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
(re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
(re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
(re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'),
(re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'),
#(re.compile(r'Health News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
#(re.compile(r'<h3>More UK</h3>', re.IGNORECASE | re.DOTALL), lambda match: ''),
]
remove_tags = [
dict(attrs={'class' : 'quote'}),
#dict(attrs={'class' : 'author'}),
dict(name='footer'),
dict(attrs={'id' : 'header_addons'}),
dict(attrs={'class' : 'hoverException'}),
dict(name='_li'),dict(name='li'),
dict(attrs={'class' : 'box related-articles clear'}),
dict(attrs={'class' : 'news-list'}),
dict(attrs={'class' : 'sponsored-section'}),
dict(attrs={'class' : 'pull-quote on-right'}),
dict(attrs={'class' : 'pull-quote on-left'}),
]
keep_only_tags = [
dict(name='h1'),
dict(attrs={'class' : 'publish-info'}),
dict(name='h3', limit=2),
dict(attrs={'class' : 'clearfix hR new-style'}),
]
feeds = [(u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
(u'World News',u'http://www.express.co.uk/posts/rss/78/world'),
(u'Finance',u'http://www.express.co.uk/posts/rss/21/finance'),
(u'Sport',u'http://www.express.co.uk/posts/rss/65/sport'),
(u'Entertainment',u'http://www.express.co.uk/posts/rss/18/entertainment'),
(u'Lifestyle',u'http://www.express.co.uk/posts/rss/8/life&style'),
(u'Fun',u'http://www.express.co.uk/posts/rss/110/fun'),
]
def get_cover_url(self):
soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
cov = soup.find(attrs={'src' : re.compile('http://images.dailyexpress.co.uk/img/covers/')})
cov=str(cov)
cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
cov=str(cov2)
cov=cov[2:len(cov)-2]
cover_url=cov
return cover_url
extra_css = '''
h1{font-weight:bold;font-size:175%;}
h2{font-weight:normal;font-size:75%;}
#p{font-size:14px;}
#body{font-size:14px;}
.photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
.publish-info {font-size:50%;}
.photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
'''

View File

@ -7,50 +7,50 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
description = 'News as provided by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
# last updated 19/10/12
# last updated 27/8/13
language = 'en_GB'
#cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
#recursions = 10
compress_news_images = True
oldest_article = 1
max_articles_per_feed = 12
compress_news_images_max_size = 30
oldest_article = 1.5
max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
ignore_duplicate_articles = {'title'}
ignore_duplicate_articles = {'url'}
# auto_cleanup = True
#auto_cleanup = True
#conversion_options = { 'linearize_tables' : True }
keep_only_tags = [ dict(name='h1'),
keep_only_tags = [dict(name='h1'),
dict(name='div',attrs={'class' : 'lead-text'}),
dict(name='div',attrs={'class' : 'styleGroup clearfix'}),
dict(attrs={'class' : 'tools clearfix'}),
dict(name='div',attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
# dict(name='figure',attrs={'class' : 'clearfix'}),
dict(name='div',attrs={'class' :'body '}),
dict(name='div',attrs={'class' :'thumb'}),
dict(attrs={'img alt' : ['Perishers','Horace']}),
#dict(attrs={'class' : 'tmRow span-15-5 col-1 article-page'}),
#dict(attrs={'class' : ['article-attr','byline append-1','published']}),
#dict(name='p'),
# dict(name='p'),
]
remove_tags = [
dict(attrs={'class' : ['article sa-teaser type-opinion','image-gallery','gallery-caption']}),
dict(attrs={'class' : ['article sa-teaser type-opinion','last','gallery-caption','gallery-data','ir btn-fullscreen','avatar']}), # ,'image-gallery'
dict(attrs={'class' : 'comment'}),
dict(name='title'),
dict(name='ul',attrs={'class' : 'clearfix breadcrumbs '}),
dict(name='ul',attrs={'id' : 'login-201109171215'}),
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),#'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
#'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
]
preprocess_regexps = [
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News',u'http://www.mirror.co.uk/news/rss.xml'),
(u'Sports',u'http://www.mirror.co.uk/sport/rss.xml'),
@ -63,26 +63,31 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:170%;}
.article figure figcaption {display: block;margin-left: auto;margin-right: auto;
width:100%;font-family:Arial,Helvetica,sans-serif;font-size:40%;}
#h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;}
p{font-family:Arial,Helvetica,sans-serif;}
body{font-family:Helvetica,Arial,sans-serif;}
.article figure{display: block;margin-left: auto;margin-right: auto;width:100%;}
.lead-text p {font-size:150%}
'''
def get_cover_url(self):
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the mirror button and url
# look for the block containing the mirror button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-142]
#cov2 now contains url of the page containing pic
# cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov=str(cov)
cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
cov2 = str(cov2)
cov2=cov2[2:len(cov2)-2]
#cov2 now is pic url, now go back to original function
# cov2 now is pic url, now go back to original function
br = browser()
br.set_handle_redirect(False)
try:

View File

@ -0,0 +1,45 @@
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
class DemocracyNowRecipe(BasicNewsRecipe):
title = u'Democracy now!'
__author__ = u'Antoine Beaupré'
description = 'A daily TV/radio news program, hosted by Amy Goodman and Juan Gonzalez, airing on over 1,100 stations, pioneering the largest community media collaboration in the United States.' # noqa
language = 'en'
cover_url = 'http://www.democracynow.org/images/dn-logo-for-podcast.png'
oldest_article = 1
max_articles_per_feed = 10
publication_type = 'magazine'
auto_cleanup = False
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
feeds = [
(u'Daily news', u'http://www.democracynow.org/democracynow.rss')]
keep_only_tags = [dict(name='div', attrs={'id': 'page'}), ]
remove_tags = [dict(name='div', attrs={'id': 'topics_list'}),
dict(name='div', attrs={'id': 'header'}),
dict(name='div', attrs={'id': 'footer'}),
dict(name='div', attrs={'id': 'right'}),
dict(name='div', attrs={'id': 'left-panel'}),
dict(name='div', attrs={'id': 'top-video-content'}),
dict(name='div', attrs={'id': 'google-news-date'}),
dict(name='div', attrs={'id': 'story-donate'}),
dict(
name='div', attrs={'id': 'transcript-expand-collapse'}),
dict(name='span', attrs={'class': 'show-links'}),
dict(name='span', attrs={'class': 'storyNav'}),
dict(name='div', attrs={'class': 'headline_share'}),
dict(name='div', attrs={'class': 'mediaBar'}),
dict(name='div', attrs={'class': 'shareAndPrinterBar'}),
dict(name='div', attrs={'class': 'utility-navigation'}),
dict(name='div', attrs={'class': 'bottomContentNav'}),
dict(name='div', attrs={'class': 'recentShows'}),
dict(
name='div', attrs={'class': 'printer-and-transcript-links'}),
]

View File

@ -1,72 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
diagonales.infonews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Diagonales(BasicNewsRecipe):
title = 'Diagonales'
__author__ = 'Darko Miletic'
description = 'El nuevo diario de La Plata'
publisher = 'ElArgentino.com'
description = 'Para estar bien informado sobre los temas de actualidad. Conoce sobre pais, economia, deportes, mundo, espectaculos, sociedad, entrevistas y tecnologia.'
publisher = 'INFOFIN S.A.'
category = 'news, politics, Argentina, La Plata'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es_AR'
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html'
language = 'es_AR'
publication_type = 'newspaper'
delay = 1
remove_empty_feeds = True
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')]
feeds = [
(u'Pais' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=112&Content-Type=text/xml&ChannelDesc=Pa%C3%ADs')
,(u'Deportes' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=106&Content-Type=text/xml&ChannelDesc=Deportes')
,(u'Economia' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=107&Content-Type=text/xml&ChannelDesc=Econom%C3%ADa')
,(u'Sociedad' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=109&Content-Type=text/xml&ChannelDesc=Sociedad')
,(u'Mundo' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=113&Content-Type=text/xml&ChannelDesc=Mundo')
,(u'Espectaculos', u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=114&Content-Type=text/xml&ChannelDesc=Espect%C3%A1culos')
,(u'Entrevistas' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=115&Content-Type=text/xml&ChannelDesc=Entrevistas')
,(u'Tecnologia' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=118&Content-Type=text/xml&ChannelDesc=Tecnolog%C3%ADa')
]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
return u'http://diagonales.infonews.com/Impresion.aspx?Id=' + article_id

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
diarioelpueblo.com.uy
'''
from calibre.web.feeds.news import BasicNewsRecipe
class General(BasicNewsRecipe):
title = 'Diario El Pueblo'
__author__ = 'Carlos Alves'
description = 'Noticias de Salto - Uruguay'
tags = 'news, sports'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = 'utf8'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class':'post-alt blog'})]
remove_tags = [
dict(name='div', attrs={'class':['hr', 'titlebar', 'volver-arriba-right','navigation']}),
dict(name='div', attrs={'id':'comment','id':'suckerfish','id':'crp_related'}),
dict(name='h3', attrs={'class':['post_date']}),
dict(name=['object','link'])
]
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Articulos', u'http://www.diarioelpueblo.com.uy/feed')
]
def get_cover_url(self):
return 'http://www.diarioelpueblo.com.uy/wp-content/uploads/2013/06/Cabezal_Web1.jpg'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
diariosalto.com.uy
'''
from calibre.web.feeds.news import BasicNewsRecipe
class General(BasicNewsRecipe):
title = 'Diario Salto'
__author__ = 'Carlos Alves'
description = 'Noticias de Salto - Uruguay'
tags = 'news, sports'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = 'utf8'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [
dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}),
dict(name='div', attrs={'id':'comment'}),
dict(name=['object','link'])
]
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Articulos', u'http://www.diariosalto.com.uy/feed/atom')
]
def get_cover_url(self):
return 'http://diariosalto.com.uy/demo/wp-content/uploads/2011/12/diario-salto_logo-final-b-b.png'
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -6,6 +6,7 @@ DrMerry added cover Image 2011-11-12
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
class DilbertBig(BasicNewsRecipe):
@ -16,7 +17,7 @@ class DilbertBig(BasicNewsRecipe):
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
use_embedded_content = False
encoding = 'utf-8'
publisher = 'UNITED FEATURE SYNDICATE, INC.'
category = 'comic'
@ -30,25 +31,14 @@ class DilbertBig(BasicNewsRecipe):
,'publisher' : publisher
}
feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
def get_article_url(self, article):
return article.get('feedburner_origlink', None)
feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip')]
preprocess_regexps = [
(re.compile('strip\..*\.gif', re.DOTALL|re.IGNORECASE), lambda match: 'strip.zoom.gif')
]
def preprocess_html(self, soup):
for tag in soup.findAll(name='a'):
if tag['href'].find('http://feedads') >= 0:
tag.extract()
return soup
for tag in soup.findAll(name='input'):
image = BeautifulSoup('<img src=' + tag['value'] + '></img>')
return image
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

View File

@ -6,46 +6,87 @@ __copyright__ = u'2011, Silviu Cotoar\u0103'
'''
dilemaveche.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DilemaVeche(BasicNewsRecipe):
title = u'Dilema Veche'
__author__ = u'Silviu Cotoar\u0103'
description = 'Sint vechi, domnule! (I.L. Caragiale)'
publisher = u'Adev\u0103rul Holding'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare'
encoding = 'utf-8'
cover_url = 'http://dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'c_left_column'})
]
remove_tags = [
dict(name='div', attrs={'id':['adshop_widget_428x60']}) ,
dict(name='div', attrs={'id':['gallery']})
]
remove_tags_after = [
dict(name='div', attrs={'id':['adshop_widget_428x60']})
]
feeds = [
(u'Feeds', u'http://dilemaveche.ro/rss.xml')
# published on Fridays, usually in the afternoon; depends on Luiza I think (who is credited as the creator of each article in the RSS feed)
title = u'Dilema Veche'
__author__ = 'song2' # inspired by the Le Monde script
description = '"Sint vechi, domnule!" (I.L. Caragiale)'
publisher = 'Adevarul Holding'
oldest_article = 7
max_articles_per_feed = 200
encoding = 'utf8'
language = 'ro'
masthead_url = 'http://www.dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
publication_type = 'magazine'
feeds = [
('Editoriale si opinii - Situatiunea', 'http://www.dilemaveche.ro/taxonomy/term/37/0/feed'),
('Editoriale si opinii - Pe ce lume traim', 'http://www.dilemaveche.ro/taxonomy/term/38/0/feed'),
('Editoriale si opinii - Bordeie si obiceie', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
('Editoriale si opinii - Talc Show', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
('Tema saptamanii', 'http://www.dilemaveche.ro/taxonomy/term/19/0/feed'),
('La zi in cultura - Dilema va recomanda', 'http://www.dilemaveche.ro/taxonomy/term/58/0/feed'),
('La zi in cultura - Carte', 'http://www.dilemaveche.ro/taxonomy/term/14/0/feed'),
('La zi in cultura - Film', 'http://www.dilemaveche.ro/taxonomy/term/13/0/feed'),
('La zi in cultura - Muzica', 'http://www.dilemaveche.ro/taxonomy/term/1341/0/feed'),
('La zi in cultura - Arte performative', 'http://www.dilemaveche.ro/taxonomy/term/1342/0/feed'),
('La zi in cultura - Arte vizuale', 'http://www.dilemaveche.ro/taxonomy/term/1512/0/feed'),
('Societate - Ieri cu vedere spre azi', 'http://www.dilemaveche.ro/taxonomy/term/15/0/feed'),
('Societate - Din polul opus', 'http://www.dilemaveche.ro/taxonomy/term/41/0/feed'),
('Societate - Mass comedia', 'http://www.dilemaveche.ro/taxonomy/term/43/0/feed'),
('Societate - La singular si la plural', 'http://www.dilemaveche.ro/taxonomy/term/42/0/feed'),
('Oameni si idei - Educatie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
('Oameni si idei - Polemici si dezbateri', 'http://www.dilemaveche.ro/taxonomy/term/48/0/feed'),
('Oameni si idei - Stiinta si tehnologie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
('Dileme on-line', 'http://www.dilemaveche.ro/taxonomy/term/005/0/feed')
]
remove_tags_before = dict(name='div',attrs={'class':'spacer_10'})
remove_tags = [
dict(name='div', attrs={'class':'art_related_left'}),
dict(name='div', attrs={'class':'controale'}),
dict(name='div', attrs={'class':'simple_overlay'}),
]
remove_tags_after = [dict(id='facebookLike')]
remove_javascript = True
no_stylesheets = True
remove_empty_feeds = True
extra_css = """
body{font-family: Georgia,Times,serif }
img{margin-bottom: 0.4em; display:block}
"""
needs_subscription = 'optional'
cover_margins = (10, 15, '#ffffff')
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://dilemaveche.ro/user/login')
br.select_form(nr=0)
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup('http://dilemaveche.ro')
link_item = soup.find('div',attrs={'class':'box_dr_pdf_picture'})
if link_item and link_item.a:
cover_url = link_item.a['href']
br = BasicNewsRecipe.get_browser()
try:
br.open(cover_url)
except: # if the PDF cannot be found
self.log("\nPDF indisponibil")
link_item = soup.find('div',attrs={'class':'box_dr_pdf_picture'})
if link_item and link_item.img:
cover_url = link_item.img['src']
br = BasicNewsRecipe.get_browser()
try:
br.open(cover_url)
except: # if not even the small cover image can be found
print('Neither the front page PDF nor the cover image could be found')
cover_url ='http://www.dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
return cover_url
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -11,30 +11,31 @@ class dotnetMagazine (BasicNewsRecipe):
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
auto_cleanup = True
# recursion = 1
language = 'en'
remove_empty_feeds = True
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'
remove_tags_after = dict(name='footer', id=lambda x:not x)
remove_tags_before = dict(name='header', id=lambda x:not x)
#remove_tags_after = dict(name='footer', id=lambda x:not x)
#remove_tags_before = dict(name='header', id=lambda x:not x)
remove_tags = [
dict(name='div', attrs={'class': 'item-list'}),
dict(name='h4', attrs={'class': 'std-hdr'}),
dict(name='div', attrs={'class': 'item-list share-links'}), # removes share links
dict(name=['script', 'noscript']),
dict(name='div', attrs={'id': 'comments-form'}), # comment these out if you want the comments to show
dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
dict(name='div', attrs={'id': 'right-col'}),
dict(name='div', attrs={'id': 'comments'}), # comment these out if you want the comments to show
dict(name='div', attrs={'class': 'item-list related-content'}),
#remove_tags = [
#dict(name='div', attrs={'class': 'item-list'}),
#dict(name='h4', attrs={'class': 'std-hdr'}),
#dict(name='div', attrs={'class': 'item-list share-links'}), # removes share links
#dict(name=['script', 'noscript']),
#dict(name='div', attrs={'id': 'comments-form'}), # comment these out if you want the comments to show
#dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
#dict(name='div', attrs={'id': 'right-col'}),
#dict(name='div', attrs={'id': 'comments'}), # comment these out if you want the comments to show
#dict(name='div', attrs={'class': 'item-list related-content'}),
]
#]
feeds = [
(u'net', u'http://feeds.feedburner.com/net/topstories?format=xml')
(u'net', u'http://feeds.feedburner.com/creativebloq/')
]
def skip_ad_pages(self, soup):

View File

@ -3,10 +3,10 @@ __license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, Januery 2011'
__version__ = 'v0.10'
__date__ = '07, August 2013'
'''
[url]http://www.elcorreo.com/[/url]
http://www.elcorreo.com/
'''
import time
@ -24,6 +24,7 @@ class heraldo(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
encoding = 'iso-8859-1'
@ -33,15 +34,15 @@ class heraldo(BasicNewsRecipe):
feeds = [
(u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
(u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
(u'Internacional', u'hhttp://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
(u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
(u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
(u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
(u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
(u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
(u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
(u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
(u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
(u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
(u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
(u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
(u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
(u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
(u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
(u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
]
keep_only_tags = [
@ -54,14 +55,14 @@ class heraldo(BasicNewsRecipe):
dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
dict(name='div', attrs={'id':['articulopina']}),
dict(name='div', attrs={'class':['modulo-especial','publiEspecial','carruselNoticias','vj','modulocomun2']}),
dict(name='div', attrs={'id':['articulopina','webs_asociadas']}),
dict(name='br', attrs={'class':'clear'}),
dict(name='form', attrs={'name':'frm_conversor2'})
]
remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
remove_tags_after = dict(name='div' , attrs={'class':'robapaginas'})
def get_cover_url(self):
cover = None
@ -69,10 +70,8 @@ class heraldo(BasicNewsRecipe):
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
#[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
# http://info.elcorreo.com/pdf/07082013-viz.pdf
cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
@ -92,29 +91,27 @@ class heraldo(BasicNewsRecipe):
img{margin-bottom: 0.4em}
'''
preprocess_regexps = [
# To present the image of the embedded video
# Para presentar la imagen de los video incrustados
(re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
# To separate paragraphs with a blank line
# Para separar los parrafos con una linea en blanco
(re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
# To put a blank line between the subtitle and the date and time of the news
# Para poner una linea en blanco entre el subttulo y la fecha y hora de la noticia
(re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
# To put a blank line between the intro of the embedded videos and the previous text
# Para poner una linea en blanco entre la entradilla de los videos incrustados y el texto anterior
(re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
# To view photos from the first when these are presented as a gallery
# Para sacar las fotos a partir de la primera cuando se presentan como una galeria
(re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
# To remove the link of the title
# Para quitar el enlace del titulo
(re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
(re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),

View File

@ -1,18 +1,23 @@
#!/usr/bin/env python
##
## Last Edited: 2013-09-29 Carlos Alves <carlosalves90@gmail.com>
##
__license__ = 'GPL v3'
__author__ = '2010, Yuri Alvarez<me at yurialvarez.com>'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
'''
observa.com.uy
elobservador.com.uy
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ObservaDigital(BasicNewsRecipe):
title = 'Observa Digital'
__author__ = 'yrvn'
description = 'Noticias de Uruguay'
class Noticias(BasicNewsRecipe):
title = 'El Observador'
__author__ = 'yrvn'
description = 'Noticias desde Uruguay'
tags = 'news, sports, entertainment'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
@ -23,13 +28,18 @@ class ObservaDigital(BasicNewsRecipe):
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [dict(id=['contenido'])]
keep_only_tags = [
dict(name='div', attrs={'class':'story collapsed'})
]
remove_tags = [
dict(name='div', attrs={'id':'contenedorVinculadas'}),
dict(name='p', attrs={'id':'nota_firma'}),
dict(name='div', attrs={'class':['fecha', 'copyright', 'story_right']}),
dict(name='div', attrs={'class':['photo', 'social']}),
dict(name='div', attrs={'id':'widget'}),
dict(name=['object','link'])
]
remove_attributes = ['width','height', 'style', 'font', 'color']
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
@ -37,19 +47,9 @@ class ObservaDigital(BasicNewsRecipe):
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Actualidad', u'http://www.observa.com.uy/RSS/actualidad.xml'),
(u'Deportes', u'http://www.observa.com.uy/RSS/deportes.xml'),
(u'Vida', u'http://www.observa.com.uy/RSS/vida.xml'),
(u'Ciencia y Tecnologia', u'http://www.observa.com.uy/RSS/ciencia.xml')
(u'Portada', u'http://elobservador.com.uy/rss/portada/'),
]
def get_cover_url(self):
index = 'http://www.observa.com.uy/'
soup = self.index_to_soup(index)
for image in soup.findAll('img',alt=True):
if image['alt'].startswith('Tapa El Observador'):
return image['src'].rstrip('b.jpg') + '.jpg'
return None
def preprocess_html(self, soup):
for item in soup.findAll(style=True):

View File

@ -5,8 +5,8 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.08'
__date__ = '13, November 2011'
__version__ = 'v0.09'
__date__ = '07, August 2013'
'''
elperiodicodearagon.com
'''
@ -25,11 +25,11 @@ class elperiodicodearagon(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
language = 'es'
masthead_url = 'http://pdf.elperiodicodearagon.com/img/logotipo.gif'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = True
conversion_options = {
'comments' : description
,'tags' : category
@ -56,23 +56,21 @@ class elperiodicodearagon(BasicNewsRecipe):
(u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
]
remove_attributes = ['height','width']
keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]
# Retrieve the print front page (the format=1 image has the higher resolution)
def get_cover_url(self):
index = 'http://pdf.elperiodicodearagon.com/'
index = 'http://pdf.elperiodicodearagon.com/edicion.php'
soup = self.index_to_soup(index)
for image in soup.findAll('img',src=True):
if image['src'].startswith('http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='):
return image['src'].rstrip('format=2') + 'format=1'
if image['src'].startswith('/funciones/img-public.php?key='):
return 'http://pdf.elperiodicodearagon.com' + image['src']
return None
# Use the mobile version of the articles
def print_version(self, url):
return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')

View File

@ -1,93 +0,0 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
elguardian.com.ar
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ElGuardian(BasicNewsRecipe):
title = 'El Guardian'
__author__ = 'Darko Miletic'
description = "Semanario con todas las tendencias de un pais"
publisher = 'Editorial Apache SA'
category = 'news,politics,Argentina'
oldest_article = 8
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'magazine'
issn = '1666-7476'
masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png'
extra_css = """
body{font-family: Arial,sans-serif}
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'series' : title
, 'isbn' : issn
}
keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})]
remove_tags = [dict(name=['meta','link','iframe','embed','object'])]
remove_attributes = ['lang']
feeds = [
(u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' )
,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' )
,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' )
,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' )
,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' )
,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' )
,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' )
,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' )
,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' )
,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' )
,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml')
,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' )
,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' )
]
def get_cover_url(self):
soup = self.index_to_soup('http://elguardian.com.ar/')
udata = soup.find('div', attrs={'class':'datosNumero'})
if udata:
sdata = udata.find('div')
if sdata:
stra = re.findall(r'\d+', self.tag_to_string(sdata))
self.conversion_options.update({'series_index':int(stra[1])})
unumero = soup.find('div', attrs={'class':'ultimoNumero'})
if unumero:
img = unumero.find('img', src=True)
if img:
return img['src']
return None
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

View File

@ -0,0 +1,126 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
http://www.eltribuno.info/jujuy/edicion_impresa.aspx
'''
import urllib
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
class ElTribunoJujuyImpreso(BasicNewsRecipe):
title = 'El Tribuno Jujuy (Edición Impresa)'
__author__ = 'Darko Miletic'
description = "Diario principal de Jujuy"
publisher = 'Horizontes S.A.'
category = 'news, politics, Jujuy, Argentina, World'
oldest_article = 2
language = 'es_AR'
max_articles_per_feed = 250
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publication_type = 'newspaper'
delay = 1
articles_are_obfuscated = True
temp_files = []
PREFIX = 'http://www.eltribuno.info/jujuy/'
INDEX = PREFIX + 'edicion_impresa.aspx'
PRINTURL = PREFIX + 'nota_print.aspx?%s'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
keep_only_tags = [dict(name='div' , attrs={'class':['notaHead', 'notaContent']})]
remove_tags = [
dict(name=['meta','iframe','base','object','embed','link','img']),
dict(name='ul', attrs={'class':'Tabs'})
]
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.notaHead h4{text-transform: uppercase; color: gray}
img{margin-top: 0.8em; display: block}
"""
def parse_index(self):
feeds = OrderedDict()
soup = None
count = 0
while (count < 5):
try:
soup = self.index_to_soup(self.INDEX)
count = 5
except:
print "Retrying download..."
count += 1
if not soup:
return []
alink = soup.find('a', href=True, attrs={'class':'ZoomTapa'})
if alink and 'href' in alink:
self.cover_url = alink['href']
sections = soup.findAll('div', attrs={'id':lambda x: x and x.startswith('Ediciones')})
for section in sections:
section_title = 'Sin titulo'
sectiont=section.find('h3', attrs={'class':'NombreSeccion'})
if sectiont:
section_title = self.tag_to_string(sectiont.span)
arts = section.findAll('div', attrs={'class':'Noticia NoticiaAB1'})
for article in arts:
articles = []
title=self.tag_to_string(article.div.h3.a)
url=article.div.h3.a['href']
description=self.tag_to_string(article.p)
articles.append({'title':title, 'url':url, 'description':description, 'date':''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
str = self.tag_to_string(item)
item.replaceWith(str)
return soup
def get_masthead_title(self):
return 'El Tribuno'
def get_obfuscated_article(self, url):
count = 0
while (count < 10):
try:
response = self.browser.open(url)
html = response.read()
count = 10
except:
print "Retrying download..."
count += 1
tfile = PersistentTemporaryFile('_fa.html')
tfile.write(html)
tfile.close()
self.temp_files.append(tfile)
return tfile.name
def print_version(self, url):
right = url.rpartition('/')[2]
artid = right.partition('-')[0]
params = {'Note':artid}
return (self.PRINTURL % urllib.urlencode(params))

View File

@ -0,0 +1,126 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
http://www.eltribuno.info/salta/edicion_impresa.aspx
'''
import urllib
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
class ElTribunoSaltaImpreso(BasicNewsRecipe):
title = 'El Tribuno Salta (Edición Impresa)'
__author__ = 'Darko Miletic'
description = "Diario principal de Salta"
publisher = 'Horizontes S.A.'
category = 'news, politics, Salta, Argentina, World'
oldest_article = 2
language = 'es_AR'
max_articles_per_feed = 250
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publication_type = 'newspaper'
delay = 1
articles_are_obfuscated = True
temp_files = []
PREFIX = 'http://www.eltribuno.info/salta/'
INDEX = PREFIX + 'edicion_impresa.aspx'
PRINTURL = PREFIX + 'nota_print.aspx?%s'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
keep_only_tags = [dict(name='div' , attrs={'class':['notaHead', 'notaContent']})]
remove_tags = [
dict(name=['meta','iframe','base','object','embed','link','img']),
dict(name='ul', attrs={'class':'Tabs'})
]
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.notaHead h4{text-transform: uppercase; color: gray}
img{margin-top: 0.8em; display: block}
"""
def parse_index(self):
feeds = OrderedDict()
soup = None
count = 0
while (count < 5):
try:
soup = self.index_to_soup(self.INDEX)
count = 5
except:
print "Retrying download..."
count += 1
if not soup:
return []
alink = soup.find('a', href=True, attrs={'class':'ZoomTapa'})
if alink and 'href' in alink:
self.cover_url = alink['href']
sections = soup.findAll('div', attrs={'id':lambda x: x and x.startswith('Ediciones')})
for section in sections:
section_title = 'Sin titulo'
sectiont=section.find('h3', attrs={'class':'NombreSeccion'})
if sectiont:
section_title = self.tag_to_string(sectiont.span)
arts = section.findAll('div', attrs={'class':'Noticia NoticiaAB1'})
for article in arts:
articles = []
title=self.tag_to_string(article.div.h3.a)
url=article.div.h3.a['href']
description=self.tag_to_string(article.p)
articles.append({'title':title, 'url':url, 'description':description, 'date':''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('a'):
if item.string is not None:
str = item.string
item.replaceWith(str)
else:
str = self.tag_to_string(item)
item.replaceWith(str)
return soup
def get_masthead_title(self):
return 'El Tribuno'
def get_obfuscated_article(self, url):
count = 0
while (count < 10):
try:
response = self.browser.open(url)
html = response.read()
count = 10
except:
print "Retrying download..."
count += 1
tfile = PersistentTemporaryFile('_fa.html')
tfile.write(html)
tfile.close()
self.temp_files.append(tfile)
return tfile.name
def print_version(self, url):
right = url.rpartition('/')[2]
artid = right.partition('-')[0]
params = {'Note':artid}
return (self.PRINTURL % urllib.urlencode(params))

View File

@ -1,27 +1,28 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.eluniversal.com
'''
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
class ElUniversal(BasicNewsRecipe):
title = 'El Universal'
__author__ = 'Darko Miletic'
description = 'Noticias de Venezuela'
description = 'Noticias de Venezuela y el mundo. Avances informativos de ultimo minuto. Incluye secciones de politica, deportes, economia y mas.'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
remove_empty_feeds = True
encoding = 'cp1252'
publisher = 'El Universal'
category = 'news, Caracas, Venezuela, world'
language = 'es_VE'
publication_type = 'newspaper'
cover_url = strftime('http://static.eluniversal.com/%Y/%m/%d/portada.jpg')
publication_type = 'newspaper'
masthead_url = 'http://cdn.eluniversal.com/images/eu4/back/logo-eluniversal.gif'
#cover_url = strftime('http://cdn.eluniversal.com/%Y/%m/%d/portada.jpg')
cover_url = 'http://images.eluniversal.com//pdf/primeraPlana.pdf'
extra_css = """
.txt60{font-family: Tahoma,Geneva,sans-serif; font-size: small}
.txt29{font-family: Tahoma,Geneva,sans-serif; font-size: small; color: gray}
@ -30,10 +31,10 @@ class ElUniversal(BasicNewsRecipe):
body{font-family: Verdana,Arial,Helvetica,sans-serif}
"""
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
remove_tags_before=dict(attrs={'class':'header-print MB10'})

View File

@ -1,85 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
class FocusRecipe(BasicNewsRecipe):
class Focus(BasicNewsRecipe):
__author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
title = 'Focus'
__author__ = 'Krittika Goyal'
language = 'pl'
version = 1
title = u'Focus'
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
category = 'magazine'
cover_url = ''
remove_empty_feeds = True
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100000
recursions = 0
description = 'Polish scientific monthly magazine'
timefmt = ' [%d %b, %Y]'
needs_subscription = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
# Seems to work best, but YMMV
simultaneous_downloads = 5
r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
keep_only_tags = []
keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
remove_tags = []
remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
h2{font-size: medium; font-weight: bold;}
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''
feeds = [
('Nauka', 'http://www.focus.pl/nauka/rss/'),
('Historia', 'http://www.focus.pl/historia/rss/'),
('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
('Sport', 'http://www.focus.pl/sport/rss/'),
('Technika', 'http://www.focus.pl/technika/rss/'),
('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
('Technologie', 'http://www.focus.pl/gadzety/rss/')
keep_only_tags = [dict(name='article', attrs={'class': 'content'})]
remove_tags_after = dict(name='div', attrs={'class': 'inner_article'})
remove_tags = [
dict(name='div', attrs={'class': ['social_btns']}),
]
def skip_ad_pages(self, soup):
if ('advertisement' in soup.find('title').string.lower()):
href = soup.find('a').get('href')
return self.index_to_soup(href, raw=True)
else:
return None
# TO GET ARTICLE TOC
def nejm_get_index(self):
return self.index_to_soup('http://www.focus.pl/')
def get_cover_url(self):
soup = self.index_to_soup('http://www.focus.pl/magazyn/')
tag = soup.find(name='div', attrs={'class': 'clr fl'})
if tag:
self.cover_url = 'http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)
# To parse article toc
def parse_index(self):
soup = self.nejm_get_index()
def print_version(self, url):
if url.count('focus.pl.feedsportal.com'):
u = url.find('focus0Bpl')
u = 'http://www.focus.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace('0E', '-')
u = u.replace('/nc/1//story01.htm', '/do-druku/1')
else:
u = url.replace('/nc/1', '/do-druku/1')
return u
toc = soup.find('div', id='wrapper')
articles = []
feeds = []
section_title = 'Focus Articles'
for x in toc.findAll(True):
if x.name == 'h1':
# Article found
a = x.find('a')
if a is None:
continue
title = self.tag_to_string(a)
url = a.get('href', False)
if not url or not title:
continue
# if url.startswith('story'):
url = 'http://www.focus.pl' + url
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
articles.append({'title': title, 'url': url,
'description': '', 'date': ''})
feeds.append((section_title, articles))
return feeds

View File

@ -46,35 +46,34 @@ class Frontlineonnet(BasicNewsRecipe):
keep_only_tags= [
dict(name='div', attrs={'id':'content'})
#,dict(attrs={'class':'byline'})
]
#remove_attributes=['size','noshade','border']
#def preprocess_html(self, soup):
#for item in soup.findAll(style=True):
#del item['style']
#for item in soup.findAll('img'):
#if not item.has_key('alt'):
#item['alt'] = 'image'
#return soup
remove_attributes=['size','noshade','border']
def parse_index(self):
articles = []
current_section = None
feeds = []
soup = self.index_to_soup(self.INDEX)
for feed_link in soup.findAll('div', id='headseccol'):
a = feed_link.find('a', href=True)
title = self.tag_to_string(a)
url = a['href']
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
return [('Frontline', articles)]
for h3 in soup.findAll('h3'):
if h3.get('class', None) == 'artListSec':
if articles:
feeds.append((current_section, articles))
articles = []
current_section = self.tag_to_string(h3).strip()
self.log(current_section)
elif h3.get('id', None) in {'headseccol', 'headsec'}:
a = h3.find('a', href=True)
if a is not None:
title = self.tag_to_string(a)
url = a['href']
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
self.log('\t', title, url)
if articles:
feeds.append((current_section, articles))
return feeds
#def print_version(self, url):
#return "http://www.hinduonnet.com/thehindu/thscrip/print.pl?prd=fline&file=" + url.rpartition('/')[2]
#def image_url_processor(self, baseurl, url):
#return url.replace('../images/', self.INDEX + 'images/').strip()

View File

@ -14,19 +14,12 @@ class GalaxyEdge(BasicNewsRecipe):
auto_cleanup = True
#keep_only_tags = [dict(id='content')]
#remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
#dict(id=['email-section', 'right-column', 'printfooter', 'topover',
#'slidebox', 'th_footer'])]
extra_css = '.photo-caption { font-size: smaller }'
def parse_index(self):
soup = self.index_to_soup('http://www.galaxysedge.com/')
main = soup.find('table', attrs={'width':'911'})
toc = main.find('td', attrs={'width':'225'})
main = soup.find('table', attrs={'width':'944'})
toc = main.find('td', attrs={'width':'204'})
current_section = None
current_articles = []
@ -68,41 +61,7 @@ class GalaxyEdge(BasicNewsRecipe):
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
if current_articles and current_section:
feeds.append((current_section, current_articles))
feeds.append((current_section, current_articles))
return feeds
#def preprocess_raw_html(self, raw, url):
#return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
#def postprocess_html(self, soup, first_fetch):
#for t in soup.findAll(['table', 'tr', 'td','center']):
#t.name = 'div'
#return soup
#def parse_index(self):
#today = time.strftime('%Y-%m-%d')
#soup = self.index_to_soup(
#'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
#div = soup.find(id='left-column')
#feeds = []
#current_section = None
#current_articles = []
#for x in div.findAll(['h3', 'div']):
#if current_section and x.get('class', '') == 'tpaper':
#a = x.find('a', href=True)
#if a is not None:
#current_articles.append({'url':a['href']+'?css=print',
#'title':self.tag_to_string(a), 'date': '',
#'description':''})
#if x.name == 'h3':
#if current_section and current_articles:
#feeds.append((current_section, current_articles))
#current_section = self.tag_to_string(x)
#current_articles = []
#return feeds

36
recipes/gamekult.recipe Normal file
View File

@ -0,0 +1,36 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
Gamekult.com
'''
__author__ = '2013, Malah <malah at neuf dot fr>'
from calibre.web.feeds.news import BasicNewsRecipe
class GamekultCom(BasicNewsRecipe):
title = u'Gamekult.com'
__author__ = 'Malah'
description = u"Toute l'actualité du jeu vidéo PC, consoles, mobiles."
oldest_article = 1.5
language = 'fr'
max_articles_per_feed = 100
remove_empty_feeds = True
use_embedded_content = False
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = [dict(id=['story-page','story-body'])]
remove_tags = [
dict(name='div', attrs={'class':'sharebar'}),
dict(name='object', attrs={'type':'application/x-shockwave-flash'}),
dict(name='span', attrs={'class':'share'}),
dict(name='div', attrs={'class':'story-pagination'}),
dict(name='div', attrs={'class':'pagination pagination-centered'}),
]
masthead_url = u'https://upload.wikimedia.org/wikipedia/fr/9/9c/Logo_-_GAMEKULT.png'
feeds = [
('Test', u'http://www.gamekult.com/feeds/test.html'),
('Actu', u'http://www.gamekult.com/feeds/actu.html'),
]

View File

@ -0,0 +1,10 @@
from calibre.web.feeds.news import AutomaticNewsRecipe
class BasicUserRecipe1373130920(AutomaticNewsRecipe):
title = u'Glenn Greenwald | guardian.co.uk'
language = 'en_GB'
__author__ = 'anywho'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Latest', u'http://www.guardian.co.uk/profile/glenn-greenwald/rss')]

View File

@ -4,44 +4,29 @@ __copyright__ = 'Copyright 2010 Starson17'
www.gocomics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class GoComics(BasicNewsRecipe):
title = 'Go Comics'
__author__ = 'Starson17'
__version__ = '1.06'
__date__ = '07 June 2011'
description = u'200+ Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
description = u'200+ Comics - Customize for more days/comics: Defaults to 1 day, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en'
use_embedded_content= False
no_stylesheets = True
remove_javascript = True
remove_attributes = ['style']
####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ########
# USER PREFERENCES - COMICS AND NUMBER OF COMICS TO RETRIEVE ########
# num_comics_to_get - I've tried up to 99 on Calvin&Hobbes
num_comics_to_get = 1
# comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large
comic_size = 900
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
# Please do not overload their servers by selecting all comics and 1000 strips from each!
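# A minimal sketch of how this might be customized (illustrative values only,
# not the shipped defaults): raise num_comics_to_get to pull that many
# consecutive daily strips per comic, and enable extra strips by uncommenting
# their (title, url) pairs in the list inside parse_index below, for example:
#   num_comics_to_get = 7
#   (u"Calvin and Hobbes", u"http://www.gocomics.com/calvinandhobbes"),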
conversion_options = {'linearize_tables' : True
, 'comment' : description
, 'tags' : category
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}),
]
remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}),
dict(name='div', attrs={'class':['tag-wrapper']}),
dict(name='a', attrs={'href':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
dict(name='img', attrs={'src':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
dict(name='ul', attrs={'class':['share-nav','feature-nav']}),
]
keep_only_tags = [
dict(name='h1'),
dict(name='div', id=lambda x: x and x.startswith('mutable_')),
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
@ -50,7 +35,7 @@ class GoComics(BasicNewsRecipe):
def parse_index(self):
feeds = []
for title, url in [
for i, (title, url) in enumerate([ # {{{
#(u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"),
#(u"9 Chickweed Lane", u"http://www.gocomics.com/9chickweedlane"),
#(u"Adam At Home", u"http://www.gocomics.com/adamathome"),
@ -271,7 +256,7 @@ class GoComics(BasicNewsRecipe):
(u"Strange Brew", u"http://www.gocomics.com/strangebrew"),
(u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"),
#
######## EDITORIAL CARTOONS #####################
# EDITORIAL CARTOONS #####################
#(u"Adam Zyglis", u"http://www.gocomics.com/adamzyglis"),
#(u"Andy Singer", u"http://www.gocomics.com/andysinger"),
#(u"Ben Sargent",u"http://www.gocomics.com/bensargent"),
@ -363,81 +348,65 @@ class GoComics(BasicNewsRecipe):
#(u"Walt Handelsman",u"http://www.gocomics.com/walthandelsman"),
#(u"Wayne Stayskal",u"http://www.gocomics.com/waynestayskal"),
#(u"Wit of the World",u"http://www.gocomics.com/witoftheworld"),
]:
print 'Working on: ', title
]): # }}}
self.log('Working on: ', title, url)
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
if self.test and i > 0:
break
return feeds
def make_links(self, url):
title = 'Temp'
current_articles = []
pages = range(1, self.num_comics_to_get+1)
for page in pages:
if self.test:
self.num_comics_to_get = 2
num = self.num_comics_to_get
while num > 0:
num -= 1
page_soup = self.index_to_soup(url)
if page_soup:
try:
strip_title = page_soup.find(name='div', attrs={'class':'top'}).h1.a.string
except:
strip_title = 'Error - no Title found'
try:
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
if not date_title:
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
except:
date_title = 'Error - no Date found'
title = strip_title + ' - ' + date_title
for i in range(2):
try:
strip_url_date = page_soup.find(name='div', attrs={'class':'top'}).h1.a['href']
break # success - this is normal exit
except:
strip_url_date = None
continue # try to get strip_url_date again
for i in range(2):
try:
prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href']
break # success - this is normal exit
except:
prev_strip_url_date = None
continue # try to get prev_strip_url_date again
if strip_url_date:
page_url = 'http://www.gocomics.com' + strip_url_date
else:
continue
if prev_strip_url_date:
prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date
else:
continue
if not page_soup:
break
content = page_soup.find(id='content')
if content is None:
break
feature = content.find(name='div', attrs={'class':'feature'})
feature_nav = content.find(name='ul', attrs={'class':'feature-nav'})
if feature is None or feature_nav is None:
break
try:
a = feature.find('h1').find('a', href=True)
except:
self.log.exception('Failed to find current page link')
break
page_url = a['href']
if page_url.startswith('/'):
page_url = 'http://www.gocomics.com' + page_url
try:
strip_title = self.tag_to_string(feature.find('h1').find('a', href=True))
except:
strip_title = 'Error - no Title found'
try:
date_title = self.tag_to_string(feature_nav.find('li'))
except:
date_title = 'Error - no Date found'
title = strip_title + ' - ' + date_title
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
url = prev_page_url
a = feature_nav.find('a', href=True, attrs={'class':'prev'})
if a is None:
break
url = a['href']
if url.startswith('/'):
url = 'http://www.gocomics.com' + url
current_articles.reverse()
return current_articles
def preprocess_html(self, soup):
if soup.title:
title_string = soup.title.string.strip()
_cd = title_string.split(',',1)[1]
comic_date = ' '.join(_cd.split(' ', 4)[0:-1])
if soup.h1.span:
artist = soup.h1.span.string
soup.h1.span.string.replaceWith(comic_date + artist)
feature_item = soup.find('p',attrs={'class':'feature_item'})
if feature_item.a:
a_tag = feature_item.a
a_href = a_tag["href"]
img_tag = a_tag.img
img_tag["src"] = a_href
img_tag["width"] = self.comic_size
img_tag["height"] = None
return self.adeify_images(soup)
headings = soup.findAll('h1')
for h1 in headings[1:]:
h1.extract()
self.adeify_images(soup)
return soup
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

View File

@ -47,13 +47,7 @@ class GN(BasicNewsRecipe):
return feeds
def find_articles(self, main_block):
for a in main_block.findAll('div', attrs={'class':'prev_doc2'}):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),
'url' : 'http://www.gosc.pl' + art['href']
}
for a in main_block.findAll('div', attrs={'class':'sr-document'}):
for a in main_block.findAll('div', attrs={'class':['prev_doc2', 'sr-document']}):
art = a.find('a')
yield {
'title' : self.tag_to_string(art),

View File

@ -39,10 +39,10 @@ class HBR(BasicNewsRecipe):
br.visit('https://hbr.org/login?request_url=/', timeout=20)
except Timeout:
pass
br.click('#accordion div[tabindex="0"]', wait_for_load=False)
f = br.select_form('#signin-form')
f['signin-form:username'] = username
f['signin-form:password'] = password
br.click('#form-wrapper h3[tabindex="0"]', wait_for_load=False)
f = br.select_form('#login-form')
f['username'] = username
f['password'] = password
br.submit(wait_for_load=False)
br.run_for_a_time(30)
@ -56,7 +56,8 @@ class HBR(BasicNewsRecipe):
articles = []
for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']):
if x.name == 'h4':
if x.get('class', None) == 'basic':continue
if x.get('class', None) == 'basic':
continue
if current_section is not None and articles:
feeds.append((current_section, articles))
current_section = self.tag_to_string(x).capitalize()
@ -64,7 +65,8 @@ class HBR(BasicNewsRecipe):
self.log('\tFound section:', current_section)
else:
a = x.find('a', href=True)
if a is None: continue
if a is None:
continue
title = self.tag_to_string(a)
url = a['href']
if '/ar/' not in url:
@ -90,11 +92,11 @@ class HBR(BasicNewsRecipe):
def parse_index(self):
soup0 = self.index_to_soup('http://hbr.org/magazine')
datencover = soup0.find('ul', attrs={'id':'magazineArchiveCarousel'}).findAll('li')[-1]
#find date & cover
# find date & cover
self.cover_url=datencover.img['src']
dates=self.tag_to_string(datencover.img['alt'])
self.timefmt = u' [%s]'%dates
soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs = {'class':'magazine_page'}).a['href'])
soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs={'class':'magazine_page'}).a['href'])
feeds = self.hbr_parse_toc(soup)
return feeds

View File

@ -1,44 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch High Country News - Blogs
'''
from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNewsBlogs(BasicNewsRecipe):
title = u'High Country News - Blogs'
description = u'High Country News - Blogs (RSS Version)'
__author__ = 'Armin Geller' # 2012-08-01
publisher = 'High Country News'
category = 'news, politics, Germany'
timefmt = ' [%a, %d %b %Y]'
language = 'en'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
use_embedded_content = False
masthead_url = 'http://www.hcn.org/logo.jpg'
cover_source = 'http://www.hcn.org'
def get_cover_url(self):
cover_source_soup = self.index_to_soup(self.cover_source)
preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
return preview_image_div.div.img['src']
feeds = [
(u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
(u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
(u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
(u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),
]
def print_version(self, url):
return url

View File

@ -1,6 +1,12 @@
# -*- coding: utf-8 -*-
#
# Written: 2012-01-28
# Last Edited: 2013-09-06
# Remark: Version 1.3
# Update cleanup for new web article design
#
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
__copyright__ = '2013, Armin Geller'
'''
Fetch High Country News
@ -9,35 +15,77 @@ from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNews(BasicNewsRecipe):
title = u'High Country News'
description = u'News from the American West'
__author__ = 'Armin Geller' # 2012-01-31
description = u'High Country News (RSS Version)'
__author__ = 'Armin Geller'
publisher = 'High Country News'
category = 'news, politics'
timefmt = ' [%a, %d %b %Y]'
language = 'en'
encoding = 'UTF-8'
publication_type = 'newspaper'
oldest_article = 7
oldest_article = 14
max_articles_per_feed = 100
no_stylesheets = True
auto_cleanup = True
auto_cleanup = False
remove_javascript = True
remove_empty_feeds = True
use_embedded_content = False
masthead_url = 'http://www.hcn.org/logo.jpg' # 2012-01-31 AGe add
cover_source = 'http://www.hcn.org' # 2012-01-31 AGe add
def get_cover_url(self): # 2012-01-31 AGe add
cover_source_soup = self.index_to_soup(self.cover_source)
preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
return preview_image_div.div.img['src']
masthead_url = 'http://www.hcn.org/logo.jpg'
cover_source = 'http://www.hcn.org'
def get_cover_url(self):
cover_source_soup = self.index_to_soup(self.cover_source)
preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
return preview_image_div.div.img['src']
feeds = [
(u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
(u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),
(u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent?format=xml'),
(u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue?format=xml'),
(u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
(u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
(u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
(u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),
(u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
(u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
]
def print_version(self, url):
return url + '/print_view'
# 2013-07-23 AGe New coding w/o using print_version
keep_only_tags = [
dict(name='div', attrs={'id':['content']}),
]
remove_tags = [
dict(name='div', attrs={'class':['documentActions supercedeDocumentActions editorialDocumentActions',
'documentActions supercedeDocumentActions editorialDocumentActions editorialFooterDocumentActions',
'article-sidebar',
'image-viewer-controls nojs',
'protectedArticleWrapper',
'visualClear',
'feed-icons', # 2013-09-06 AGe add
'PayWallEmail', # 2013-09-06 AGe add
]}),
dict(name='div', attrs={'id':['offer-below-locked-article']}), # 2013-09-06 AGe add
]
INDEX = ''
def append_page(self, soup, appendtag, position):
pager = soup.find('span',attrs={'class':'next'})
if pager:
nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'article-text'})
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
pager = soup.find('div',attrs={'class':'listingBar listingBar-article'})
if pager:
pager.extract()
return self.adeify_images(soup)

View File

@ -1,41 +1,206 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re, time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.date import dt_factory, local_tz
from datetime import datetime, timedelta, date
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
class HoustonChronicle(BasicNewsRecipe):
title = u'The Houston Chronicle'
title = u'The Houston Chronicle'
description = 'News from Houston, Texas'
__author__ = 'Kovid Goyal'
language = 'en'
timefmt = ' [%a, %d %b, %Y]'
__author__ = 'Dale Furrow'
language = 'en'
no_stylesheets = True
use_embedded_content = False
# use_embedded_content = False
remove_attributes = ['style']
auto_cleanup = True
oldest_article = 3.0
#keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
#'hst-articletext' in x or 'hst-galleryitem' in x)}
remove_empty_feeds = True
timefmt = '[%a, %d %b %Y]'
timestampfmt = '%Y%m%d%H%M%S'
ignore_duplicate_articles = {'url'}
remove_attributes = ['xmlns']
feeds = [
('News', "http://www.chron.com/rss/feed/News-270.php"),
('Sports',
'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'),
('Neighborhood',
'http://www.chron.com/rss/feed/Neighborhood-305.php'),
('Business', 'http://www.chron.com/rss/feed/Business-287.php'),
('Entertainment',
'http://www.chron.com/rss/feed/Entertainment-293.php'),
('Editorials',
'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'),
('Life', 'http://www.chron.com/rss/feed/Life-297.php'),
('Science & Tech',
'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'),
]
remove_tags = [dict(name='div', attrs={'class':'socialBar'}),
dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}),
dict(name='div', attrs={'class':'entry-summary'}),
dict(name='a', attrs={'rel':'item-license'})]
baseUrl = 'http://www.chron.com'
oldest_web_article = 7.0
if oldest_web_article is None:
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
pages = [('news' , '/news/houston-texas/'),
('business' , '/business/'),
('opinion', '/opinion/'),
('sports', '/sports/')]
def getLinksFromSectionPage(self, sectionUrl):
pageDoc = html.parse(sectionUrl)
els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
or @class='scp-feature' or contains(@class, 'simplelist')
or contains(@class, 'scp-blogpromo')]
//a[@href and not(@target) and not(child::img)]""")
elList = []
for el in els:
link = el.get('href')
title = el.text
if link[:4] != 'http':
link = self.baseUrl + link
if title is not None:
elList.append((link, el.text))
return elList
def getArticleDescriptionFromDoc(self, pageDoc):
descriptionCharsBreak = 140
descriptionMaxChars = 300
descXpath = """//div[contains(@class, 'article-body') or
contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)")
def stringify_children(node):
return ''.join([x for x in node.itertext()])
try:
els = pageDoc.xpath(descXpath)
outText = ""
ellipsis = ""
for el in els:
sentences = re.findall(sentenceRegex, stringify_children(el))
for sentence in sentences:
if len(outText) < descriptionCharsBreak:
outText += sentence + " "
else:
if len(outText) > descriptionMaxChars:
ellipsis = "..."
return outText[:descriptionMaxChars] + ellipsis
return outText
except:
self.log('Error on Article Description')
return ""
def getPublishedTimeFromDoc(self, pageDoc):
regexDateOnly = re.compile(r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[0-9]{1,2},\s20[01][0-9]')
regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
def getRegularTimestamp(dateString):
try:
outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
return outDate
except:
return None
def getDateFromString(inText):
match = re.findall(regexDateOnly, inText)
if match:
try:
outDate = datetime.strptime(match[0], "%B %d, %Y")
match = re.findall(regextTimeOnly, inText)
if match:
outTime = datetime.strptime(match[0], "%I:%M %p")
return datetime.combine(outDate.date(), outTime.time())
return outDate
except:
return None
else:
return None
el = pageDoc.xpath("//*[@class='timestamp'][1]")
if len(el) == 1:
return getRegularTimestamp(el[0].get('title'))
else:
el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
if len(el) == 1:
return getDateFromString(el[0].text_content())
else:
return None
def getAllFeedDataFromPage(self, page):
articles = []
linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
self.log('from section: ', page[0], " found ", len(linkList), " links")
for link in linkList:
try:
articleDoc = html.parse(link[0])
description = self.getArticleDescriptionFromDoc(articleDoc)
articleDate = self.getPublishedTimeFromDoc(articleDoc)
if articleDate is not None and description is not None and articleDate.date() > self.earliest_date:
dateText = articleDate.strftime('%a, %d %b')
author = articleDate.strftime(self.timestampfmt)
articles.append({'title':link[1], 'url':link[0],
'description':description, 'date':dateText, 'author':author})
self.log(page[0] + ": " + link[1] + ', from ' + dateText +
" description of " + str(len(description)) + ' characters at ' + link[0])
else:
msg = ""
if articleDate is None:
msg = " No Timestamp Found"
else:
msg = " article older than " + str(self.oldest_web_article) + ' days...'
self.log("Skipping article: ", link[0], msg)
except:
print 'error on fetching ' + link[0]
continue
return articles
def parse_index(self):
self.timefmt = ' [%a, %d %b, %Y]'
self.log('starting parse_index: ', time.strftime(self.timestampfmt))
feeds = []
for page in self.pages:
articles = []
articles = self.getAllFeedDataFromPage(page)
if articles:
feeds.append((page[0], articles))
self.log('finished parse_index: ', time.strftime(self.timestampfmt))
return feeds
def preprocess_html(self, thisSoup):
baseTags = []
baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')}))
baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'}))
allTags = []
allTags.extend(baseTags)
if len(baseTags) > 0:
for tag in baseTags:
allTags.extend(tag.findAll(True))
paragraphs = thisSoup.findAll(name='p')
for paragraph in paragraphs:
if paragraph not in allTags:
allTags.append(paragraph)
for tag in baseTags:
while tag.parent is not None:
allTags.append(tag)
tag = tag.parent
for tag in thisSoup.findAll(True):
if tag not in allTags:
tag.extract()
return thisSoup
def populate_article_metadata(self, article, soup, first):
if not first:
return
try:
article.date = time.strptime(article.author, self.timestampfmt)
article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
article.localtime = article.utctime.astimezone(local_tz)
except Exception as inst: # remove after debug
self.log('Exception: ', article.title) # remove after debug
self.log(type(inst)) # remove after debug
self.log(inst) # remove after debug
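The recipe above has no natural slot for a full timestamp in the article dict, so parse_index() packs it into the 'author' field using timestampfmt and populate_article_metadata() unpacks it again once the article is fetched. A small self-contained sketch of that round trip (the sample title and URL are made up):

import time
from datetime import datetime

TIMESTAMP_FMT = '%Y%m%d%H%M%S'  # same format string as timestampfmt above

def pack_article(title, url, published):
    # Store the human-readable date for display and the full timestamp
    # in the 'author' slot so it survives the feed machinery unchanged.
    return {'title': title, 'url': url,
            'date': published.strftime('%a, %d %b'),
            'description': '', 'author': published.strftime(TIMESTAMP_FMT)}

def unpack_timestamp(article):
    # Mirror of populate_article_metadata(): recover the datetime later.
    parts = time.strptime(article['author'], TIMESTAMP_FMT)
    return datetime(*parts[:6])

art = pack_article('Example story', 'http://example.com/story',
                   datetime(2013, 10, 25, 14, 30))
print(art['date'], unpack_timestamp(art))  # Fri, 25 Oct 2013-10-25 14:30:00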

BIN recipes/icons/acrimed.png (new icon, 709 B; binary file not shown)
BIN binary image, 4.8 KiB (not shown)
BIN binary image, 592 B (not shown)
BIN binary image, 592 B (not shown)
BIN recipes/icons/lacapital.png (new icon, 1.1 KiB; binary file not shown)
BIN binary image, 446 B (not shown)
BIN binary image, 510 B (not shown)
BIN binary image, 1.1 KiB (not shown)

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class IC(BasicNewsRecipe):
title = u'il Cambiamento'
oldest_article = 12
max_articles_per_feed = 50
language = 'it'
__author__ = 'ghib9'
auto_cleanup = True
use_embedded_content = False
feeds = [(u'il Cambiamento', u'http://www.ilcambiamento.it/rss.xml')]

16
recipes/il_foglio.recipe Normal file
View File

@ -0,0 +1,16 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1373969939(BasicNewsRecipe):
title = u'Il Foglio - Editoriali'
oldest_article = 1
max_articles_per_feed = 10
auto_cleanup = False
keep_only_tags = [
dict(name='div', attrs={'class':'sec_item'})
]
feeds = [(u'Il Foglio - Editoriali', u'http://feed43.com/8814237344800115.xml')]
no_stylesheets = True
__author__ = 'faber1971'
description = 'Leading articles from an Italian newspaper - v1.00 (16 July, 2013)'
language = 'it'
masthead_url = 'http://www.ilfoglio.it/media/img/interface/logo_testata_small.gif'

View File

@ -1,504 +1,34 @@
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class TheIndependentNew(BasicNewsRecipe):
# flag to enable/disable article graphics on business pages/some others
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
# -max dimensions can be altered using the .pictureContainer img selector in the css
_FETCH_ARTICLE_GRAPHICS = True
#Flag to enable/disable image fetching (not business)
_FETCH_IMAGES = True
#Set max gallery images here (respects _FETCH_IMAGES)
# -1 for infinite
_MAX_GALLERY_IMAGES = -1
#used for converting rating to stars
# used for converting rating to stars
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
title = u'The Independent'
__author__ = 'Will'
__author__ = 'Krittika Goyal'
description = 'The latest in UK News and World News from The \
Independent. Wide range of international and local news, sports \
news, commentary and opinion pieces.Independent News - Breaking news \
that matters. Your daily comprehensive news source - The \
Independent Newspaper'
publisher = 'The Independent'
oldest_article = 2.0
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
category = 'news, UK'
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
auto_cleanup = True
language = 'en_GB'
publication_type = 'newspaper'
masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
encoding = 'utf-8'
compress_news_images = True
remove_tags =[
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(name='img',attrs={'alt' : ['view gallery']}),
dict(attrs={'style' : re.compile('.*')}),
dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
]
keep_only_tags =[dict(attrs={'id':['main','top']})]
recursions = 0
# fixes non compliant html nesting and 'marks' article graphics links
preprocess_regexps = [
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
extra_css = """
h1{font-family: Georgia,serif ; font-size: x-large; }
body{font-family: Verdana,Arial,Helvetica,sans-serif}
img{margin-bottom: 0.4em; display:block}
.starRating img {float: left}
.starRating {margin-top:0.4em; display: block}
.image {clear:left; font-size: x-small; color:#888888;}
.articleByTimeLocation {font-size: x-small; color:#888888;
margin-bottom:0.2em ; margin-top:0.2em ; display:block}
.subtitle {clear:left ;}
.column-1 h1 { color: #191919}
.column-1 h2 { color: #333333}
.column-1 h3 { color: #444444}
.subtitle { color: #777777; font-size: medium;}
.column-1 a,h1,h2,h3 { margin: 0; }
.column-1 div{margin: 0;}
.articleContent {display: block; clear:left;}
.articleContent {color: #000000; font-size: medium;}
.ivDrip-section {color: #000000; font-size: medium;}
.datetime {color: #888888}
.title {font-weight:bold;}
.storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;}
.image img { max-width: 400px; max-height: 400px;}
"""
oldest_article = 1
max_articles_per_feed = 100
_processed_urls = []
def get_article_url(self, article):
url = super(self.__class__,self).get_article_url(article)
title = article.get('title', None)
if title and re.search("^Video:",title):
return None
#remove duplicates
if not (url in self._processed_urls):
self._processed_urls.append(url)
else:
url = None
return url
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def preprocess_html(self, soup):
#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None
# remove Suggested Topics
items_to_extract = []
for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}):
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
slideshow_elements = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
pattern = re.compile('((articleContent)|(title))$')
if (pattern.search(item['class'])) is not None:
remove = False
# corrections
# story content always good
pattern = re.compile('storyContent')
if (pattern.search(item['class'])) is not None:
remove = False
#images
pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True
#social widgets always bad
pattern = re.compile('socialwidget')
if (pattern.search(item['class'])) is not None:
remove = True
if remove:
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
if self._FETCH_IMAGES:
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
images = []
img = item.findNext('img')
if not '?action=gallery' in item['href']:
img['src'] = item['href']
tag = Tag(soup,'h3')
text = ''
try:
text = img['data-title']
except:
pass
if img.get('title') and (len(img['title']) > 1):
text = NavigableString(img['title'])
tag.insert(0,text)
images.append((img, tag))
else:
gallery_images, remove_link = self._get_gallery_images(item['href'])
images = images + gallery_images
if remove_link:
gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
if gal_link:
gal_link.extract()
img.extract()
for (img, title) in images:
#insert caption if available
if title:
#picture before text
img.extract()
item.insert(0,img)
item.insert(1,title)
# remove link
item.name = "div"
item["class"]='image'
del item["href"]
#remove empty subtitles
"""
currently the subtitle is located in first paragraph after
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
some point.
"""
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None:
subtitleText = subtitle.findNext('p')
if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract()
subtitle.extract()
#replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
if item is not None:
soup2 = self._insertRatingStars(soup,item)
if soup2 is not None:
soup = soup2
#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
for item in items_to_insert:
item[0].replaceWith(item[1])
for item in items_to_extract:
item.extract()
return soup
def _get_article_graphic(self,old_item,url,soup):
items_to_insert = []
if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert
soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert
def _insertRatingStars(self,soup,item):
if item.contents is None or len(item.contents) < 1:
return
rating = item.contents[0]
try:
rating = float(item.contents[0])
except:
print 'Could not convert decimal rating to star: malformatted float.'
return
for i in range(1,6):
star = Tag(soup,'img')
if i <= rating:
star['src'] = self._STAR_URL
else:
star['src'] = self._NO_STAR_URL
star['alt'] = 'star number ' + str(i)
item.insert(i,star)
#item.contents[0] = NavigableString('(' + str(rating) + ')')
item.contents[0] = ''
def postprocess_html(self,soup, first_fetch):
#mark subtitle parent as non-compliant nesting causes
# p's to be 'popped out' of the h3 tag they are nested in.
subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
subtitle_div = None
if subtitle:
subtitle_div = subtitle.parent
if subtitle_div:
clazz = ''
if 'class' in subtitle_div:
clazz = subtitle_div['class'] + ' '
clazz = clazz + 'subtitle'
subtitle_div['class'] = clazz
#find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3')
if caption is not None:
items_to_extract.append(caption)
items_to_extract.append(img)
for item in items_to_extract:
item.extract()
# nickredding's fix for non-justified text
for ptag in soup.findAll('p',attrs={'align':'left'}):
del(ptag['align'])
return soup
def _get_gallery_images(self,url):
gallery_soup = self.index_to_soup(url)
images = []
remove_link = True
total = 1
try:
counter = gallery_soup.find('div',attrs={'id' : ['counter']})
total = counter.contents[0].split('/')
total = int(total[1].rstrip())
except:
total = 1
if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
total = self._MAX_GALLERY_IMAGES
remove_link = False
for i in range(1, total +1):
image, title = self._get_image_from_gallery(gallery_soup)
if image:
images.append((image,title))
next = url + '&ino=' + str(i + 1)
gallery_soup = self.index_to_soup(next)
images.reverse()
return images, remove_link
def _get_image_from_gallery(self,soup):
try:
container = soup.find('div',attrs={'id' : ['main-image']})
image = container.find('img')
if image:
title = soup.find('div',attrs={'id' : ['image-title']})
return image, title
except:
print 'error fetching gallery image'
return None
def _recurisvely_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
return linearised
def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]
def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None
def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False
def _remove_undesired_line_breaks_from_tag(self,parent,soup):
if parent is None:
return
tag_tree = self._recurisvely_linearise_tag_tree(parent)
items_to_remove = []
for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue;
for item in items_to_remove:
tag_tree.remove(item)
spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)
if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False
next_tag = self._get_next_tag(i, tag_tree)
if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False
if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))
for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
feeds = [
(u'News - UK',
@ -610,3 +140,4 @@ class TheIndependentNew(BasicNewsRecipe):
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
]

View File

@ -33,8 +33,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
(u'Instapaper Starred', u'http://www.instapaper.com/starred')
]
#Adds the title tag to the body of the recipe. Use this if your articles miss headings.
add_title_tag = False;
# Adds the title tag to the body of the recipe. Use this if your articles miss headings.
add_title_tag = False
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
br.select_form(nr=0)
br['username'] = self.username
if self.password is not None:
br['password'] = self.password
br['password'] = self.password
br.submit()
return br
@ -55,7 +55,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'cornerControls'}):
for item in soup.findAll('div', attrs={'class':'title_row'}):
#description = self.tag_to_string(item.div)
atag = item.a
if atag and atag.has_key('href'):
@ -73,10 +73,10 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
article.title = soup.find('title').contents[0].strip()
def postprocess_html(self, soup, first_fetch):
#adds the title to each story, as it is not always included
# adds the title to each story, as it is not always included
if self.add_title_tag:
for link_tag in soup.findAll(attrs={"id" : "story"}):
link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
#print repr(soup)
# print repr(soup)
return soup
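The recipe above logs in by filling the first form on the Instapaper login page and then scrapes the saved-article rows itself instead of relying on an RSS feed. A condensed sketch of that flow, assuming the mechanize-style browser returned by BasicNewsRecipe.get_browser(); the login URL is an assumption, and links on the listing page may be relative and need the site prefix:

from calibre.web.feeds.news import BasicNewsRecipe

class SavedLinksRecipe(BasicNewsRecipe):
    title = 'Saved links (sketch)'
    needs_subscription = True
    LOGIN = 'http://www.instapaper.com/user/login'  # assumed login URL
    LIST = 'http://www.instapaper.com/u'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)          # first form on the page
            br['username'] = self.username
            if self.password is not None:
                br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        soup = self.index_to_soup(self.LIST)
        articles = []
        for row in soup.findAll('div', attrs={'class': 'title_row'}):
            a = row.find('a', href=True)
            if a is not None:
                articles.append({'title': self.tag_to_string(a),
                                 'url': a['href'],  # may need the site prefix
                                 'date': '', 'description': ''})
        return [('Instapaper', articles)]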

View File

@ -1,5 +1,4 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2011-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.iprofesional.com
'''
@ -19,13 +18,15 @@ class iProfesional(BasicNewsRecipe):
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'nesportal'
masthead_url = 'http://www.iprofesional.com/img/logo-iprofesional.png'
publication_type = 'newsportal'
masthead_url = 'http://www.iprofesional.com/img/header/logoiprofesional.png'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
body{font-family: 'Droid Sans',Arial,sans-serif }
img{margin-bottom: 0.4em; display:block}
.titulo-interior{font-family: Georgia,"Times New Roman",Times,serif}
.autor-nota{font-size: small; font-weight: bold; font-style: italic; color: gray}
.titulo{font-family: WhitneyBoldWhitneyBold,Arial,Helvetica,sans-serif; color: blue}
.fecha-archivo{font-weight: bold; color: rgb(205, 150, 24)}
.description{font-weight: bold; color: gray }
.firma{font-size: small}
"""
conversion_options = {
@ -35,27 +36,21 @@ class iProfesional(BasicNewsRecipe):
, 'language' : language
}
keep_only_tags = [dict(attrs={'class':['fecha','interior-nota']})]
remove_tags = [
dict(name=['meta','link','base','embed','object','iframe'])
,dict(attrs={'class':['menu-imprimir','guardarNota','IN-widget','fin','permalink']})
]
remove_attributes=['lang','xmlns:og','xmlns:fb']
keep_only_tags = [dict(attrs={'class':'desarrollo'})]
remove_tags = [dict(name=['meta','link','base','embed','object','iframe'])]
feeds = [
(u'Ultimas noticias' , u'http://feeds.feedburner.com/iprofesional-principales-noticias')
,(u'Finanzas' , u'http://feeds.feedburner.com/iprofesional-finanzas' )
,(u'Impuestos' , u'http://feeds.feedburner.com/iprofesional-impuestos' )
,(u'Negocios' , u'http://feeds.feedburner.com/iprofesional-economia' )
,(u'Comercio Exterior' , u'http://feeds.feedburner.com/iprofesional-comercio-exterior' )
,(u'Tecnologia' , u'http://feeds.feedburner.com/iprofesional-tecnologia' )
,(u'Management' , u'http://feeds.feedburner.com/iprofesional-managment' )
,(u'Marketing' , u'http://feeds.feedburner.com/iprofesional-marketing' )
,(u'Legales' , u'http://feeds.feedburner.com/iprofesional-legales' )
,(u'Autos' , u'http://feeds.feedburner.com/iprofesional-autos' )
,(u'Vinos' , u'http://feeds.feedburner.com/iprofesional-vinos-bodegas' )
,(u'Finanzas' , u'http://feeds.feedburner.com/iprofesional-finanzas')
,(u'Impuestos' , u'http://feeds.feedburner.com/iprofesional-impuestos')
,(u'Negocios' , u'http://feeds.feedburner.com/iprofesional-economia')
,(u'Comercio Exterior' , u'http://feeds.feedburner.com/iprofesional-comercio-exterior')
,(u'Tecnologia' , u'http://feeds.feedburner.com/iprofesional-tecnologia')
,(u'Management' , u'http://feeds.feedburner.com/iprofesional-managment')
,(u'Marketing' , u'http://feeds.feedburner.com/iprofesional-marketing')
,(u'Legales' , u'http://feeds.feedburner.com/iprofesional-legales')
,(u'Autos' , u'http://feeds.feedburner.com/iprofesional-autos')
,(u'Vinos' , u'http://feeds.feedburner.com/iprofesional-vinos-bodegas')
]
def preprocess_html(self, soup):
@ -64,16 +59,17 @@ class iProfesional(BasicNewsRecipe):
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
if 'alt' not in item:
item['alt'] = 'image'
return soup

View File

@ -6,29 +6,23 @@ class JakartaGlobe(BasicNewsRecipe):
max_articles_per_feed = 100
feeds = [
(u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
(u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
(u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
(u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
(u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
(u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
]
(u'News', u'http://www.thejakartaglobe.com/news/feed/'),
(u'Business', u'http://www.thejakartaglobe.com/business/feed/'),
(u'Opinion', u'http://www.thejakartaglobe.com/opinion/feed/'),
(u'Count me in', u'http://www.thejakartaglobe.com/count-me-in/feed/'),
(u'International', u'http://www.thejakartaglobe.com/international/feed/'),
(u'Sports', u'http://www.thejakartaglobe.com/sports/feed/'),
]
__author__ = 'rty'
publisher = 'JakartaGlobe.com'
description = 'JakartaGlobe, Indonesia, Newspaper'
category = 'News, Indonesia'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
language = 'en_ID'
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
keep_only_tags = [
dict(name='div', attrs={'class':'story'}),
dict(name='span', attrs={'class':'headline'}),
dict(name='div', attrs={'class':'story'}),
dict(name='p', attrs={'id':'bodytext'})
]

View File

@ -27,12 +27,11 @@ class JakartaPost(BasicNewsRecipe):
use_embedded_content = False
no_javascript = True
remove_empty_feeds = True
auto_cleanup = True
timefmt = ' [%A, %d %B, %Y]'
encoding = 'utf-8'
keep_only_tags = [dict(name='div', attrs ={'id':'news-main'})]
extra_css = '''
h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
.cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
@ -51,10 +50,6 @@ class JakartaPost(BasicNewsRecipe):
body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
'''
remove_tags = [
dict(name='div', attrs ={'class':['text-size']}),
]
feeds = [
(u'Breaking News', u'http://www.thejakartapost.com/breaking/feed'),

47
recipes/jeuxvideo.recipe Normal file
View File

@ -0,0 +1,47 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
JeuxVideo.com
'''
__author__ = '2013, Malah <malah at neuf dot fr>'
from calibre.web.feeds.news import BasicNewsRecipe
class JeuxVideoCom(BasicNewsRecipe):
title = 'JeuxVideo.com'
__author__ = 'Malah'
description = 'La Référence des Jeux Vidéo sur PC et Consoles !'
oldest_article = 1.5
language = 'fr'
max_articles_per_feed = 100
remove_empty_feeds = True
use_embedded_content = False
no_stylesheets = True
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = [dict(id=['news_detail','test_txt','test_avis'])]
remove_tags = [
dict(name='div', attrs={'id':'player_video_article'}),
dict(name='div', attrs={'class':'liste-fiches'})
]
masthead_url = u'https://upload.wikimedia.org/wikipedia/commons/3/39/Jeuxvideocom.png'
feeds = [
(u'Section PC',u'http://www.jeuxvideo.com/rss/rss-pc.xml'),
(u'Section Xbox One',u'http://www.jeuxvideo.com/rss/rss-xo.xml'),
(u'Section PlayStation 4',u'http://www.jeuxvideo.com/rss/rss-ps4.xml'),
(u'Section Xbox 360',u'http://www.jeuxvideo.com/rss/rss-360.xml'),
(u'Section PlayStation 3',u'http://www.jeuxvideo.com/rss/rss-ps3.xml'),
(u'Section Wii U',u'http://www.jeuxvideo.com/rss/rss-wiiu.xml'),
(u'Section Wii',u'http://www.jeuxvideo.com/rss/rss-wii.xml'),
(u'Section Nintendo 3DS',u'http://www.jeuxvideo.com/rss/rss-3ds.xml'),
(u'Section Nintendo DS',u'http://www.jeuxvideo.com/rss/rss-ds.xml'),
(u'Section PlayStation Vita',u'http://www.jeuxvideo.com/rss/rss-vita.xml'),
(u'Section PlayStation Portable',u'http://www.jeuxvideo.com/rss/rss-psp.xml'),
(u'Section Android',u'http://www.jeuxvideo.com/rss/rss-android.xml'),
(u'Section Iphone',u'http://www.jeuxvideo.com/rss/rss-iphone.xml'),
(u'Section Web',u'http://www.jeuxvideo.com/rss/rss-wb.xml'),
(u'Autres news', u'http://www.jeuxvideo.com/rss/rss-news.xml'),
(u'Autres vidéos', u'http://www.jeuxvideo.com/rss/rss-videos.xml'),
(u'Autres articles', u'http://www.jeuxvideo.com/rss/rss.xml'),
]

69
recipes/jot_down.recipe Normal file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '23 June 2013, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Contemporary Culture Magazine'
__version__ = 'v0.01'
__date__ = '23, June 2013'
'''
http://www.jotdown.es/
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class jotdown(BasicNewsRecipe):
author = 'desUBIKado'
description = 'Revista digital con magníficos y extensos artículos'
title = u'Jot Down - Contemporary Culture Magazine'
publisher = 'Wabi Sabi Investments, S.C.'
category = 'Opinion, culture, science, movies, TV shows, music, blogs'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 7
delay = 1
max_articles_per_feed = 20
masthead_url = 'http://www.jotdown.es/wp-content/uploads/2011/04/logoJotDown.png'
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
feeds = [
(u'Portada', u'http://www.jotdown.es/feed/')
]
keep_only_tags = [dict(name='div', attrs={'class':['single']}),
dict(name='div', attrs={'id':['comments']}),
]
remove_tags = [dict(name='a', attrs={'href':['http://alternativaseconomicas.coop/']}),
dict(name='div', attrs={'class':['reply','after-meta','comment-author vcard']}),
dict(name='div', attrs={'align':['center']}),
dict(name='span', attrs={'class':['fbreplace']}),
dict(name='div', attrs={'id':'respond'})
]
remove_tags_after = dict(name='div' , attrs={'id':'respond'})
extra_css = '''
.comment-list {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;}
'''
preprocess_regexps = [
# To present the image of the embedded video
(re.compile(r'<object type="application/x-shockwave-flash" data="http://www.youtube.com/v',
re.DOTALL|re.IGNORECASE), lambda match: '<img src="http://img.youtube.com/vi'),
(re.compile(r'&rel=0&fs=1"', re.DOTALL|re.IGNORECASE), lambda match: '/0.jpg"><object'),
# To remove the link of the category
(re.compile(r'<div class="meta">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="meta"><!-- '),
(re.compile(r'</a>, <a href="http://www.jotdown.es/category', re.DOTALL|re.IGNORECASE), lambda match: ', <!--'),
(re.compile(r'"category tag">', re.DOTALL|re.IGNORECASE), lambda match: '--> '),
(re.compile(r'</a> &mdash;', re.DOTALL|re.IGNORECASE), lambda match: ''),
# To remove the link of the title
(re.compile(r'<h1> <a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1> <div class="'),
(re.compile(r'</a> </h1>', re.DOTALL|re.IGNORECASE), lambda match: '</div> </h1>')
]

View File

@ -20,7 +20,7 @@ class crnews(BasicNewsRecipe):
no_stylesheets = True
feeds = [(u'Portada', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=portada'), (u'Ultima Hora', u'http://www.nacion.com/Generales/RSS/UltimaHoraRss.aspx'), (u'Nacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=elpais'), (u'Entretenimiento', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=entretenimiento'), (u'Sucesos', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=sucesos'), (u'Deportes', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=deportes'), (u'Internacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=mundo'), (u'Economia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=economia'), (u'Aldea Global', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=aldeaglobal'), (u'Tecnologia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=tecnologia'), (u'Opinion', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=opinion')]
feeds = [(u'Portada', u'http://www.nacion.com/rss/'), (u'Ultima Hora', u'http://www.nacion.com/rss/latest/'), (u'Nacionales', u'http://www.nacion.com/rss/nacional/'), (u'Entretenimiento', u'http://www.nacion.com/rss/ocio/'), (u'Sucesos', u'http://www.nacion.com/rss/sucesos/'), (u'Deportes', u'http://www.nacion.com/rss/deportes/'), (u'Internacionales', u'http://www.nacion.com/rss/mundo/'), (u'Economia', u'http://www.nacion.com/rss/economia/'), (u'Vivir', u'http://www.nacion.com/rss/vivir/'), (u'Tecnologia', u'http://www.nacion.com/rss/tecnologia/'), (u'Opinion', u'http://www.nacion.com/rss/opinion/')]
def get_cover_url(self):
index = 'http://kiosko.net/cr/np/cr_nacion.html'

76
recipes/lacapital.recipe Normal file
View File

@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.lacapital.com.ar
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LaCapital(BasicNewsRecipe):
title = 'La Capital de Rosario'
__author__ = 'Darko Miletic'
description = 'Noticias, actualidad y toda la informacion de Rosario y la region'
publisher = 'Diario La Capital S. A.'
category = 'news, politics, Rosario, Santa Fe, Argentina'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_AR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.lacapital.com.ar/system/modules/com.tfsla.diario.core/resources/images/logoLaCapital_noCom.png'
extra_css = """
body{font-family: Georgia,"Times New Roman",Times,serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags=[dict(attrs={'class':'leer'})]
remove_tags_after=dict(attrs={'class':'notaA'})
remove_tags = [
dict(name=['meta','link','iframe','object'])
,dict(name='div', attrs={'class':['herramientas','almargen','relacionadas']})
]
feeds = [
(u'Portada' , u'http://www.lacapital.com.ar/rss/home.xml' )
,(u'La Ciudad' , u'http://www.lacapital.com.ar/rss/laciudad.xml' )
,(u'Politica' , u'http://www.lacapital.com.ar/rss/politica.xml' )
,(u'Economia' , u'http://www.lacapital.com.ar/rss/economia.xml' )
,(u'La Region' , u'http://www.lacapital.com.ar/rss/laregion.xml' )
,(u'Informacion General' , u'http://www.lacapital.com.ar/rss/informaciongral.xml' )
,(u'El Mundo' , u'http://www.lacapital.com.ar/rss/elmundo.xml' )
,(u'Opinion' , u'http://www.lacapital.com.ar/rss/opinion.xml' )
,(u'Cartas de lectores' , u'http://www.lacapital.com.ar/rss/cartasdelectores.xml')
,(u'Escenario' , u'http://www.lacapital.com.ar/rss/escenario.xml' )
,(u'Policiales' , u'http://www.lacapital.com.ar/rss/policiales.xml' )
,(u'Ovacion' , u'http://www.lacapital.com.ar/rss/ovacion.xml' )
,(u'Turismo' , u'http://www.lacapital.com.ar/rss/turismo.xml' )
,(u'Economia' , u'http://www.lacapital.com.ar/rss/economia.xml' )
,(u'Señales' , u'http://www.lacapital.com.ar/rss/senales.xml' )
,(u'Educacion' , u'http://www.lacapital.com.ar/rss/educacion.xml' )
,(u'Estilo' , u'http://www.lacapital.com.ar/rss/estilo.xml' )
,(u'Salud' , u'http://www.lacapital.com.ar/rss/salud.xml' )
,(u'Tecnologia' , u'http://www.lacapital.com.ar/rss/tecnologia.xml' )
]
def get_cover_url(self):
soup = self.index_to_soup('http://www.lacapital.com.ar/impresa/tapa.html')
for image in soup.findAll('img',alt=True):
if image['alt'].startswith('Tapa de papel'):
return image['src']
return None
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -13,6 +13,8 @@ class LamebookRecipe(BasicNewsRecipe):
language = 'en'
use_embedded_content = False
publication_type = 'blog'
reverse_article_order = True
encoding = 'utf-8'
keep_only_tags = [
dict(name='div', attrs={'class':'entry'})

34
recipes/le_gorafi.recipe Normal file
View File

@ -0,0 +1,34 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
Le GORAFI.fr
'''
__author__ = '2013, Malah <malah at neuf dot fr>'
from calibre.web.feeds.news import BasicNewsRecipe
class legorafi(BasicNewsRecipe):
title = u'Le GORAFI.fr'
__author__ = 'Malah'
description = u'Depuis 1826, toute l\'information de sources contradictoires'
oldest_article = 7
language = 'fr'
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
keep_only_tags = [
dict(name='div', attrs={'class':'entry-content'}),
dict(name='h3', attrs={'id':'comments-title'}),
]
remove_tags = [
dict(name='div', attrs={'id':'soshake-sharebox'}),
dict(name='div', attrs={'class':'social-ring'}),
dict(name='div', attrs={'class':'entry-utility'}),
dict(name='div', attrs={'id':'respond'}),
]
masthead_url = u'http://web.gweno.free.fr/img/logositeter.png'
couverture_url = u'http://www.legorafi.fr/wp-content/uploads/2013/02/iconegorafi.png'
feeds = [
(u'Articles', u'http://www.legorafi.fr/feed/'),
]

View File

@ -0,0 +1,111 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013'
'''
monde-diplomatique.fr
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
class LeMondeDiplomatiqueSiteWeb(BasicNewsRecipe):
title = u'Le Monde diplomatique.fr'
__author__ = 'Gaëtan Lehmann'
description = "Le Monde diplomatique est un mensuel français dinformation et dopinion à la ligne éditoriale nettement engagée en faveur d'une gauche de rupture avec le capitalisme. Il aborde de nombreux sujets — géopolitique, relations internationales, économie, questions sociales, écologie, culture, médias, …" # noqa
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
publisher = 'monde-diplomatique.fr'
category = 'news, France, world'
language = 'fr'
masthead_url = 'http://www.monde-diplomatique.fr/squelettes/images/logotyfa.png'
timefmt = ' [%d %b %Y]'
no_stylesheets = True
feeds = [(u'Blogs', u'http://blog.mondediplo.net/spip.php?page=backend'), (u'Archives', u'http://www.monde-diplomatique.fr/rss/')]
preprocess_regexps = [
(re.compile(r'<title>(.*) - Les blogs du Diplo</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
(re.compile(r'<h2>(.*) - Les blogs du Diplo</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
(re.compile(r'<title>(.*) \(Le Monde diplomatique\)</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
(re.compile(r'<h2>(.*) \(Le Monde diplomatique\)</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
(re.compile(r'<h3>Grand format</h3>'), lambda m: '')]
remove_tags = [dict(name='div', attrs={'class':'voiraussi liste'}),
dict(name='ul', attrs={'class':'hermetique carto hombre_demi_inverse'}),
dict(name='a', attrs={'class':'tousles'}),
dict(name='h3', attrs={'class':'cat'}),
dict(name='div', attrs={'class':'logodiplo'}),
dict(name='img', attrs={'class':'spip_logos'}),
dict(name='p', attrs={'id':'hierarchie'}),
dict(name='div', attrs={'class':'espace'})]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
remove_empty_feeds = True
filterDuplicates = True
# don't use parse_index - we need it to send an exception so we can mix
# feed and parse_index results in parse_feeds
def parse_index_valise(self):
articles = []
soup = self.index_to_soup('http://www.monde-diplomatique.fr/carnet/')
cnt = soup.find('ul',attrs={'class':'hermetique liste'})
for item in cnt.findAll('li'):
description = ''
feed_link = item.find('a')
desc = item.find('div',attrs={'class':'intro'})
date = item.find('div',attrs={'class':'dates_auteurs'})
if desc:
description = desc.string
if feed_link and feed_link.has_key('href'):
url = 'http://www.monde-diplomatique.fr' + feed_link['href']
title = self.tag_to_string(feed_link)
articles.append({
'title' :title
,'date' :date.string.strip()
,'url' :url
,'description':description
})
return [("La valise diplomatique", articles)]
def parse_index_cartes(self):
articles = []
soup = self.index_to_soup('http://www.monde-diplomatique.fr/cartes/')
cnt = soup.find('div',attrs={'class':'decale hermetique'})
for item in cnt.findAll('div',attrs={'class':re.compile('grid_3 filet hombre_demi')}):
feed_link = item.find('a',attrs={'class':'couve'})
h3 = item.find('h3')
authorAndDate = item.find('div',attrs={'class':'dates_auteurs'})
author, date = authorAndDate.string.strip().split(', ')
if feed_link and feed_link.has_key('href'):
url = 'http://www.monde-diplomatique.fr' + feed_link['href']
title = self.tag_to_string(h3)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description': author
})
return [("Cartes", articles)]
def parse_feeds(self):
feeds = BasicNewsRecipe.parse_feeds(self)
valise = feeds_from_index(self.parse_index_valise(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
cartes = feeds_from_index(self.parse_index_cartes(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
feeds = valise + feeds + cartes
return feeds
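parse_index_valise() and parse_index_cartes() above return ordinary (section, articles) lists, and parse_feeds() merges them with the regular RSS feeds through feeds_from_index(). A condensed sketch of that mixing pattern; the URLs and selectors are placeholders, and the feeds_from_index() keyword arguments simply mirror the call shown above rather than a documented stable API:

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe

class MixedSourcesRecipe(BasicNewsRecipe):
    title = 'Mixed sources (sketch)'
    feeds = [('Blog', 'http://example.com/rss')]  # ordinary RSS section

    def scrape_extra_section(self):
        # Hand-built (section, articles) pairs for a page that has no feed.
        soup = self.index_to_soup('http://example.com/archive/')
        articles = []
        for a in soup.findAll('a', href=True)[:10]:
            articles.append({'title': self.tag_to_string(a), 'url': a['href'],
                             'date': '', 'description': ''})
        return [('Archive', articles)]

    def parse_feeds(self):
        # Let BasicNewsRecipe fetch the RSS feeds, then graft the scraped
        # sections onto the result.
        parsed = BasicNewsRecipe.parse_feeds(self)
        extra = feeds_from_index(self.scrape_extra_section(),
                                 oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        return parsed + extra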

View File

@ -2,7 +2,7 @@
__author__ = 'Sylvain Durand <sylvain.durand@ponts.org>'
__license__ = 'GPL v3'
import time
import time, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
@ -13,7 +13,7 @@ class LeMonde(BasicNewsRecipe):
title = u'Le Monde: Édition abonnés'
__author__ = 'Sylvain Durand'
description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
language = 'fr'
encoding = 'utf8'
@ -65,26 +65,38 @@ class LeMonde(BasicNewsRecipe):
url = time.strftime(self.journal_url,self.date)
soup = self.index_to_soup(url).sommaire
sections = []
for sec in soup.findAll("section"):
articles = []
if sec['cahier'] != "Le Monde":
for col in sec.findAll("fnts"):
col.extract()
if sec['cahier']=="Le Monde Magazine":
continue
for art in sec.findAll("art"):
if art.txt.string and art.ttr.string:
if art.find(['url']):
art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>')
if art.find(['lgd']) and art.find(['lgd']).string:
art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>')
article = "<html><head></head><body>"+unicode(art)+"</body></html>"
article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ')
article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>')
f = PersistentTemporaryFile()
f.write(article)
articles.append({'title':art.ttr.string,'url':"file:///"+f.name})
sections.append((sec['nom'], articles))
try:
for sec in soup.findAll("section"):
articles = []
if sec['cahier'] != "Le Monde":
for col in sec.findAll("fnts"):
col.extract()
if sec['cahier']=="Le Monde Magazine":
continue
for art in sec.findAll("art"):
if art.txt.string and art.ttr.string:
if art.find(['url']):
art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>')
if art.find(['lgd']) and art.find(['lgd']).string:
art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>')
def guillemets(match):
if match.group(1) == u"=":
return match.group(0)
return u'%s«&nbsp;%s&nbsp;»' % (match.group(1), match.group(2))
article = "<html><head></head><body>"+unicode(art)+"</body></html>"
article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ')
article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>')
article = article.replace("'" , u'\u2019')
article = re.sub('(.|^)"([^"]+)"', guillemets, article)
f = PersistentTemporaryFile()
f.write(article)
articles.append({'title':art.ttr.string,'url':"file:///"+f.name})
sections.append((sec['nom'], articles))
except AttributeError:
self.log("Vos identifiants sont incorrects, ou votre abonnement LeMonde.fr ne vous permet pas de télécharger le journal.")
return sections
def preprocess_html(self, soup):
@ -92,3 +104,4 @@ class LeMonde(BasicNewsRecipe):
lgd.contents[-1].extract()
return soup
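The guillemets() helper added above turns straight double quotes into French « » quotes with non-breaking spaces while leaving HTML attribute values (a quote preceded by '=') untouched. A standalone sketch of the same substitution; the recipe emits the &nbsp; entity because it is rebuilding HTML, whereas the sketch uses the U+00A0 character directly:

# -*- coding: utf-8 -*-
import re

def frenchify_quotes(text):
    def guillemets(match):
        if match.group(1) == u'=':  # attribute value: leave it alone
            return match.group(0)
        return u'%s\u00ab\u00a0%s\u00a0\u00bb' % (match.group(1), match.group(2))
    text = text.replace(u"'", u'\u2019')           # typographic apostrophe
    return re.sub(u'(.|^)"([^"]+)"', guillemets, text)

print(frenchify_quotes(u'Il a dit "bonjour" dans <a href="x">ce lien</a>'))
# -> Il a dit « bonjour » dans <a href="x">ce lien</a> (with no-break spaces)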

View File

@ -0,0 +1,49 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
Le Nouvel Observateur
'''
__author__ = '2013, Malah <malah at neuf dot fr>'
from calibre.web.feeds.news import BasicNewsRecipe
class LeNouvelObs(BasicNewsRecipe):
title = u'Le Nouvel Observateur'
__author__ = 'Malah'
description = u'Actualités en temps réel, Info à la Une'
oldest_article = 1
language = 'fr'
max_articles_per_feed = 25
use_embedded_content = False
ignore_duplicate_articles = ('title', 'url')
remove_empty_feeds = True
no_stylesheets = True
masthead_url = u'https://upload.wikimedia.org/wikipedia/fr/f/f9/Le_Nouvel_observateur.png'
feeds = [
(u'Politique', u'http://tempsreel.nouvelobs.com/politique/rss.xml'),
(u'Société', u'http://tempsreel.nouvelobs.com/societe/rss.xml'),
(u'Monde', u'http://tempsreel.nouvelobs.com/monde/rss.xml'),
(u'Economie', u'http://tempsreel.nouvelobs.com/economie/rss.xml'),
(u'Culture', u'http://tempsreel.nouvelobs.com/culture/rss.xml'),
(u'High Tech', u'http://obsession.nouvelobs.com/high-tech/rss.xml'),
(u'Education', u'http://tempsreel.nouvelobs.com/education/rss.xml'),
(u'Services', u'http://tempsreel.nouvelobs.com/services/rss.xml'),
(u'Sport', u'http://tempsreel.nouvelobs.com/sport/rss.xml'),
(u'CinéObs', u'http://cinema.nouvelobs.com/articles.rss'),
(u'TéléObs', u'http://teleobs.nouvelobs.com/rss.xml'),
(u'Autres Actualités',u'http://tempsreel.nouvelobs.com/rss.xml'),
]
keep_only_tags = [
dict(name='h1', attrs={'id':'obs-article-title'}),
dict(name='div', attrs={'class':'obs-date'}),
dict(name='div', attrs={'class':'art-auteur'}),
dict(name='h2', attrs={'class':'obs-article-intro'}),
dict(name='div', attrs={'id':'obs-article-keywords'}),
dict(name='div', attrs={'id':'obs-article-mainpic'}),
dict(name='div', attrs={'itemprop':'articleBody'}),
dict(name='img', attrs={'id':'ObsImg'}),
dict(name='p', attrs={'class':'date-media'}),
dict(name='p', attrs={'id':'ObsDesc'}),
]

View File

@ -21,42 +21,10 @@ class Liberation(BasicNewsRecipe):
max_articles_per_feed = 15
no_stylesheets = True
remove_empty_feeds = True
filterDuplicates = True
needs_subscription = 'optional'
extra_css = '''
h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
keep_only_tags = [
dict(name='div', attrs={'class':'article'})
,dict(name='div', attrs={'class':'text-article m-bot-s1'})
,dict(name='div', attrs={'class':'entry'})
,dict(name='div', attrs={'class':'col_contenu'})
]
remove_tags_after = [
dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
,dict(name='p',attrs={'class':['chapo']})
,dict(id='_twitter_facebook')
]
remove_tags = [
dict(name='iframe')
,dict(name='a', attrs={'class':'lnk-comments'})
,dict(name='div', attrs={'class':'toolbox'})
,dict(name='ul', attrs={'class':'share-box'})
,dict(name='ul', attrs={'class':'tool-box'})
,dict(name='ul', attrs={'class':'rub'})
,dict(name='p',attrs={'class':['chapo']})
,dict(name='p',attrs={'class':['tag']})
,dict(name='div',attrs={'class':['blokLies']})
,dict(name='div',attrs={'class':['alire']})
,dict(id='_twitter_facebook')
]
keep_only_tags = [dict(name='article')]
remove_tags = [dict(attrs={'class':['tool-bar']})]
feeds = [
(u'La une', u'http://rss.liberation.fr/rss/9/')
@ -69,6 +37,16 @@ class Liberation(BasicNewsRecipe):
,(u'Sports', u'http://www.liberation.fr/rss/12/')
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://token.liberation.fr/accounts/login/')
br.select_form(nr=0)
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
def get_masthead_url(self):
masthead = 'http://s0.libe.com/libe/img/common/logo-liberation-150.png'
br = BasicNewsRecipe.get_browser(self)
@ -78,3 +56,15 @@ class Liberation(BasicNewsRecipe):
self.log("\nCover unavailable")
masthead = None
return masthead
def get_article_url(self, article):
url = BasicNewsRecipe.get_article_url(self, article)
url = url.split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
'www.', '0I': '_'}
for k, v in encoding.iteritems():
url = url.replace(k, v)
return url.partition('?')[0]
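The get_article_url() override above undoes the two-character escape scheme used by the feed proxy: the second-to-last path segment of each item URL encodes the real article address. A standalone sketch with the same mapping; the sample URL is invented but decodes to a plausible article link:

ENCODING = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
            '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://',
            '0S': 'www.', '0I': '_'}

def decode_feed_url(url):
    token = url.split('/')[-2]        # the encoded segment sits just before the last one
    for code, plain in ENCODING.items():
        token = token.replace(code, plain)
    return token.partition('?')[0]    # drop any tracking query string

print(decode_feed_url('http://rss.example.com/c/123/f/456/'
                      '0L0Sliberation0Bfr0Cpolitiques0C20A130C10A0C250Carticle/story01.htm'))
# -> http://www.liberation.fr/politiques/2013/10/25/article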

View File

@ -1,103 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
'''
liberation.fr
'''
# The cleanning is from the Liberation recipe, by Darko Miletic
from calibre.web.feeds.news import BasicNewsRecipe
class Liberation(BasicNewsRecipe):
title = u'Libération: Édition abonnés'
__author__ = 'Rémi Vanicat'
description = u'Actualités'
category = 'Actualités, France, Monde'
language = 'fr'
needs_subscription = True
use_embedded_content = False
no_stylesheets = True
remove_empty_feeds = True
extra_css = '''
h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
keep_only_tags = [
dict(name='div', attrs={'class':'article'})
,dict(name='div', attrs={'class':'text-article m-bot-s1'})
,dict(name='div', attrs={'class':'entry'})
,dict(name='div', attrs={'class':'col_contenu'})
]
remove_tags_after = [
dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
,dict(name='p',attrs={'class':['chapo']})
,dict(id='_twitter_facebook')
]
remove_tags = [
dict(name='iframe')
,dict(name='a', attrs={'class':'lnk-comments'})
,dict(name='div', attrs={'class':'toolbox'})
,dict(name='ul', attrs={'class':'share-box'})
,dict(name='ul', attrs={'class':'tool-box'})
,dict(name='ul', attrs={'class':'rub'})
,dict(name='p',attrs={'class':['chapo']})
,dict(name='p',attrs={'class':['tag']})
,dict(name='div',attrs={'class':['blokLies']})
,dict(name='div',attrs={'class':['alire']})
,dict(id='_twitter_facebook')
]
index = 'http://www.liberation.fr/abonnes/'
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://www.liberation.fr/jogger/login/')
br.select_form(nr=0)
br['email'] = self.username
br['password'] = self.password
br.submit()
return br
def parse_index(self):
soup=self.index_to_soup(self.index)
content = soup.find('div', { 'class':'block-content' })
articles = []
cat_articles = []
for tag in content.findAll(recursive=False):
if(tag['class']=='headrest headrest-basic-rounded'):
cat_articles = []
articles.append((tag.find('h5').contents[0],cat_articles))
else:
title = tag.find('h3').contents[0]
url = tag.find('a')['href']
print(url)
descripion = tag.find('p',{ 'class':'subtitle' }).contents[0]
article = {
'title': title,
'url': url,
'descripion': descripion,
'content': ''
}
cat_articles.append(article)
return articles
# Local Variables:
# mode: python
# End:

View File

@ -1,23 +1,30 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AListApart (BasicNewsRecipe):
__author__ = 'Marc Busqué <marc@lamarciana.com>'
__url__ = 'http://www.lamarciana.com'
__version__ = '2.0'
__license__ = 'GPL v3'
__copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
title = u'A List Apart'
description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.'
language = 'en'
tags = 'web development, software'
oldest_article = 120
remove_empty_feeds = True
encoding = 'utf8'
cover_url = u'http://alistapart.com/pix/alalogo.gif'
extra_css = u'img {max-width: 100%; display: block; margin: auto;}'
__author__ = 'Marc Busqué <marc@lamarciana.com>'
__url__ = 'http://www.lamarciana.com'
__version__ = '2.0.1'
__license__ = 'GPL v3'
__copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
title = u'A List Apart'
description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieve articles and columns.'
language = 'en'
tags = 'web development, software'
oldest_article = 120
remove_empty_feeds = True
encoding = 'utf8'
cover_url = u'http://alistapart.com/pix/alalogo.gif'
extra_css = u'img {max-width: 100%; display: block; margin: auto;}'
feeds = [
(u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
]
feeds = [
(u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
]
def image_url_processor(self, baseurl, url):
if re.findall('alistapart\.com', url):
return 'http:'+url
else:
return url

View File

@ -1,16 +0,0 @@
from calibre.web.feeds.news import CalibrePeriodical
class LivingDigital(CalibrePeriodical):
title = 'Living Digital'
calibre_periodicals_slug = 'living-digital'
description = '''
Catch the latest buzz in the digital world with Living Digital. Enjoy
reviews, news, features and recommendations on a wide range of consumer
technology products - from smartphones to flat panel TVs, netbooks to
cameras, and many more consumer lifestyle gadgets. To subscribe, visit
<a href="http://news.calibre-ebook.com/periodical/living-digital">calibre
Periodicals</a>.
'''
language = 'en_IN'

View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import AutomaticNewsRecipe
class BasicUserRecipe1373130372(AutomaticNewsRecipe):
title = u'Ludwig von Mises Institute'
__author__ = 'anywho'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Daily Articles (Full text version)',
u'http://feed.mises.org/MisesFullTextArticles'),
(u'Mises Blog Posts',
u'http://mises.org/blog/index.rdf')]

View File

@ -1,3 +1,6 @@
# -*- mode:python -*-
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
@ -6,57 +9,164 @@ Mediapart
__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date,timedelta
class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
title = 'Mediapart'
__author__ = 'Mathieu Godlewski, Louis Gesbert'
description = 'Global news in french from news site Mediapart'
oldest_article = 7
publication_type = 'newspaper'
language = 'fr'
needs_subscription = True
max_articles_per_feed = 50
oldest_article = 2
use_embedded_content = False
no_stylesheets = True
cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
# --
oldest_article_date = date.today() - timedelta(days=oldest_article)
# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
# the 10 last elements :/)
feeds = [
('Les articles', 'http://www.mediapart.fr/articles/feed'),
('La Une', 'http://www.mediapart.fr/articles/feed'),
]
def parse_feeds(self):
feeds = super(Mediapart, self).parse_feeds()
feeds += feeds_from_index(self.my_parse_index(feeds))
return feeds
def my_parse_index(self, la_une):
articles = []
breves = []
liens = []
confidentiels = []
soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
page = soup.find('div', {'id':'pageFirstContent'})
fils = page.find('div', {'class':re.compile(r'\bcontent-journal\b')})
for article in fils.findAll('div'):
try:
title = article.find('h2',recursive=False)
if title is None or title['class'] == 'title-specific':
continue
# print "found fil ",title
article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
# print "kind: ",article_type
for s in title('span'):
s.replaceWith(s.renderContents() + "\n")
url = title.find('a', href=True)['href']
article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
if article_date < self.oldest_article_date:
# print "too old"
continue
authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
authors = [self.tag_to_string(a) for a in authors]
description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
# print "fil ",title," by ",authors," : ",description
summary = {
'title': self.tag_to_string(title).strip(),
'author': ', '.join(authors),
'url': url,
'date': u'' + article_date.strftime("%A %d %b %Y"),
'description': '\n'.join([self.tag_to_string(d) for d in description]),
}
{
"Brève": breves,
"Lien": liens,
"Confidentiel": confidentiels,
}.get(article_type).append(summary)
except:
pass
# print 'La Une: ', len(la_une), ' articles'
# for a in la_une: print a["title"]
# print 'Brèves: ', len(breves), ' articles'
# print 'Revue web: ', len(liens), ' articles'
# print 'Confidentiel: ', len(confidentiels), ' articles'
articles += [('Brèves', breves)] if breves else []
articles += [('Revue du Web', liens)] if liens else []
articles += [('Confidentiel', confidentiels)] if confidentiels else []
return articles
# -- print-version
conversion_options = { 'smarten_punctuation' : True }
conversion_options = {'smarten_punctuation' : True}
remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]
remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]
# non-locale-specific date parse (strptime("%d %b %Y", s) would work with a French locale)
def parse_french_date(self, date_str):
date_arr = date_str.lower().split()
return date(day=int(date_arr[0]),
year=int(date_arr[2]),
month=
[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
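# e.g. parse_french_date('25 octobre 2013') should return datetime.date(2013, 10, 25); month names are looked up by their 1-based position in the list above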
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
link = soup.find('a', {'title':'Imprimer'})
if link is None:
# Filter old articles
article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
if article_date < self.oldest_article_date:
return None
return link['href']
tools = soup.find('div', {'class':'menu-tools'})
link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
if link is None:
print 'Error: print link not found'
return None
return 'https://mediapart.fr/' + link['href']
# -- Handle login
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('http://www.mediapart.fr/')
br.select_form(nr=0)
br.open('https://www.mediapart.fr/user')
br.select_form(nr=1)
br['name'] = self.username
br['pass'] = self.password
br.submit()
return br
def preprocess_html(self, soup):
for title in soup.findAll('p', {'class':'titre_page'}):
title.name = 'h3'
for legend in soup.findAll('span', {'class':'legend'}):
legend.insert(0, Tag(soup, 'br', []))
legend.name = 'small'
return soup
# This is a workaround for articles with scribd content that include
# <body></body> tags _within_ the body
preprocess_regexps = [
(re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
lambda match:
match.group(1)
+ re.sub(re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'',
match.group(2))
+ '</body>')
]
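# roughly: '<body>a<body>b</body>c</body>' becomes '<body>abc</body>': the outer body is kept and any nested <body>/</body> tags are stripped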
# def preprocess_html(self, soup):
# for title in soup.findAll('p', {'class':'titre_page'}):
# title.name = 'h3'
# for legend in soup.findAll('span', {'class':'legend'}):
# legend.insert(0, Tag(soup, 'br', []))
# legend.name = 'em'
# return soup


@ -7,71 +7,75 @@ import time
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News from The Metro, UK'
#timefmt = ''
__author__ = 'Dave Asbury'
#last update 4/4/13
#cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
auto_cleanup = True
max_articles_per_feed = 12
ignore_duplicate_articles = {'title', 'url'}
encoding = 'UTF-8'
#encoding = 'UTF-8'
language = 'en_GB'
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
compress_news_images = True
compress_news_images_max_size = 30
remove_attributes = ['style', 'font']
preprocess_regexps = [
(re.compile(r'\| Metro News', re.IGNORECASE | re.DOTALL), lambda match: ''),
]
def parse_index(self):
articles = {}
key = None
ans = []
feeds = [ ('UK', 'http://metro.co.uk/news/uk/'),
('World', 'http://metro.co.uk/news/world/'),
('Weird', 'http://metro.co.uk/news/weird/'),
('Money', 'http://metro.co.uk/news/money/'),
('Sport', 'http://metro.co.uk/sport/'),
('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
]
for key, feed in feeds:
soup = self.index_to_soup(feed)
articles[key] = []
ans.append(key)
articles = {}
key = None
ans = []
feeds = [('UK', 'http://metro.co.uk/news/uk/'),
('World', 'http://metro.co.uk/news/world/'),
('Weird', 'http://metro.co.uk/news/weird/'),
('Money', 'http://metro.co.uk/news/money/'),
('Sport', 'http://metro.co.uk/sport/'),
('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
]
for key, feed in feeds:
soup = self.index_to_soup(feed)
articles[key] = []
ans.append(key)
today = datetime.date.today()
today = time.mktime(today.timetuple())-60*60*24
today = datetime.date.today()
today = time.mktime(today.timetuple())-60*60*24
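# 'today' now holds the epoch timestamp for the start of yesterday; older articles are skipped in the date check below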
for a in soup.findAll('a'):
for name, value in a.attrs:
if name == "class" and value=="post":
url = a['href']
title = a['title']
print title
description = ''
m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
skip = 1
if len(m.groups()) == 3:
g = m.groups()
dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
pubdate = time.strftime('%a, %d %b', dt.timetuple())
for a in soup.findAll('a'):
for name, value in a.attrs:
if name == "class" and value=="post":
url = a['href']
title = a['title']
print title
description = ''
m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
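# the three path segments after the domain are expected to hold the publication date (YYYY/MM/DD), used to build pubdate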
skip = 1
if len(m.groups()) == 3:
g = m.groups()
dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
pubdate = time.strftime('%a, %d %b', dt.timetuple())
dt = time.mktime(dt.timetuple())
if dt >= today:
print pubdate
skip = 0
else:
pubdate = strftime('%a, %d %b')
dt = time.mktime(dt.timetuple())
if dt >= today:
print pubdate
skip = 0
else:
pubdate = strftime('%a, %d %b')
summary = a.find(True, attrs={'class':'excerpt'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
summary = a.find(True, attrs={'class':'excerpt'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
if skip == 0:
articles[key].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
#ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
if skip == 0:
articles[key].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
#ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if key in articles]
return ans


@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'
__copyright__ = '2010-2013, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong'
@ -32,6 +32,7 @@ __Date__ = ''
'''
Change Log:
2013/09/28: allow thumbnails even with hi-res images
2012/04/24: improved parsing of news.mingpao.com content
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
@ -846,8 +847,7 @@ class MPRecipe(BasicNewsRecipe):
return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
@ -1071,3 +1071,4 @@ class MPRecipe(BasicNewsRecipe):


@ -1,18 +1,15 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
sur.infonews.com
'''
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class MiradasAlSur(BasicNewsRecipe):
title = 'Miradas al Sur'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
description = 'Semanario Argentino'
publisher = 'ElArgentino.com'
category = 'news, politics, Argentina'
oldest_article = 7
@ -20,53 +17,51 @@ class MiradasAlSur(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es_AR'
language = 'es_AR'
remove_empty_feeds = True
masthead_url = 'http://sur.infonews.com/sites/default/files/www_miradas_al_sur_com_logo.gif'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
h1{font-family: Georgia,Times,serif}
.field-field-story-author{color: gray; font-size: small}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'series' : title
}
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/123/Miradas-al-Sur.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
keep_only_tags = [dict(name='div', attrs={'id':['content-header', 'content-area']})]
remove_tags = [
dict(name=['link','meta','iframe','embed','object']),
dict(name='form', attrs={'class':'fivestar-widget'}),
dict(attrs={'class':lambda x: x and 'terms-inline' in x.split()})
]
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=123&Content-Type=text/xml&ChannelDesc=Miradas%20al%20Sur')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
feeds = [
(u'Politica' , u'http://sur.infonews.com/taxonomy/term/1/0/feed'),
(u'Internacional' , u'http://sur.infonews.com/taxonomy/term/2/0/feed'),
(u'Informe Especial' , u'http://sur.infonews.com/taxonomy/term/14/0/feed'),
(u'Delitos y pesquisas', u'http://sur.infonews.com/taxonomy/term/6/0/feed'),
(u'Lesa Humanidad' , u'http://sur.infonews.com/taxonomy/term/7/0/feed'),
(u'Cultura' , u'http://sur.infonews.com/taxonomy/term/8/0/feed'),
(u'Deportes' , u'http://sur.infonews.com/taxonomy/term/9/0/feed'),
(u'Contratapa' , u'http://sur.infonews.com/taxonomy/term/10/0/feed'),
]
def get_cover_url(self):
# determine the series number, unfortunately not gonna happen now
#self.conversion_options.update({'series_index':seriesnr})
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
cdate = datetime.date.today()
todayweekday = cdate.isoweekday()
if (todayweekday != 7):
cdate -= datetime.timedelta(days=todayweekday)
cover_page_url = cdate.strftime('http://sur.infonews.com/ediciones/%Y-%m-%d/tapa')
soup = self.index_to_soup(cover_page_url)
cover_item = soup.find('img', attrs={'class':lambda x: x and 'imagecache-tapa_edicion_full' in x.split()})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
cover_url = cover_item['src']
return cover_url
def image_url_processor(self, baseurl, url):
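# trim the image URL to its '?Id=' parameter, dropping any extra query arguments after the first '&'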
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img


@ -1,46 +1,49 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class NatGeoMag(BasicNewsRecipe):
title = 'National Geographic Mag'
__author__ = 'Terminal Veracity'
description = 'The National Geographic Magazine'
publisher = 'National Geographic'
oldest_article = 31
max_articles_per_feed = 50
category = 'geography, magazine'
language = 'en'
publication_type = 'magazine'
cover_url = 'http://www.yourlogoresources.com/wp-content/uploads/2011/09/national-geographic-logo.jpg'
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
recursions = 1
remove_empty_feeds = True
feeds = [('National Geographic Magazine', 'http://feeds.nationalgeographic.com/ng/NGM/NGM_Magazine')]
remove_tags = [dict(name='div', attrs={'class':['nextpage_continue', 'subscribe']})]
keep_only_tags = [dict(attrs={'class':'main_3narrow'})]
extra_css = """
h1 {font-size: large; font-weight: bold; margin: .5em 0; }
h2 {font-size: large; font-weight: bold; margin: .5em 0; }
h3 {font-size: medium; font-weight: bold; margin: 0 0; }
.article_credits_author {font-size: small; font-style: italic; }
.article_credits_photographer {font-size: small; font-style: italic; display: inline }
"""
class NGM(BasicNewsRecipe):
def parse_feeds(self):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
if 'Flashback' in article.title:
feed.articles.remove(article)
elif 'Desktop Wallpaper' in article.title:
feed.articles.remove(article)
elif 'Visions of Earth' in article.title:
feed.articles.remove(article)
elif 'Your Shot' in article.title:
feed.articles.remove(article)
elif 'MyShot' in article.title:
feed.articles.remove(article)
elif 'Field Test' in article.title:
feed.articles.remove(article)
return feeds
title = 'National Geographic Magazine'
__author__ = 'Krittika Goyal'
description = 'National Geographic Magazine'
timefmt = ' [%d %b, %Y]'
no_stylesheets = True
auto_cleanup = True
auto_cleanup_keep = '//div[@class="featurepic"]'
def nejm_get_index(self):
return self.index_to_soup('http://ngm.nationalgeographic.com/2013/10/table-of-contents')
# To parse the article toc
def parse_index(self):
soup = self.nejm_get_index()
tocfull = soup.find('div', attrs={'class':'coltoc'})
toc = tocfull.find('div', attrs={'class':'more_section'})
articles = []
feeds = []
section_title = 'Features'
for x in toc.findAll(True):
if x.name == 'a':
# Article found
title = self.tag_to_string(x)
url = x.get('href', False)
if not url or not title:
continue
url = 'http://ngm.nationalgeographic.com' + url
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
articles.append({'title': title, 'url':url,
'description':'', 'date':''})
feeds.append((section_title, articles))
art1 = tocfull.findAll('a')[1]
art1_title = self.tag_to_string(art1.find('div', attrs={'class': 'toched'}))
art1_url = art1.get('href', False)
art1_url = 'http://ngm.nationalgeographic.com' + art1_url
art1feed = {'title': art1_title, 'url':art1_url,
'description':'', 'date':''}
feeds.append(('Cover Story', [art1feed]))
return feeds


@ -1,49 +1,108 @@
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1344926684(BasicNewsRecipe):
title = u'Neue Osnabrücker Zeitung'
__author__ = 'Krittika Goyal'
oldest_article = 7
max_articles_per_feed = 100
# auto_cleanup = True
no_stylesheets = True
use_embedded_content = False
language = 'de'
class AdvancedUserRecipe1380105782(BasicNewsRecipe):
title = u'Neue Osnabrücker Zeitung'
__author__ = 'vo_he'
description = 'Online auch ohne IPhone'
encoding = 'utf-8'
language = 'de'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
max_articles_per_feed = 100
cover_url = 'http://www.noz.de/bundles/nozplatform/images/logos/osnabruecker-zeitung.png'
remove_tags_before =dict(id='feedContent')
remove_tags_before =dict(id='headline')
remove_tags_after =dict(id='article-authorbox')
remove_tags_after =dict(id='footer-start')
remove_tags_after =dict(name='div', attrs={'class':'morelinks'})
keep_only_tags = [
dict(name='div', attrs={'class':'article'}),
dict(name='span', attrs={'id':'articletext'})
]
remove_tags = [
dict(name='div', attrs={'id':'retresco-title'}),
dict(name='div', attrs={'class':'retresco-item s1 relative'}),
dict(name='a', attrs={'class':'medium2 largeSpaceTop icon'}),
dict(name='div', attrs={'class':'articleFunctions inlineTeaserRight'}),
dict(name='div', attrs={'class':'imageContainer '}),
dict(name='div', attrs={'class':'imageContainer centerContainer'}),
dict(name='div', attrs={'class':'grid singleCol articleTeaser'}),
dict(name='h3', attrs={'class':'teaserRow'}),
dict(name='div', attrs={'class':'related-comments'}),
dict(name='a', attrs={'class':' icon'}),
dict(name='a', attrs={'class':'right small'}),
dict(name='span', attrs={'class':'small block spaceBottom rectangleAd'}),
dict(name='div', attrs={'id':'ui-datepicker-div'}),
dict(name='div', attrs={'class':'nav-second'}),
dict(name='div', attrs={'class':'nav-first'}),
dict(name='div', attrs={'class':'icon-print'}),
dict(name='div', attrs={'class':'social-button'}),
dict(name='div', attrs={'class':'social-media-bar'}),
dict(name='div', attrs={'class':'pull-right'}),
dict(name='div', attrs={'class':'btn btn-primary flat-button'}),
dict(name='div', attrs={'class':'carousel-wrapper'}),
dict(name='a', attrs={'class':'right-content merchandising hidden-tablet'}),
dict(name='div', attrs={'class':'border-circle pull-left'}),
dict(name='div', attrs={'class':'row show-grid general-infoimageContainer '}),
dict(name='div', attrs={'class':'location-list'}),
dict(name='div', attrs={'class':'block'}),
dict(name='div', attrs={'class':'furtherGalleries largeSpaceTop'})
]
feeds = [(u'Lokales', u'http://www.noz.de/rss/Lokales'),
(u'Vermischtes', u'http://www.noz.de/rss/Vermischtes'),
(u'Politik', u'http://www.noz.de/rss/Politik'),
(u'Wirtschaft', u'http://www.noz.de/rss/Wirtschaft'),
(u'Kultur', u'http://www.noz.de/rss/Kultur'),
(u'Medien', u'http://www.noz.de/rss/Medien'),
(u'Wissenschaft', u'http://www.noz.de/rss/wissenschaft'),
(u'Sport', u'http://www.noz.de/rss/Sport'),
(u'Computer', u'http://www.noz.de/rss/Computer'),
(u'Musik', u'http://www.noz.de/rss/Musik'),
(u'Szene', u'http://www.noz.de/rss/Szene'),
(u'Niedersachsen', u'http://www.noz.de/rss/Niedersachsen'),
(u'Kino', u'http://www.noz.de/rss/Kino')]
feeds = [(u'Melle Mitte', u'http://www.noz.de/rss/ressort/Melle%20Mitte'),
(u'Melle Nord', u'http://www.noz.de/rss/ressort/Melle%20Nord'),
(u'Melle Sued', u'http://www.noz.de/rss/ressort/Melle%20S%C3%BCd'),
(u'Nordrhein Westfalen', u'http://www.noz.de/rss/ressort/Nordrhein-Westfalen'),
(u'Niedersachsen', u'http://www.noz.de/rss/ressort/Niedersachsen'),
(u'Vermischtes', u'http://www.noz.de/rss/ressort/Vermischtes'),
(u'GutzuWissen', u'http://www.noz.de/rss/ressort/Gut%20zu%20Wissen'),
(u'Sport', u'http://www.noz.de/rss/ressort/Sport'),
(u'Kultur', u'http://www.noz.de/rss/ressort/Kultur'),
(u'Medien', u'http://www.noz.de/rss/ressort/Medien'),
(u'Belm', u'http://www.noz.de/rss/ressort/Belm'),
(u'Bissendorf', u'http://www.noz.de/rss/ressort/Bissendorf'),
(u'Osnabrueck', u'http://www.noz.de/rss/ressort/Osnabr%C3%BCck'),
(u'Bad Essen', u'http://www.noz.de/rss/ressort/Bad%20Essen'),
(u'Politik', u'http://www.noz.de/rss/ressort/Politik'),
(u'Wirtschaft', u'http://www.noz.de/rss/ressort/Wirtschaft'),
#(u'Fussball', u'http:/www.noz.de/rss/ressort/Fußball'),
#(u'VfL Osnabrueck', u'http://www.noz.de/rss/ressort/VfL%20Osnabr%C3%BCck'),
#(u'SF Lotte', u'http://www.noz.de/rss/ressort/SF%20Lotte'),
#(u'SV Meppen', u'http://www.noz.de/rss/ressort/SV%20Meppen'),
#(u'Artland Dragons', u'http://www.noz.de/rss/ressort/Artland%20Dragons'),
#(u'Panthers', u'http://www.noz.de/rss/ressort/Panthers'),
(u'OS-Sport', u'http://www.noz.de/rss/ressort/OS-Sport'),
#(u'Emsland Sport', u'http://www.noz.de/rss/ressort/EL-Sport'),
#(u'Lingen', u'http://www.noz.de/rss/ressort/Lingen'),
#(u'Lohne', u'http://www.noz.de/rss/ressort/Lohne'),
#(u'Emsbueren', u'http://www.noz.de/rss/ressort/Emsb%C3%BCren'),
#(u'Salzbergen', u'http://www.noz.de/rss/ressort/Salzbergen'),
#(u'Spelle', u'http://www.noz.de/rss/ressort/Spelle'),
#(u'Freren', u'http://www.noz.de/rss/ressort/Freren'),
#(u'Lengerich', u'http://www.noz.de/rss/ressort/Lengerich'),
#(u'Bad Iburg', u'http://www.noz.de/rss/ressort/Bad%20Iburg'),
#(u'Bad Laer', u'http://www.noz.de/rss/ressort/Bad%20Laer'),
#(u'Bad Rothenfelde', u'http://www.noz.de/rss/ressort/Bad%20Rothenfelde'),
#(u'GMHütte', u'http://www.noz.de/rss/ressort/Georgsmarienh%C3%BCtte'),
#(u'Glandorf', u'http://www.noz.de/rss/ressort/Glandorf'),
#(u'Hagen', u'http://www.noz.de/rss/ressort/Hagen'),
#(u'Hasbergen', u'http://www.noz.de/rss/ressort/Hasbergen'),
#(u'Hilter', u'http://www.noz.de/rss/ressort/Hilter'),
#(u'Lotte', u'http://www.noz.de/rss/ressort/Lotte'),
#(u'Wallenhorst', u'http://www.noz.de/rss/ressort/Wallenhorst'),
#(u'Westerkappeln', u'http://www.noz.de/rss/ressort/Westerkappeln'),
#(u'Artland', u'http://www.noz.de/rss/ressort/Artland'),
#(u'Bersenbrück', u'http://www.noz.de/rss/ressort/Bersenbr%C3%BCck'),
#(u'Fürstenau', u'http://www.noz.de/rss/ressort/F%C3%BCrstenau'),
#(u'Neuenkirchen', u'http://www.noz.de/rss/ressort/Neuenkirchen'),
#(u'Lokalsport', u'http://www.noz.de/rss/ressort/Lokalsport%20Nordkreis'),
#(u'Bramsche', u'http://www.noz.de/rss/ressort/Bramsche'),
#(u'Bramsche Ortsteile', u'http://www.noz.de/rss/ressort/Bramscher%20Ortsteile'),
#(u'Neuenkirchen Vörden', u'http://www.noz.de/rss/ressort/Neuenkirchen-V%C3%B6rden'),
#(u'Papenburg', u'http://www.noz.de/rss/ressort/Papenburg'),
#(u'Dörpen', u'http://www.noz.de/rss/ressort/D%C3%B6rpen'),
#(u'Rhede', u'http://www.noz.de/rss/ressort/Rhede'),
#(u'Lathen', u'http://www.noz.de/rss/ressort/Lathen'),
#(u'Sögel', u'http://www.noz.de/rss/ressort/S%C3%B6gel'),
#(u'Nordhümmling', u'http://www.noz.de/rss/ressort/Nordh%C3%BCmmling'),
#(u'Werlte', u'http://www.noz.de/rss/ressort/Werlte'),
#(u'Westoverledingen', u'http://www.noz.de/rss/ressort/Westoverledingen'),
#(u'Geeste', u'http://www.noz.de/rss/ressort/Geeste'),
#(u'Haren', u'http://www.noz.de/rss/ressort/Haren'),
#(u'Haselünne', u'http://www.noz.de/rss/ressort/Hasel%C3%BCnne'),
#(u'Herzlake', u'http://www.noz.de/rss/ressort/Herzlake'),
#(u'Meppen', u'http://www.noz.de/rss/ressort/Meppen'),
#(u'Twist', u'http://www.noz.de/rss/ressort/Twist'),
#(u'Bohmte', u'http://www.noz.de/rss/ressort/Bohmte'),
#(u'Ostercappeln', u'http://www.noz.de/rss/ressort/Ostercappeln')
]


@ -1,4 +1,3 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
@ -11,6 +10,9 @@ import re
from calibre.web.feeds.news import BasicNewsRecipe
def find_header(tag):
return tag.name == 'header' and tag.parent['class'] == 'article'
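# matches the <header> element whose parent has class 'article'; used below in keep_only_tags (presumably the article title block)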
class NewYorkReviewOfBooks(BasicNewsRecipe):
title = u'New York Review of Books'
@ -23,65 +25,70 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
no_javascript = True
needs_subscription = True
keep_only_tags = [dict(id=['article-body','page-title'])]
remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
'center advertisement']})]
keep_only_tags = [
dict(name='section', attrs={'class':'article_body'}),
dict(name=find_header),
dict(name='div', attrs={'class':'for-subscribers-only'}),
]
preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
m:'<head></head>')]
def print_version(self, url):
return url+'?pagination=false'
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.open('http://www.nybooks.com/account/signin/')
br.select_form(nr = 1)
br.select_form(nr=2)
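# nr=2 picks the third form on the /user page (select_form counts from zero), assumed to be the login form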
br['username'] = self.username
br['password'] = self.password
br.submit()
return br
def print_version(self, url):
return url+'?pagination=false'
def preprocess_html(self, soup):
header = soup.find('header')
body = soup.find('body')
body.insert(0, header)
header.find('div', attrs={'class':'details'}).extract()
for i in soup.findAll('input'):
i.extract()
return soup
def parse_index(self):
soup = self.index_to_soup('http://www.nybooks.com/current-issue')
# Find cover
sidebar = soup.find(id='sidebar')
sidebar = soup.find('div', attrs={'class':'issue_cover'})
if sidebar is not None:
a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
if a is not None:
psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
cover = psoup.find('img', src=True)
self.cover_url = cover['src']
self.log('Found cover at:', self.cover_url)
img = sidebar.find('img', src=True)
self.cover_url = 'http://www.nybooks.com' + img['src']
self.log('Found cover at:', self.cover_url)
# Find date
div = soup.find(id='page-title')
div = soup.find('time', pubdate='pubdate')
if div is not None:
h5 = div.find('h5')
if h5 is not None:
text = self.tag_to_string(h5)
date = text.partition(u'\u2022')[0].strip()
self.timefmt = u' [%s]'%date
self.log('Issue date:', date)
text = self.tag_to_string(div)
date = text.partition(u'\u2022')[0].strip()
self.timefmt = u' [%s]'%date
self.log('Issue date:', date)
# Find TOC
tocs = soup.findAll('ul', attrs={'class':'issue-article-list'})
toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'})
articles = []
for toc in tocs:
for li in toc.findAll('li'):
h3 = li.find('h3')
title = self.tag_to_string(h3)
author = self.tag_to_string(li.find('h4'))
title = title + u' (%s)'%author
url = 'http://www.nybooks.com'+h3.find('a', href=True)['href']
desc = ''
for p in li.findAll('p'):
desc += self.tag_to_string(p)
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'',
for div in toc.findAll('div', attrs={'class':'row'}):
h2 = div.find('h2')
title = self.tag_to_string(h2).strip()
author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip()
title = title + u' (%s)'%author
url = 'http://www.nybooks.com' + h2.find('a', href=True)['href']
desc = ''
for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}):
desc += self.tag_to_string(p)
self.log('Found article:', title)
self.log('\t', url)
self.log('\t', desc)
articles.append({'title':title, 'url':url, 'date':'',
'description':desc})
return [('Current Issue', articles)]


@ -10,6 +10,9 @@ import re
from calibre.web.feeds.news import BasicNewsRecipe
def find_header(tag):
return tag.name == 'header' and tag.parent['class'] == 'article'
class NewYorkReviewOfBooks(BasicNewsRecipe):
title = u'New York Review of Books (no subscription)'
@ -21,9 +24,11 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
no_stylesheets = True
no_javascript = True
keep_only_tags = [dict(id=['article-body', 'page-title'])]
remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
'center advertisement']})]
keep_only_tags = [
dict(name='section', attrs={'class':'article_body'}),
dict(name=find_header),
dict(name='div', attrs={'class':'for-subscribers-only'}),
]
preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
m:'<head></head>')]
@ -31,40 +36,44 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
def print_version(self, url):
return url+'?pagination=false'
def preprocess_html(self, soup):
header = soup.find('header')
body = soup.find('body')
body.insert(0, header)
header.find('div', attrs={'class':'details'}).extract()
for i in soup.findAll('input'):
i.extract()
return soup
def parse_index(self):
soup = self.index_to_soup('http://www.nybooks.com/current-issue')
# Find cover
sidebar = soup.find(id='sidebar')
sidebar = soup.find('div', attrs={'class':'issue_cover'})
if sidebar is not None:
a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
if a is not None:
psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
cover = psoup.find('img', src=True)
self.cover_url = cover['src']
self.log('Found cover at:', self.cover_url)
img = sidebar.find('img', src=True)
self.cover_url = 'http://www.nybooks.com' + img['src']
self.log('Found cover at:', self.cover_url)
# Find date
div = soup.find(id='page-title')
div = soup.find('time', pubdate='pubdate')
if div is not None:
h5 = div.find('h5')
if h5 is not None:
text = self.tag_to_string(h5)
date = text.partition(u'\u2022')[0].strip()
self.timefmt = u' [%s]'%date
self.log('Issue date:', date)
text = self.tag_to_string(div)
date = text.partition(u'\u2022')[0].strip()
self.timefmt = u' [%s]'%date
self.log('Issue date:', date)
# Find TOC
toc = soup.find('ul', attrs={'class':'issue-article-list'})
toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'})
articles = []
for li in toc.findAll('li'):
h3 = li.find('h3')
title = self.tag_to_string(h3)
author = self.tag_to_string(li.find('h4'))
for div in toc.findAll('div', attrs={'class':'row'}):
h2 = div.find('h2')
title = self.tag_to_string(h2).strip()
author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip()
title = title + u' (%s)'%author
url = 'http://www.nybooks.com'+h3.find('a', href=True)['href']
url = 'http://www.nybooks.com' + h2.find('a', href=True)['href']
desc = ''
for p in li.findAll('p'):
for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}):
desc += self.tag_to_string(p)
self.log('Found article:', title)
self.log('\t', url)

recipes/news24.recipe Normal file

@ -0,0 +1,53 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1375900744(BasicNewsRecipe):
title = u'News24'
description = "News24."
__author__ = 'Nicki de Wet'
publisher = 'Media24'
category = 'news, politics, South Africa'
oldest_article = 3
max_articles_per_feed = 20
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
auto_cleanup = False
language = 'en_ZA'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.24.com/images/widgethead_news.png'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{display: block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['object','embed','iframe','table','meta','link']),
dict(attrs={
'class':['TwitterfacebookLink','superSportArticleBlock',
'videoHighlights', 'facebookComments','share',
'item_block','kalahari_product left', 'block red',
'credit']}),
dict(attrs={'id':['comments_wrap', 'article_toolbox_bot',
'inside_news','sponsored-links', 'lnkGalleries',
'relatedlinks_box', 'lnkUserGalleries',
'lnkNewsGalleries', 'relatedlinks',
'divRelatedLinks']})]
keep_only_tags = [
dict(attrs={'class':['left col633', 'article col626',
'columnWrapperLeft', 'articlecolumn',
'article_img', 'picture_caption', 'DiveTable']})]
feeds = [
(u'Top Stories', u'http://feeds.news24.com/articles/news24/TopStories/rss'),
(u'South Africa', u'http://feeds.news24.com/articles/news24/SouthAfrica/rss'),
(u'World', u'http://feeds.news24.com/articles/news24/World/rss'),
(u'Sport', u'http://feeds.24.com/articles/sport/featured/topstories/rss')]


@ -2,173 +2,263 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'
__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from string import capwords
import datetime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
# how many issues to go back, 0 means get the most current one
BACK_ISSUES = 2
# how many issues to go back, 0 means get the most current one
BACK_ISSUES = 1
EDITION = '0'
DATE = None
YEAR = datetime.datetime.now().year
EDITION = '0'
DATE = None
YEAR = datetime.datetime.now().year
title = u'Newsweek Polska'
__author__ = 'matek09, admroz'
description = 'Weekly magazine'
encoding = 'utf-8'
language = 'pl'
remove_javascript = True
title = u'Newsweek Polska'
__author__ = 'matek09, admroz'
description = 'Weekly magazine'
encoding = 'utf-8'
language = 'pl'
remove_javascript = True
temp_files = []
articles_are_obfuscated = True
temp_files = []
articles_are_obfuscated = True
#
# Parses each article
#
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
source = br.response().read()
page = self.index_to_soup(source)
#
# Parses article contents from one page
#
def get_article_divs(self, css, main_section):
strs = []
main_section = page.find(id='mainSection')
# get all divs with given css class
article_divs = main_section.findAll('div', attrs={'class' : css})
for article_div in article_divs:
title = main_section.find('h1')
info = main_section.find('ul', attrs={'class' : 'articleInfo'})
authors = info.find('li').find('h4')
article = main_section.find('div', attrs={'id' : 'article'})
# remove sections like 'read more...' etc.
for p in article_div.findAll('p'):
# remove related articles box
related = article.find('div', attrs={'class' : 'relatedBox'})
if related is not None:
related.extract()
if p.find('span', attrs={'style' : 'color: #800000; font-size: medium;'}):
p.extract()
continue
# remove div with social networking links and links to
# other articles in web version
for div in article.findAll('div'):
if div.find('span', attrs={'class' : 'google-plus'}):
div.extract()
if p.find('span', attrs={'style' : 'font-size: medium; color: #800000;'}):
p.extract()
continue
for p in div.findAll('p'):
if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
p.extract()
continue
for a in p.findAll('a'):
if a.find('span', attrs={'style' : 'font-size: larger;'}):
a.extract()
if p.find('span', attrs={'style' : 'font-size: medium;'}):
p.extract()
continue
if p.find('span', attrs={'style' : 'color: #800000;'}):
p.extract()
continue
obj = p.find('object')
if obj:
obj.extract()
continue
strong = p.find('strong')
if strong:
newest = re.compile("Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
if newest.search(str(strong)):
strong.extract()
continue
itunes = p.find('a')
if itunes:
reurl = re.compile("itunes.apple.com")
if reurl.search(str(itunes['href'])):
p.extract()
continue
imagedesc = p.find('div', attrs={'class' : 'image-desc'})
if imagedesc:
redesc = re.compile("Okładka numeru")
if (redesc.search(str(imagedesc))):
p.extract()
continue
html = unicode(title) + unicode(authors) + unicode(article)
next = main_section.find('li', attrs={'class' : 'next'})
while next:
url = next.find('a')['href']
br.open(url)
source = br.response().read()
page = self.index_to_soup(source)
main_section = page.find(id='mainSection')
article = main_section.find('div', attrs={'id' : 'article'})
aside = article.find(id='articleAside')
if aside is not None:
aside.extract()
html = html + unicode(article)
next = main_section.find('li', attrs={'class' : 'next'})
# get actual contents
for content in article_div.contents:
strs.append("".join(str(content)))
# return contents as a string
return unicode("".join(strs))
self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
#
# Articles can be divided into several pages; this method parses them recursively
#
def get_article_page(self, br, url, page):
br.open(url)
source = br.response().read()
html = ''
matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
if matches is None:
print "no article tag found, returning..."
return
main_section = BeautifulSoup(matches.group(0))
if page == 0:
title = main_section.find('h1')
html = html + unicode(title)
authors = ''
authorBox = main_section.find('div', attrs={'class' : 'AuthorBox'})
if authorBox is not None:
authorH4 = authorBox.find('h4')
if authorH4 is not None:
authors = self.tag_to_string(authorH4)
html = html + unicode(authors)
info = main_section.find('p', attrs={'class' : 'lead'})
html = html + unicode(info)
html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)
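# the long hexadecimal class name looks like an obfuscated CSS class marking the article body (an assumption; the recipe sets articles_are_obfuscated above)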
nextPage = main_section.find('a', attrs={'class' : 'next'})
if nextPage:
html = html + self.get_article_page(br, nextPage['href'], page+1)
return html
#
# Parses each article
#
def get_obfuscated_article(self, url):
br = self.get_browser()
html = self.get_article_page(br, url, 0)
self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
#
# Goes back given number of issues. It also knows how to go back
# to the previous year if there are not enough issues in the current one
#
def find_last_issue(self, archive_url):
archive_soup = self.index_to_soup(archive_url)
select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
#
# Goes back given number of issues. It also knows how to go back
# to the previous year if there are not enough issues in the current one
#
def find_last_issue(self, archive_url):
archive_soup = self.index_to_soup(archive_url, True)
# check if need to go back to previous year
if len(options) > self.BACK_ISSUES:
option = options[self.BACK_ISSUES];
self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
else:
self.BACK_ISSUES = self.BACK_ISSUES - len(options)
self.YEAR = self.YEAR - 1
self.find_last_issue(archive_url + ',' + str(self.YEAR))
# workaround: the HTML is so malformed that soup.find() returns None,
# so we extract the sub-HTML we need with a regex instead
matches = re.search(r'<ul class="rightIssueList">(.*?)</ul>', archive_soup, re.DOTALL)
if matches is None:
return
subSoup = BeautifulSoup(matches.group(0))
issueLinks = subSoup.findAll('a')
# check if need to go back to previous year
if len(issueLinks) > self.BACK_ISSUES:
link = issueLinks[self.BACK_ISSUES];
self.EDITION = link['href'].replace('http://www.newsweek.pl/wydania/','')
self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
else:
self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
self.YEAR = self.YEAR - 1
self.find_last_issue(archive_url + '/' + str(self.YEAR))
#
# Looks for the last issue which we want to download. Then goes on each
# section and article and stores them (assigning to sections)
#
def parse_index(self):
archive_url = 'http://www.newsweek.pl/wydania/archiwum'
self.find_last_issue(archive_url)
soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
main_section = soup.find(id='mainSection')
img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
self.cover_url = img['src']
feeds = []
articles = {}
sections = []
#
# Looks for the last issue which we want to download. Then goes on each
# section and article and stores them (assigning to sections)
#
def parse_index(self):
archive_url = 'http://www.newsweek.pl/wydania/archiwum'
self.find_last_issue(archive_url)
soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
news_list = main_section.find('ul', attrs={'class' : 'newsList'})
section = 'Inne'
matches = re.search(r'<div class="Issue-Entry">(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
if matches is None:
return
for li in news_list.findAll('li'):
h3 = li.find('h3')
if h3 is not None:
section = capwords(self.tag_to_string(h3))
continue
else:
h2 = li.find('h2')
if h2 is not None:
article = self.create_article(h2)
if article is None :
continue
main_section = BeautifulSoup(matches.group(0))
if articles.has_key(section):
articles[section].append(article)
else:
articles[section] = [article]
sections.append(section)
# date
matches = re.search(r'(\d{2}-\d{2}-\d{4})', self.tag_to_string(main_section.find('h2')))
if matches:
self.DATE = matches.group(0)
# cover
img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
self.cover_url = img['src']
feeds = []
articles = {}
sections = []
# sections
for sectionUl in main_section.findAll('ul', attrs={'class' : 'whatsin'}):
# section header
header = sectionUl.find('li', attrs={'class' : 'header'})
if header is None:
continue
section = capwords(self.tag_to_string(header))
# articles in section
articleUl = sectionUl.find('ul')
if articleUl is None:
continue
for articleLi in articleUl.findAll('li'):
# check if the article is closed (locked), in which case it is skipped
closed = articleLi.find('span', attrs={'class' : 'closeart'})
if closed is not None:
continue
article = self.create_article(articleLi)
if article is None :
continue
if articles.has_key(section):
articles[section].append(article)
else:
articles[section] = [article]
sections.append(section)
for section in sections:
# print("%s -> %d" % (section, len(articles[section])))
#
# for article in articles[section]:
# print(" - %s" % article)
feeds.append((section, articles[section]))
return feeds
for section in sections:
feeds.append((section, articles[section]))
return feeds
#
# Creates each article metadata (skips locked ones). The content will
# be extracted later by other method (get_obfuscated_article).
#
def create_article(self, articleLi):
article = {}
a = articleLi.find('a')
if a is None:
return None
#
# Creates each article metadata (skips locked ones). The content will
# be extracted later by other method (get_obfuscated_article).
#
def create_article(self, h2):
article = {}
a = h2.find('a')
if a is None:
return None
article['title'] = self.tag_to_string(a)
article['url'] = a['href']
article['date'] = self.DATE
article['description'] = ''
article['title'] = self.tag_to_string(a)
article['url'] = a['href']
article['date'] = self.DATE
desc = h2.findNext('p')
if desc is not None:
article['description'] = self.tag_to_string(desc)
else:
article['description'] = ''
return article
return article


@ -0,0 +1,57 @@
#
# Written: July 2013
# Last Edited: 2013-07-25
# Version: 1.0
# Last update: 2013-07-25
#
__license__ = 'GPL v3'
__copyright__ = '2013, Armin Geller'
'''
Fetch nonamesnojackets.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
title = u'No Names, No Jackets'
__author__ = 'Armin Geller' # AGe 2013-07-25
description = u'One chapter. Just the writing. Discover something new.'
publisher = 'nonamesnojackets.com/'
publication_type = 'ebook news'
tags = 'Books, Literature, E-Books, US'
timefmt = ' [%a, %d %b %Y]'
publication_type = 'Feed'
language = 'en'
encoding = 'utf-8'
oldest_article = 14
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_javascript = True
conversion_options = {'title' : title,
'comments' : description,
'tags' : tags,
'language' : language,
'publisher' : publisher,
'authors' : publisher,
}
# cover_url = ''
# masthead_url = ''
extra_css = '''
h1,h2 {font-weight:bold;font-size:large;}
.entry-meta {font-size: 1em;text-align: left; font-style: italic}
'''
keep_only_tags = [
dict(name='article')
]
feeds = [(u'No Names, No Jackets', u'http://www.nonamesnojackets.com/feed/')]

recipes/nuus24.recipe Normal file

@ -0,0 +1,57 @@
import re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
class Nuus24(BasicNewsRecipe):
title = 'Nuus24'
__author__ = 'Nicki de Wet'
encoding = 'utf-8'
description = 'Daaglikse Afrikaanse Nuus via Nuus24'
language = 'af'
publisher = 'Media24'
timefmt = ' [%a, %d %b, %Y]'
masthead_url = 'http://afrikaans.news24.com/images/nuus.jpg'
max_articles_per_feed = 25
remove_tags_before = dict(id='TheFeed')
remove_tags_after = dict(id='TheFeed')
remove_tags = [dict(
attrs={
'class':[
'personal-bar row-fluid', 'navbar main-menu-fixed',
'breaking-news-wrapper', 'row-fluid comments-bg',
'unstyled actions', 'modal-body', 'modal-header', 'desktop']}),
dict(id=['weather-forecast', 'topics', 'side-widgets', 'footer-container', 'sb-container', 'myModal']),
dict(name=['script', 'noscript', 'style'])]
keep_only_tags = [dict(attrs={'class':['span8 border-right']}),
dict(name=['article', 'section']),
dict(id=['img-wrapper'])]
extra_css = """ div.carousel-inner{ overflow:hidden;display: block;height:300px;} img{display: block} """
no_stylesheets = True
def parse_index(self):
soup = self.index_to_soup('http://afrikaans.news24.com/Index.aspx')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
key = 'Nuus in Afrikaans'
articles[key] = []
ans= []
for anchor in soup.findAll(True,
attrs={'id':['lnkLink']}):
url = re.sub(r'\?.*', '', anchor['href'])
title = self.tag_to_string(anchor, use_alt=True).strip()
print title
description = ''
pubdate = strftime('%a, %d %b')
articles[key].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = [(key, articles[key])]
return ans


@ -0,0 +1,50 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
padreydecano.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class General(BasicNewsRecipe):
title = 'Padre y Decano'
__author__ = 'Carlos Alves'
description = 'El sitio del pueblo'
tags = 'soccer, futbol, Peñarol'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = None
remove_javascript = True
no_stylesheets = True
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [
dict(name='h1', attrs={'class':'entry-title'}),
dict(name='div', attrs={'class':'entry-content clearfix'})
]
remove_tags = [
dict(name='div', attrs={'class':['br', 'hr', 'titlebar', 'navigation']}),
dict(name='dl', attrs={'class':'gallery-item'}),
dict(name=['object','link'])
]
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Padre y Decano | Club Atlético Peñarol', u'http://www.padreydecano.com/cms/feed/')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup


@ -1,16 +0,0 @@
from calibre.web.feeds.news import CalibrePeriodical
class PCQ(CalibrePeriodical):
title = 'PCQuest'
calibre_periodicals_slug = 'pc-quest-india'
description = '''
Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
market-leading selection and implementation guide for the latest
technologies: servers, business apps, security, open source, gadgets and
more. To subscribe, visit <a
href="http://news.calibre-ebook.com/periodical/pc-quest-india">calibre
Periodicals</a>.
'''
language = 'en_IN'

Some files were not shown because too many files have changed in this diff.