Merge branch 'master' of git://github.com/kovidgoyal/calibre
.gitignore (vendored)
@@ -14,7 +14,6 @@ build
 dist
 docs
 resources/localization
-resources/images.qrc
 resources/scripts.pickle
 resources/ebook-convert-complete.pickle
 resources/builtin_recipes.xml
@@ -42,3 +41,4 @@ calibre_plugins/
 recipes/*.mobi
 recipes/*.epub
 recipes/debug
+/.metadata/
Changelog.yaml (944 additions)
@@ -20,6 +20,950 @@
 # new recipes:
 # - title:

- version: 1.8.0
  date: 2013-10-25

  new features:
    - title: "DOCX Input: Support linked (as opposed to embedded) images, if the linked image is found on the local computer."
      tickets: [1243597]

    - title: 'FB2 Input: Add support for note and cite back references. Link pairs of type="note" and type="cite" now automatically generate the correct back reference.'
      tickets: [1243714]

    - title: "When automerging books during an add, include the author as well as the title in the report of merged books."

    - title: "OS X Mavericks (10.9) breaks connecting to iTunes and iBooks on iOS devices. For more details see: http://www.mobileread.com/forums/showthread.php?t=215624"

  bug fixes:
    - title: "OS X: Fix system tray notifications causing crashes on some OS X 10.9 (Mavericks) systems (those that had Growl installed at some point)."
      tickets: [1224491]

    - title: "OS X: Fix font size in completion popups being too small on Mavericks (I hope)"
      tickets: [1243761]

    - title: "PDF Output: Fix rendering of some semi-transparent images. All semi-transparent images are now rendered using soft masks."
      tickets: [1243829]

    - title: "MOBI Output: Fix text marked with white-space:pre-wrap causing the Kindle to break lines at arbitrary points inside words."
      tickets: [1240235]

    - title: "FB2 Input: Fix a regression that broke conversion of FB2 files with paragraphs having both a style and an id attribute."
      tickets: [1243709]

    - title: "TXT Input: Ensure that <title> in the generated HTML has a meaningful value."
      tickets: [1236923]

    - title: "Book details panel: Fix HTML in author names and identifiers not being escaped"
      tickets: [1243976]

    - title: "HTML 5 parsing: Fix handling of xml:lang attributes: xml:lang is now mapped to a plain lang attribute on all elements, not just <html>"

    - title: "Update the HTML 5 parser used in calibre (html5lib-python) to fix a few corner cases"

    - title: "When bulk deleting formats, use a single temporary directory for the deleted files. This makes restoring them from the recycle bin a little cleaner. Also might fix the reported issue with the Windows recycle bin choking on creating a large number of folders."

    - title: "DOCX Input: Add support for hyperlink fields that have only anchors and not URLs"

    - title: "DOCX Input: Fix handling of multiple block level bookmarks at the same location."
      tickets: [1241451]

    - title: "HTMLZ Output: Fix inline CSS from <body> not being applied."
      tickets: [1242261]

    - title: "Fix the restore database operation failing on Windows installs with long usernames (this would cause the path to the temporary folder used to restore the database to become too long)."

    - title: "ODT Input: Various workarounds for broken ODT files generated by mk4ht"

    - title: "Fix a bug with non-ascii text in the create catalog dialog"
      tickets: [1241515]

  improved recipes:
    - A List Apart

- version: 1.7.0
  date: 2013-10-18

  new features:
    - title: "Cover grid: Allow using images as the background for the cover grid. To choose an image, go to Preferences->Look & Feel->Cover Grid."
      tickets: [1239194]

    - title: "An option to mark newly added books with a temporary mark. Option is in Preferences->Adding books."
      tickets: [1238609]

    - title: "Edit metadata dialog: Allow turning off the cover size displayed in the bottom right corner of the cover by right clicking the cover and choosing 'Hide cover size'. It can be restored the same way."

  bug fixes:
    - title: "Conversion: If both the embed font family option and the filter css option to remove fonts are set, do not remove the font specified by the embed font family option."

    - title: "Fix a few remaining situations that could cause the formats column to show an error message about SHLock"

    - title: "Make deleting books to the recycle bin more robust. Ensure that the temporary directory created during the move to recycle bin process is not left behind in case of errors."

    - title: "Windows: Check if the books' files are in use before deleting"

    - title: "Fix the custom device driver swap main and card option not working. Also fix swapping not happening for a few devices on Linux"
      tickets: [1240504]

    - title: "Edit metadata dialog: The Edit metadata dialog currently limits its max size based on the geometry of the smallest attached screen. Change that to use the geometry of the screen on which it will be shown."
      tickets: [1239597]

    - title: "HTMLZ Output: Fix <style> tag placed inside <body> instead of <head>."
      tickets: [1239530]

    - title: "HTMLZ Output: Fix inline styles not escaping quotes properly."
      tickets: [1239527]

    - title: "HTMLZ Output: Fix incorrect handling of some self closing tags like <br>."
      tickets: [1239555]

    - title: "Content server: Fix single item categories not working with a reverse proxy setup."
      tickets: [1238987]

    - title: "Fix a bug that could cause calibre to crash when switching from a large library to a smaller library with marked books."
      tickets: [1239210]

    - title: "Get Books: Fix downloading of some books in formats that do not have metadata yielding nonsense titles"

    - title: "Allow the marked book button to be added to the main toolbar when a device is connected"
      tickets: [1239163]

    - title: "Fix error if a marked book is deleted/merged."
      tickets: [1239161]

    - title: "Template language: Fix the formatter function days_between to compute the right value when the answer is negative."

    - title: "Windows: Fix spurious file in use by other process errors if the book's folder contained multiple hard links pointing to the same file"
      tickets: [1240788, 1240194]

    - title: "Windows: Fix duplicate files being created in very special circumstances when changing title and/or author (the title or author had to be between 31 and 35 characters long and the book entry had to have been created by a pre 1.x version of calibre). You can check if your library has any such duplicates, and remove them, by using the Check Library tool (right click the calibre button on the toolbar and select Library Maintenance->Check Library)."

  improved recipes:
    - Wall Street Journal
    - Newsweek Polska
    - Wired Magazine
    - cracked.com
    - Television Without Pity
    - Carta
    - Diagonales

- version: 1.6.0
  date: 2013-10-11

  new features:
    - title: "Temporary marking of books in the library"
      description: "This allows you to select books from your calibre library manually and mark them. This 'mark' will remain until you restart calibre, or clear the marks. You can easily work with only the marked subset of books by right clicking the Mark Books button. To use this feature, go to Preferences->Toolbars and add the 'Mark Books' tool to the main toolbar."
      type: major

    - title: "Get Books: Add Wolne Lektury and Amazon (Canada) ebook stores"

    - title: "DOCX Input: Handle hyperlinks in footnotes and endnotes"
      tickets: [1232790]

    - title: "Driver for Sunstech reader"
      tickets: [1231590]

    - title: "Allow using both uri: and url: identifiers to create two different arbitrary links, instead of just one, in the Book details panel"

    - title: "E-book viewer: Make all keyboard shortcuts configurable"
      tickets: [1232019]

    - title: "Conversion: Add an option to not condense CSS rules for margin, padding, border, etc. The option is under the Look & Feel section of the conversion dialog."
      tickets: [1233220]

    - title: "calibredb: Allow setting of the title sort field"
      tickets: [1233711]
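      # A possible command line route to this field, sketch only (the
      # set_metadata field name `sort` and the argument order are assumed,
      # not taken from this entry):
      #   calibredb set_metadata 123 --field sort:"Title Sort, The"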

    - title: "ebook-meta: Add an --identifier option to set identifiers."
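      # Possible usage, sketch only (the name:value form of the new flag is
      # assumed, not taken from this entry):
      #   ebook-meta book.epub --identifier isbn:9780061122415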

  bug fixes:
    - title: "Fix a locking error when composite columns containing formats are used and formats are added/deleted."
      tickets: [1233330]

    - title: "EPUB Output: Do not strip <object> tags with type application/svg+xml in addition to those that use image/svg+xml."
      tickets: [1236845]

    - title: "Cover grid: Fix selecting all books with Ctrl+A causing subsequent deselects to not fully work."
      tickets: [1236348]

    - title: "HTMLZ Output: Fix long titles causing an error when converting on Windows."
      tickets: [1235815]

    - title: "Content server: Fix OPDS category links to composite columns"

    - title: "E-book viewer: Fix a regression that broke import/export of bookmarks"
      tickets: [1231980]

    - title: "E-book viewer: Use the default font size setting for the dictionary view as well."
      tickets: [1232025]

    - title: "DOCX Input: Avoid using the value attribute for simple numbered lists, to silence the asinine epubcheck"

    - title: "HTML Input: Images linked by the poster attribute of the <video> tag are now recognized and processed."

    - title: "DOCX Input: Fix error when converting DOCX files that have numbering defined with no associated character style."
      tickets: [1232100]

    - title: "EPUB metadata: Implement updating of identifiers other than ISBN in the EPUB file when polishing or exporting the EPUB from calibre"

    - title: "Amazon metadata download: Fix parsing of some dates on amazon.de"
      tickets: [1238125]

  improved recipes:
    - National Geographic Magazine
    - New York Review of Books
    - Focus (PL)
    - Carta Capital
    - AM 730
    - Ming Pao (HK)
    - Neu Osnabrucker Zeitung

  new recipes:
    - title: Various Uruguayan news sources
      author: Carlos Alves

- version: 1.5.0
  date: 2013-09-26

  new features:
    - title: "Driver for Woxter Scriba"
      tickets: [1228690]

    - title: "Bulk metadata edit: Allow setting the comments for all selected books and also allow cloning the covers. Cloning covers means that the cover of the first selected book will be set for all other selected books."
      tickets: [1230040]

  bug fixes:
    - title: "Windows: Improved device ejection code. Eject individual drives before trying to eject the device. This fixes incomplete ejection with the Nook devices."

    - title: "Catalogs: Fix exclude tags rules not working in non-English locales when creating catalogs in EPUB/MOBI formats."
      tickets: [1228949]

    - title: "Kobo driver: Fix reading status being cleared when connecting to a Kobo with older firmware and metadata management set to automatic."
      tickets: [1230018]

    - title: "Content server: Sort virtual libraries by name"
      tickets: [1229459]

    - title: "DOCX Input: Convert tabs in the docx file into non-breaking spaces in the output document. Note that custom tab stops are not supported."
      tickets: [1228893]

    - title: "Conversion: Handle the style attribute on the <html> tag"

    - title: "Handle databases with invalid ratings link tables"
      tickets: [1228517]

    - title: "DOCX Input: Handle DOCX files with missing referenced styles"
      tickets: [1228669]

    - title: "Update the amazon metadata download plugin for changes to the Amazon website"

  improved recipes:
    - Slate
    - El Universal (VE)
    - GoComics

  new recipes:
    - title:

- version: 1.4.0
  date: 2013-09-20

  new features:
    - title: "Column icons: Allow the use of multiple icons with column icon rules."
      description: "You can now have column icon rules display multiple icons in a single column, side by side. There are two ways to do this: either specify multiple icons when creating the rule, or create multiple rules that match the same book and specify the icon type to be 'composed' for every rule. See Preferences->Look & Feel->Column icons for details."

    - title: "Kobo driver: Add support for the new cover handling in the Kobo Aura with updated firmware"

    - title: "Driver for Icarus Essence"
      tickets: [1226304]

    - title: "Show a warning when attempting to copy books between libraries that do not have the same set of custom columns."
      tickets: [1225484]

    - title: "EPUB/AZW3 Output: Use shorthand forms for margin, padding and border CSS properties, where possible"

    - title: "Allow colons in identifier values, needed for using URIs as identifiers"
      tickets: [1224885]

    - title: "Comments editor: Allow treating arbitrary URLs as images"

    - title: "Show the full path of the library under the mouse in the status bar when switching/renaming/deleting libraries via the calibre library button."
      tickets: [1224925]

    - title: "DOCX Input: Add support for embedded EMF images that are just wrappers around an actual raster image."
      tickets: [1224849]

  bug fixes:
    - title: "Conversion: Fix font subsetting not working for large fonts with more than 4000 glyphs, such as CJK fonts"

    - title: "HTML Input: Fix a regression that broke processing of HTML files that contain meta tags with dc: namespaced attribute values."

    - title: "Fix switching to an empty virtual library not blanking the book details panel"

    - title: "Keep position when deleting tags in the tag editor"
      tickets: [1226093]

    - title: "Book details panel: Render custom comments fields the same as the builtin comments field. In particular this fixes problems caused by wide text and images in custom comments fields."
      tickets: [1226350]

    - title: "Metadata jackets: Do not error out when using a custom template with some custom fields that are not present."
      tickets: [1225357]

    - title: "AZW3 Output: Don't choke on invalid (undecodable) links in the input document"

    - title: "Cover grid: Respect the double click on library view tweak"

    - title: "Fix covers set by drag and drop, or pasting in the edit metadata dialog, showing compression artifacts due to aggressive jpeg compression"

    - title: "Conversion: Fix a bug that could cause incorrect border values to be used when cascading, shorthand border CSS is present."

    - title: "Fix a regression in 1.3 that caused the book list to not track the current book when using Next/Previous in the edit metadata dialog."

  improved recipes:
    - Liberation
    - Politika

  new recipes:
    - title: Sage News
      author: Brian Hahn

    - title: Il Cambiamento
      author: ghib9
- version: 1.3.0
  date: 2013-09-13

  new features:
    - title: "When doing searches or switching between virtual libraries in the main book list, preserve the current book. The currently selected book will remain visible if it is present in the results of the search or the selected virtual library."
      tickets: [1216713]

    - title: "Drivers for Oppo Find 5 and PocketBook Mini 515"
      tickets: [1223853]

  bug fixes:
    - title: "DOCX Input: Handle numbered paragraphs where the numbering is specified in the paragraph style, instead of on the paragraph directly. Also support the use of arbitrary, styled text for bullets."

    - title: "DOCX Input: Fix a single line break at the end of a paragraph not being rendered as a blank line."

    - title: "DOCX Input: Fix extra top/bottom margins around headings when the heading style in Word does not specify any top/bottom margins."

    - title: "DOCX Input: Handle images in footnotes and endnotes."
      tickets: [1221686]

    - title: "ODT Input: Only treat the first image as a cover if it is of suitable size, instead of any image in the document."
      tickets: [1224157]

    - title: "Book polishing: Do not leave behind the old comments when updating metadata if the comments have been deleted in calibre."

    - title: "PDF Output: Fix non-breaking space characters incorrectly rendered in the PDF outline."
      tickets: [1223862]

    - title: "Content server: Fix error in the OPDS feed after using virtual libraries in the main server."
      tickets: [1222108]

    - title: "Do not scroll the book list horizontally after editing metadata."
      tickets: [1221552]

    - title: "New database backend: Handle databases that contain multiple tags/series/publishers/etc. that differ only in case."
      tickets: [1221545]

  improved recipes:
    - Harvard Business Review
    - Jakarta Post
    - Jakarta Globe
    - Dilema Veche
    - Daily Express
    - Anandtech
    - High Country News

  new recipes:
    - title: Caravan Magazine
      author: Kovid Goyal

    - title: Phys Org
      author: Krittika Goyal

- version: 1.2.0
  date: 2013-09-06

  new features:
    - title: "Conversion: Add support for the CSS3 rem font size unit"

    - title: "MTP devices, such as Android tablets/phones: Allow ignoring any folder on the device, not just top level folders. For newly connected devices, also scan /Android/data/com.amazon.kindle for books by default (newer versions of the Kindle app place downloaded files there)."

    - title: "Speed up sorting when the book list is showing a restricted set of books, such as when the results of a search are displayed or a virtual library is used."
      tickets: [1217622]

    - title: "Edit metadata dialog: Add an undo option for the Trim cover button."
      tickets: [1219227]

  bug fixes:
    - title: "Amazon metadata download: Update to handle website changes at amazon.com"

    - title: "PDF Output: Workaround a bug in the library calibre uses to render HTML to PDF that caused text in some documents that used small-caps fonts to not render correctly."
      tickets: [1216354]

    - title: "Kobo driver: Fix the copied file being corrupt when a sideloaded kepub is added from a Kobo device to the calibre library as an EPUB."
      tickets: [1221035]

    - title: "Fix changing the user interface language in the welcome wizard causing some parts of the interface to remain in the old language until calibre is restarted."
      tickets: [1220767]

    - title: "Fix a regression in 1.0 that broke setting author names with the | character in them."
      tickets: [1220348]

    - title: "Content server: When running from inside the main calibre program, do not restrict the books shown based on the current virtual library in the main program. If you wish to restrict the books shown in the content server, use Preferences->Sharing over the net."

    - title: "Output dates in the local timezone instead of UTC when generating CSV catalogs"

    - title: "Library maintenance: When doing a check library, instead of dumping the database to SQL and restoring it, run a VACUUM. This works around various bugs in the dump and restore capabilities of apsw."
      tickets: [1217988]

    - title: "Edit metadata dialog: Fix adding an image to an empty comments block not working"

    - title: "Conversion: Fix font declarations with invalid font-family values causing conversion to abort when subsetting is enabled."

    - title: "MOBI Output: Fix conversion of some super/sub scripts failing if they are the first or last element in a paragraph."
      tickets: [1218858]

    - title: "New database backend: Various improvements to make the backend more robust against databases with invalid/corrupt data in them."
      tickets: [1218465, 1218783]

  improved recipes:
    - Countryfile

- version: 1.1.0
  date: 2013-08-30

  new features:
    - title: "Rewrite the HTML metadata parser to make it faster and more robust."
      tickets: [1217751]

    - title: "Book list: When sorting on a currently unsorted column, use the last applied sort for that column, instead of always sorting in ascending order."
      tickets: [1216714]

    - title: "PocketBook driver: Scan for book files on the entire device, not just in the 'books' folder"

  bug fixes:
    - title: "Fix a regression in 1.0 that could cause the dates in custom date-type columns to change in some timezones when using the edit metadata dialog to make unrelated changes."
      tickets: [1217096]

    - title: "When replacing formats in a book with a very long title+authors on Windows, calibre could leave behind the old format file, because the filename shortening algorithm has changed. Handle that case."

    - title: "Fix the content server giving an error if you return to the top level page after using the virtual libraries."
      tickets: [1216838]

    - title: "Fix calibredb not updating the running calibre instance properly in 1.0"
      tickets: [1218177]

    - title: "Fix a regression in 1.0 that broke splitting of multi-valued fields, like tags, into many items during a rename."
      tickets: [1216699]

    - title: "Fix a regression in 1.0 that caused an error when trying to set values for tags with the same item repeated, with different case."
      tickets: [1216398]

    - title: "Fix a regression that broke downloading news when the output format is set to PDF"

    - title: "Creating a catalog with an already existing catalog in the library would cause a temporary duplicate entry in the book list. Also fix the author sort for catalogs generated in the AZW3 format not being correct."

    - title: "EPUB metadata: When changing the title in an EPUB 2.0 file that has multiple titles, remove the extra titles."
      tickets: [1211949]

    - title: "Fix a regression in 1.0 that caused Search and Replace in the bulk metadata edit dialog to be much slower than before"

    - title: "Fix a regression in 1.0 that caused incorrect sorting and searching on some composite columns (columns built from other columns)."

    - title: "Fix a regression in 1.0 that prevented the moving of libraries inside calibre"
      tickets: [1216401]

    - title: "Virtual Library tabs: If the user activates a hidden tab via the Virtual Library button, change the name of the All Books tab to reflect the hidden virtual library."
      tickets: [1216174]

    - title: "Ignore text records in the database that are damaged, instead of erroring out. This lets the rest of the data be used."
      tickets: [1215981]

    - title: "Fix a regression that broke calibredb catalog when sorting on the id field."
      tickets: [1216090]

    - title: "HTML Input: Handle malformed OPF files when converting."
      tickets: [1215924]

    - title: "Ensure that the Formats custom column (if present) is updated when a new format is created as a result of a conversion."
      tickets: [1215885]

    - title: "Fix a bug in 1.0 that broke the Check Library function on computers with non-English locales."
      tickets: [1215819]

    - title: "Content server: Fix a blank username causing an error on startup."
      tickets: [1215893]

    - title: "Fix sorting of the book list by multi-valued fields, like tags, not being correct in the new backend."
      tickets: [1215820]

  improved recipes:
    - Daily Mirror

  new recipes:
    - title: VFR Magazine
      author: Krittika Goyal
- version: 1.0.0
  date: 2013-08-23

  new features:
    - title: "A new 'cover grid' view of the books in your calibre library"
      description: "Excellent for judging your books by their covers :) To use it, click the button with the icon of a grid in the bottom right corner of the main window. It can be configured via Preferences->Look & Feel->Cover Grid"
      type: major

    - title: "A new, faster database backend"
      description: "The database backend in calibre has been re-written from scratch. The new code is smaller, more robust and much faster than the old code. The exact speedup will depend on the number of books and the number and type of custom columns in your library. Users have reported calibre startup times decreasing by a factor of 2-3 times."
      type: major

    - title: "For a summary of the major changes in calibre between 0.9 and 1.0, see http://calibre-ebook.com/new-in/ten"
      type: major

    - title: "RTF Input: Add an option to ignore WMF images instead of replacing them with a placeholder."
      tickets: [1213599]

    - title: "Content server: Make virtual libraries available as searches from the start page. They work just like saved searches: clicking on a virtual library will show you all the books in that virtual library."

  bug fixes:
    - title: "Remove the extra, useless 'language' entry in the metadata download configuration dialog"

    - title: "Kobo driver: Display device collections even if the driver is configured to not manage shelves on the device."
      tickets: [1214233]

    - title: "Fix a typo in the calibre.desktop file on Linux"
      tickets: [1213664]

    - title: "Edit metadata dialog: Disable the OK button while results are being downloaded."
      tickets: [1213397]

    - title: "In OS X 10.8 Apple stopped redirecting stdout/stderr to Console.app for applications launched by launch services. Re-enable the redirection, useful for debugging."

    - title: "Fix virtual library tabs not being updated when using the VL button"

  improved recipes:
    - Consumerist
    - jeuxvideo
    - Metro UK
    - El Tribuno
    - High Country News
    - Daily Express
    - Providence Journal
    - mediapart

  new recipes:
    - title: News24 and Nuus24
      author: Nicki de Wet

- version: 0.9.44
  date: 2013-08-16

  new features:
    - title: "Add an option to display all virtual libraries as tabs above the book list."
      description: "Convenient to quickly switch between virtual libraries. To enable it, click the Virtual library button and select 'Show virtual libraries as tabs'. You can re-arrange the tabs by drag and drop, and close tabs you do not want. Right click on the tabs to restore closed tabs."

    - title: "An improved cover trimming algorithm to automatically detect and remove borders and extra space from the edge of cover images. To try it, use the 'Trim' button in the edit metadata dialog. This can sometimes remove too much, so if you don't like the result, just click Cancel. You can make the algorithm more or less aggressive via Preferences->Tweaks"

    - title: "Allow customizing the comic metadata reader plugin via Preferences->Plugins to read the series index from either the volume or the issue number of the comic."
      tickets: [1211433]

    - title: "Linux MTP driver: Add ids for some newer devices."
      tickets: [1212458]

    - title: "Add a trim cover option to the bulk metadata edit dialog"

    - title: "Make the book information dialog user resizable, with a splitter between the cover and the info panel. Also change the background of the cover panel for books that have been marked using the Temp marker plugin."
      tickets: [1209057]

    - title: "Driver for the Samsung Galaxy Young Android phone"
      tickets: [1212918]

  bug fixes:
    - title: "PDF Output: Do not abort conversion if the document being converted has an HTML cover (found in some broken EPUB files)."

    - title: "RTF Input: When converting RTF files with no codepage, use the input encoding setting as the codepage."
      tickets: [1163572]

  improved recipes:
    - The Independent
    - El Periodica de Aragon
    - El Correo

  new recipes:
    - title: Daily Express
      author: Dave Asbury

- version: 0.9.43
  date: 2013-08-09

  new features:
    - title: "TXT Input: Allow using various markdown extensions for more features when converting markdown formatted txt files. See http://pythonhosted.org/Markdown/extensions/index.html for details."
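      # Possible usage, sketch only (the option names --formatting-type and
      # --markdown-extensions are assumed, not taken from this entry):
      #   ebook-convert notes.txt notes.epub --formatting-type markdown --markdown-extensions footnotes,tables,toc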

    - title: "Sending by email: Allow sending by email to an arbitrary combination of email addresses. Access it via the 'Select recipients' menu entry in the Email To menu."
      tickets: [1207818]

    - title: "A new 'Sort By' action for the right click menu. This allows sorting on all columns in the library, not just the visible columns. To use it, go to Preferences->Toolbars and add it to 'The context menu for books in the calibre library'"

    - title: "Allow adding images into the comments field, by clicking on the insert link button in the comments editor in the edit metadata dialog."

    - title: "Allow skipping the confirm bulk reconvert dialog"

    - title: "EPUB Input: If the EPUB file identifies an actual cover image in addition to the titlepage html file, use the cover image instead of rendering the titlepage. This is faster and has the advantage that an EPUB to EPUB conversion preserves internal cover structure."

    - title: "Get Books: Improve searching by removing punctuation from title/authors before matching."

  bug fixes:
    - title: "Conversion: Fix empty inline tags that are the second child of a paragraph causing text to change location."
      tickets: [1207735]

    - title: "Fix the book count in the tooltip of the choose library button not updating"
      tickets: [1208217]

    - title: "Kobo driver: When deleting shelves that have been synced, the Activity entry for the shelf was not being deleted. This left a tile for the shelf on the home screen of the Glo and AuraHD."
      tickets: [1208159]

    - title: "Comments editor: The Insert Link button has no effect until the user clicks inside the comments box, therefore disable it until it is ready, to prevent confusion."
      tickets: [1208073]

    - title: "Get Books: Update various Polish store plugins"

  improved recipes:
    - The Sunday Times UK and The Times Online
    - Telegraph UK
    - "Le Monde: Edition abonnés"
    - The Scotsman

  new recipes:
    - title: Various French news sources
      author: Malah

    - title: La Capital de Rosario
      author: Darko Miletic

    - title: Jot Down
      author: desUbiKado

    - title: Private Eye
      author: Martyn Pritchard

- version: 0.9.42
  date: 2013-08-02

  new features:
    - title: "When downloading metadata from Amazon, convert the Amazon categories into tags. You can turn this off by going to Preferences->Metadata download and configuring the Amazon source."
      tickets: [1206763]

    - title: "Kobo driver: Add an option to modify the styling in books being sent to the device, based on a template on the device."
      tickets: [1207151]

    - title: "Get Books: Add support for two more Polish ebook stores: cdp.pl and ebooki.allegro.pl"

    - title: "calibredb: Add a new clone command to create clones of libraries with the same custom columns, virtual libraries, etc. as the current library."
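      # Possible usage, sketch only (the destination path is illustrative):
      #   calibredb clone /path/to/new-library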

  bug fixes:
    - title: "MOBI metadata: Do not fail to set metadata in MOBI files if they have EXTH fields with NULL pointers to a cover or thumbnail."
      tickets: [1205757]

    - title: "Fix editing of book metadata failing when its timestamp is out of range for the system."
      tickets: [1191599]

    - title: "Fix renaming a user category to the same name it already has erasing the user category."
      tickets: [1207131]

    - title: "Fix drag 'n drop of a cover onto the conversion dialog not working"

    - title: "Device drivers: Explicitly fsync() all files when writing to devices, to reduce the chances of file corruption if the device is disconnected while jobs are running"

    - title: "Fix calibre not appearing in Ubuntu's 'Open with..' menu"
      tickets: [1207518]

  improved recipes:
    - PC World

- version: 0.9.41
  date: 2013-07-27

  new features:
    - title: "Add a button to clear the current virtual library easily"

    - title: "Driver for Surftab Ventos"
      tickets: [1204885]

    - title: "E-book viewer: Allow re-ordering bookmarks in the bookmarks manager by drag and drop."

  bug fixes:
    - title: "DOCX Input: Fix conversion breaking for files that use heading style paragraphs to insert line rules"

    - title: "Content server: Fix the last search query not being fully sanitized in the results page"
      tickets: [1205385]

    - title: "Book polishing: Fix page margins being removed if an unused font was found during subsetting of embedded fonts."

    - title: "PDF Output: Do not error out when the input document uses a font that cannot be subset, such as the Symbol font. Instead print a warning and embed the full font."
      tickets: [1203449]

    - title: "Conversion: Fix a regression in the last release that broke conversion of a few files with comments just before a chapter start."
      tickets: [1188635]

  improved recipes:
    - Something Awful
    - Spektrum der Wissenschaft
    - mediapart.fr
    - Dilbert
    - Antyweb
    - Scientific American
    - taz.de (RSS)

  new recipes:
    - title: Blindbuch and No names, No jackets
      author: Armin Geller

    - title: El Tribuno Salta and Jujuy
      author: Darko Miletic

- version: 0.9.40
  date: 2013-07-19

  new features:
    - title: "EPUB Output: Add an option to insert an inline Table of Contents into the main text."
      tickets: [1201006]
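      # Possible usage, sketch only (the flag name --epub-inline-toc is
      # assumed, not taken from this entry):
      #   ebook-convert book.docx book.epub --epub-inline-toc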

    - title: "Driver for LG Android phone"
      tickets: [1202013]

    - title: "When matching books in the library against the device manually, pre-fill the search field with the book title"
      tickets: [1200826]

  bug fixes:
    - title: "PDF Input: Fix a regression that caused some images to be flipped when converting PDF files that use image rotation operators."
      tickets: [1201083]

    - title: "Fix a regression that caused an incorrect font size in dropcaps generated by the DOCX input plugin"

    - title: "Get Books: Fix searching for title and author returning some extra matches, if the title starts with an article like the, a or an."
      tickets: [1200012]

    - title: "PDF Output: Fix an extra blank page being inserted at the start of the chapter when converting some epub files from feedbooks"

    - title: "PDF Output: Workaround a bug in WebKit's getBoundingClientRect() method that could cause links to occasionally point to incorrect locations."
      tickets: [1202390]

    - title: "E-book viewer: Fix a bug that could cause the reported position to be incorrect immediately after opening a previously opened book. This also fixes the Back button not working if a link is clicked on the page immediately after opening the book."

    - title: "Fix the memory card not being detected for the Elonex 621 on Windows"

    - title: "Fix a regression in the last release that broke auto-conversion of ebooks when sending to device/sending by email."
      tickets: [1200864]

    - title: "Get Books: Update the amazon plugins for website changes"

    - title: "Allow using non-ascii chars in email passwords."
      tickets: [1202825]

  improved recipes:
    - Galaxy's Edge

  new recipes:
    - title: Il Foglio
      author: faber1971

    - title: Le Monde Diplomatique and Acrimed
      author: Gaetan Lehmann

- version: 0.9.39
  date: 2013-07-12

  new features:
    - title: "Bulk metadata edit: Add a checkbox to prevent the refreshing of the book list after the bulk edit. This means that the book list will not be resorted and any existing search/virtual library will not be refreshed. Useful if you have a large library, as the refresh can be slow."

    - title: "Allow manually marking a book in the calibre library as being on the device. To do so, click the device icon in calibre, then right click on the book you want marked and choose 'Match book to library'. Once you are done marking all the books, right click the device icon and choose 'Update cached metadata'"

    - title: "Driver for Coby Kyros MID1126"
      tickets: [1199410]

    - title: "When adding formats to an existing book, by right clicking the add books button, ask for confirmation if some formats will be overwritten."

    - title: "Add a tweak to restrict the list of output formats available in the conversion dialog. Go to Preferences->Tweaks to change it."

  bug fixes:
    - title: "Amazon metadata download: Update the plugin to deal with the new amazon.com website"

    - title: "Edelweiss metadata download plugin: Workaround for advanced search being broken at the Edelweiss website."

    - title: "Invalid data in the device database on Sony readers could cause errors when sorting device collections; ignore those errors."

    - title: "DOCX Input: Fix no page break being inserted before the last section."
      tickets: [1198414]

    - title: "Metadata download dialog: Have the OK button enabled in the results screen as well."
      tickets: [1198288]

    - title: "Get Books: Update the empik store plugin"

  improved recipes:
    - Houston Chronicle
    - cracked.com
    - mediapart.fr

  new recipes:
    - title: Glenn Brenwald and Ludwig von Mises Institute
      author: anywho

- version: 0.9.38
  date: 2013-07-05

  new features:
    - title: "Book polishing: Add an option to embed all referenced fonts when polishing books using the 'Polish Books' tool."
      tickets: [1196038]

    - title: "DOCX Input: Add support for clickable (hyperlinked) images"
      tickets: [1196728]

    - title: "DOCX Input: Insert page breaks at the start of every new section"
      tickets: [1196728]

    - title: "Drivers for Trekstor Pyrus Maxi and PocketBook Surfpad 2"
      tickets: [1196931, 1182850]

    - title: "DOCX Input: Add support for horizontal rules created by typing three hyphens and pressing enter."

  bug fixes:
    - title: "Fix detection of the SD Card in some PRS-T2N devices"
      tickets: [1197970]

    - title: "MOBI Input: Fix a regression that broke parsing of MOBI files with malformed markup that also used entities for apostrophes."
      tickets: [1197585]

    - title: "Get Books: Update the Woblink store plugin"

    - title: "Metadata download dialog: Prevent the buttons from being re-ordered when the Next button is clicked."

    - title: "PDF Output: Fix links that point to URLs with query parameters being mangled by the conversion process."
      tickets: [1197006]

    - title: "DOCX Input: Fix links pointing to locations in the same document that contain multiple, redundant bookmarks not working."

    - title: "EPUB/AZW3 Output: Fix splitting on page-break-after, with plain text immediately following the split point, causing the text to be added before rather than after the split point."
      tickets: [1196728]

    - title: "DOCX Input: Handle bookmarks defined at the paragraph level"
      tickets: [1196728]

    - title: "DOCX Input: Handle hyperlinks created as fields"
      tickets: [1196728]

  improved recipes:
    - iprofessional

  new recipes:
    - title: Democracy Now
      author: Antoine Beaupre

- version: 0.9.37
  date: 2013-06-28

  new features:
    - title: "Conversion: Add option to embed all referenced fonts"
      type: major
      description: "Add an option to embed all fonts that are referenced in the input document but are not already embedded. This will search your system for the referenced font, and if found, the font will be embedded. Only works if the output format supports font embedding (for example: EPUB or AZW3). The option is under the Look & Feel section of the conversion dialog."
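      # A possible command line equivalent, sketch only (the flag name
      # --embed-all-fonts is assumed, not taken from this entry):
      #   ebook-convert book.html book.epub --embed-all-fonts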

    - title: "ToC Editor: When generating a ToC from files, if the file has no text, do not skip it. Instead create an entry using the filename of the file."

    - title: "AZW3 Input: Add support for the page-progression-direction attribute that is used to indicate page turns should happen from right to left. The attribute is passed into the EPUB when converting."
      tickets: [1194766]

    - title: "ebook-convert: Add a --from-opf option to read metadata from OPF files directly, instead of having to run ebook-meta --from-opf after conversion"
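      # Possible usage, sketch only (file names are illustrative):
      #   ebook-convert book.docx book.epub --from-opf metadata.opf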

  bug fixes:
    - title: "PDF Output: Fix the Table of Contents being added to the end of the PDF even without the Add Table of Contents option being enabled."
      tickets: [1194836]

    - title: "When auto-merging books on add, also merge identifiers."

    - title: "Fix an error when using the Template Editor to create a template that uses custom columns."
      tickets: [1193763]

    - title: 'LRF Output: Fix &quot; entities in attribute values causing problems'

    - title: "News download: Apply the default page margin conversion settings. Also, when converting to PDF, apply the PDF conversion defaults."
      tickets: [1193912]

    - title: "Fix a regression that broke scanning for books on all devices that used the Aluratek Color driver."
      tickets: [1192940]

    - title: "fetch-ebook-metadata: Fix the --opf argument erroneously requiring a value"

    - title: "When waiting before sending email, log the wait."
      tickets: [1195173]

  improved recipes:
    - taz.de (RSS)
    - Miradas al sur
    - Frontline
    - La Nacion (Costa Rica)

- version: 0.9.36
  date: 2013-06-21

  new features:
    - title: "DOCX Input: Support for a Table of Contents created using the Word Table of Contents tool. calibre now first looks for such a Table of Contents, and only if one is not found does it generate a ToC from headings."

    - title: "DOCX Input: Add support for images used as bullets in lists"

    - title: "DOCX Input: If a large image that looks like a cover is present at the start of the document, remove it and use it as the cover of the output ebook. This can be turned off under the DOCX Input section of the conversion dialog."

    - title: "When dropping files onto the Book Details panel, ask for confirmation before adding the files to the book. The confirmation can be disabled."

    - title: "News download: Add the 'downloaded from' link at the bottom of every article when using a touchscreen output profile (like the Tablet profile)."

    - title: "E-book viewer: Change the bookmark button to always popup a menu when clicked; makes accessing existing bookmarks easier."

    - title: "After a bulk metadata download, focus the review button on the popup notification, instead of the OK button."
      tickets: [1190931]

  bug fixes:
    - title: "DOCX Input: Hide text that has been marked as not being visible in the web view in Word."

    - title: "DOCX Input: When converting docx files with large numbers of unnamed images, do not crash on Windows."
      tickets: [1191354]

    - title: "DOCX Input: Add support for the Word setting 'No space between paragraphs with the same style'."
      tickets: [119100]

    - title: "MOBI Output: Fix rendering of SVG images that embed large raster images in 64bit calibre installs."
      tickets: [1191020]

    - title: "HTMLZ Output: Fix handling of images with URL unsafe filenames."
      tickets: [1192687]

    - title: "Fix being unable to change the case of a previously used search because of the search history."

    - title: "When searching, allow the use of uppercase location names, such as AUTHOR instead of author, automatically lowercasing them."
      tickets: [1192785]

    - title: "DOCX metadata: When reading covers from DOCX files, use the first image as specified in the actual markup instead of just the first image in the container."

    - title: "Kobo driver: Fix a regression when deleting empty shelves on Kobo devices with older firmware."
      tickets: [1192441]

    - title: "Do not show builtin plugins in the get new plugins dialog. If a builtin plugin with the same name as a third party plugin existed, the builtin plugin was displayed in the get new plugins dialog as installed (happened with the new DOCX Input plugin)."

    - title: "Apple driver: When in synchronous mode (direct to iBooks), disable PDF transfers, as we can't update metadata in iTunes. Not sure when this started, but as of iTunes 11.0.4 it's broken."

    - title: "Get Books: Fix an error when using the internal browser on some systems"
      tickets: [1191199]

  improved recipes:
    - The Walrus Mag
    - Various Polish news sources

  new recipes:
    - title: Various Polish news sources
      author: fenuks

- version: 0.9.35
  date: 2013-06-14

@@ -24,3 +24,10 @@ Development
 A [tarball of the source code](http://status.calibre-ebook.com/dist/src) for the
 current calibre release.
+
+Bugs
+------
+
+Bug reports and feature requests should be made in the calibre bug tracker at [launchpad](https://bugs.launchpad.net/calibre).
+The GitHub bug tracker is only for people contributing code to calibre.
+

imgsrc/marked.svg (new file, 162 lines)
@@ -0,0 +1,162 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||||
|
|
||||||
|
<svg
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:cc="http://creativecommons.org/ns#"
|
||||||
|
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
xmlns:svg="http://www.w3.org/2000/svg"
|
||||||
|
xmlns="http://www.w3.org/2000/svg"
|
||||||
|
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||||
|
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||||
|
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||||
|
width="128"
|
||||||
|
height="128"
|
||||||
|
id="svg2"
|
||||||
|
version="1.1"
|
||||||
|
inkscape:version="0.48.4 r9939"
|
||||||
|
sodipodi:docname="marked.svg"
|
||||||
|
inkscape:export-filename="/home/kovid/work/calibre/resources/images/marked.png"
|
||||||
|
inkscape:export-xdpi="90"
|
||||||
|
inkscape:export-ydpi="90">
|
||||||
|
<title
|
||||||
|
id="title3847">Pushpin Icon</title>
|
||||||
|
<defs
|
||||||
|
id="defs4">
|
||||||
|
<linearGradient
|
||||||
|
id="linearGradient3782">
|
||||||
|
<stop
|
||||||
|
style="stop-color:#000000;stop-opacity:1;"
|
||||||
|
offset="0"
|
||||||
|
id="stop3784" />
|
||||||
|
<stop
|
||||||
|
style="stop-color:#c3c3c0;stop-opacity:1;"
|
||||||
|
offset="1"
|
||||||
|
id="stop3786" />
|
||||||
|
</linearGradient>
|
||||||
|
<linearGradient
|
||||||
|
inkscape:collect="always"
|
||||||
|
xlink:href="#linearGradient3782"
|
||||||
|
id="linearGradient3813"
|
||||||
|
gradientUnits="userSpaceOnUse"
|
||||||
|
gradientTransform="matrix(0.70710678,-0.70710678,0.70710678,0.70710678,-18.805519,996.21376)"
|
||||||
|
x1="58"
|
||||||
|
y1="91"
|
||||||
|
x2="73"
|
||||||
|
y2="91" />
|
||||||
|
<filter
|
||||||
|
id="filter3014"
|
||||||
|
inkscape:label="Ridged border"
|
||||||
|
inkscape:menu="Bevels"
|
||||||
|
inkscape:menu-tooltip="Ridged border with inner bevel"
|
||||||
|
color-interpolation-filters="sRGB">
|
||||||
|
<feMorphology
|
||||||
|
id="feMorphology3016"
|
||||||
|
radius="4.3"
|
||||||
|
in="SourceAlpha"
|
||||||
|
result="result91" />
|
||||||
|
<feComposite
|
||||||
|
id="feComposite3018"
|
||||||
|
in2="result91"
|
||||||
|
operator="out"
|
||||||
|
in="SourceGraphic" />
|
||||||
|
<feGaussianBlur
|
||||||
|
id="feGaussianBlur3020"
|
||||||
|
result="result0"
|
||||||
|
stdDeviation="1.2" />
|
||||||
|
<feDiffuseLighting
|
||||||
|
id="feDiffuseLighting3022"
|
||||||
|
diffuseConstant="1"
|
||||||
|
result="result92">
|
||||||
|
<feDistantLight
|
||||||
|
id="feDistantLight3024"
|
||||||
|
elevation="66"
|
||||||
|
azimuth="225" />
|
||||||
|
</feDiffuseLighting>
|
||||||
|
<feBlend
|
||||||
|
id="feBlend3026"
|
||||||
|
in2="SourceGraphic"
|
||||||
|
mode="multiply"
|
||||||
|
result="result93" />
|
||||||
|
<feComposite
|
||||||
|
id="feComposite3028"
|
||||||
|
in2="SourceAlpha"
|
||||||
|
operator="in" />
|
||||||
|
</filter>
|
||||||
|
</defs>
|
||||||
|
<sodipodi:namedview
|
||||||
|
id="base"
|
||||||
|
pagecolor="#ffffff"
|
||||||
|
bordercolor="#666666"
|
||||||
|
borderopacity="1.0"
|
||||||
|
inkscape:pageopacity="0.0"
|
||||||
|
inkscape:pageshadow="2"
|
||||||
|
inkscape:zoom="5.6568542"
|
||||||
|
inkscape:cx="30.580486"
|
||||||
|
inkscape:cy="63.624717"
|
||||||
|
inkscape:document-units="px"
|
||||||
|
inkscape:current-layer="layer1"
|
||||||
|
showgrid="true"
|
||||||
|
inkscape:snap-smooth-nodes="false"
|
||||||
|
inkscape:window-width="1920"
|
||||||
|
inkscape:window-height="1058"
|
||||||
|
inkscape:window-x="0"
|
||||||
|
inkscape:window-y="22"
|
||||||
|
inkscape:window-maximized="0"
|
||||||
|
inkscape:snap-bbox="false"
|
||||||
|
inkscape:object-paths="true"
|
||||||
|
inkscape:snap-midpoints="false"
|
||||||
|
inkscape:snap-global="true">
|
||||||
|
<inkscape:grid
|
||||||
|
empspacing="5"
|
||||||
|
visible="true"
|
||||||
|
enabled="true"
|
||||||
|
snapvisiblegridlinesonly="true"
|
||||||
|
type="xygrid"
|
||||||
|
id="grid2985" />
|
||||||
|
</sodipodi:namedview>
|
||||||
|
<metadata
|
||||||
|
id="metadata7">
|
||||||
|
<rdf:RDF>
|
||||||
|
<cc:Work
|
||||||
|
rdf:about="">
|
||||||
|
<dc:format>image/svg+xml</dc:format>
|
||||||
|
<dc:type
|
||||||
|
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||||
|
<dc:title>Pushpin Icon</dc:title>
|
||||||
|
<dc:creator>
|
||||||
|
<cc:Agent>
|
||||||
|
<dc:title>Kovid Goyal</dc:title>
|
||||||
|
</cc:Agent>
|
||||||
|
</dc:creator>
|
||||||
|
<dc:rights>
|
||||||
|
<cc:Agent>
|
||||||
|
<dc:title>Public domain</dc:title>
|
||||||
|
</cc:Agent>
|
||||||
|
</dc:rights>
|
||||||
|
</cc:Work>
|
||||||
|
</rdf:RDF>
|
||||||
|
</metadata>
|
||||||
|
<g
|
||||||
|
inkscape:label="Layer 1"
|
||||||
|
inkscape:groupmode="layer"
|
||||||
|
id="layer1"
|
||||||
|
transform="translate(0,-924.36218)">
|
||||||
|
<path
|
||||||
|
style="fill:#f39509;fill-opacity:1;stroke:#7a6822;stroke-opacity:1;stroke-width:0;stroke-miterlimit:4;stroke-dasharray:none;filter:url(#filter3014)"
|
||||||
|
d="m 1.9128912,974.70018 49.4974748,-49.49747 -7.071068,21.2132 31.819805,17.67767 24.433067,-3.85121 -63.639613,63.63963 3.851207,-24.43308 -17.677669,-31.81981 z"
|
||||||
|
id="path3088"
|
||||||
|
inkscape:connector-curvature="0"
|
||||||
|
sodipodi:nodetypes="ccccccccc"
|
||||||
|
inkscape:export-xdpi="90"
|
||||||
|
inkscape:export-ydpi="90" />
|
||||||
|
<path
|
||||||
|
style="fill:url(#linearGradient3813);fill-opacity:1;stroke:none"
|
||||||
|
d="M 63.925974,996.92087 120,1042.5389 74.532576,986.31427"
|
||||||
|
id="path3097"
|
||||||
|
inkscape:connector-curvature="0"
|
||||||
|
sodipodi:nodetypes="ccc"
|
||||||
|
inkscape:export-xdpi="90"
|
||||||
|
inkscape:export-ydpi="90" />
|
||||||
|
</g>
|
||||||
|
</svg>
7412
imgsrc/tweak.svg
Normal file
@ -537,25 +537,38 @@ Set the :guilabel:`Level 1 TOC` setting to ``//h:h2``. Then, for chapter two, |a
How options are set/saved for Conversion
-------------------------------------------

There are two places where conversion options can be set in |app|. The first is
in Preferences->Conversion. These settings are the defaults for the conversion
options. Whenever you try to convert a new book, the settings set here will be
used by default.

You can also change settings in the conversion dialog for each book conversion.
When you convert a book, |app| remembers the settings you used for that book,
so that if you convert it again, the saved settings for the individual book
will take precedence over the defaults set in Preferences. You can restore the
individual settings to defaults by using the Restore to defaults button in the
individual book conversion dialog. You can remove the saved settings for a
group of books by selecting all the books and then clicking the edit metadata
button to bring up the bulk metadata edit dialog; near the bottom of the dialog
is an option to remove stored conversion settings.

When you Bulk Convert a set of books, settings are taken in the following order (last one wins):

* From the defaults set in Preferences->Conversion

* From the saved conversion settings for each book being converted (if
  any). This can be turned off by the option in the top left corner of the
  Bulk Conversion dialog.

* From the settings set in the Bulk conversion dialog

Note that the final settings for each book in a Bulk Conversion will be saved
and re-used if the book is converted again. Since the highest priority in Bulk
Conversion is given to the settings in the Bulk Conversion dialog, these will
override any book specific settings. So you should only bulk convert books
together that need similar settings. The exceptions are metadata and input
format specific settings. Since the Bulk Conversion dialog does not have
settings for these two categories, they will be taken from book specific
settings (if any) or the defaults.
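
As a quick illustration of that precedence, here is a toy sketch in Python (the option names and values are made up, not calibre's actual settings)::

    # Bulk Conversion precedence: later sources win
    defaults = {'base_font_size': 10, 'margin_top': 5}  # Preferences->Conversion
    per_book = {'base_font_size': 12}                   # saved settings for this book
    bulk_dialog = {'margin_top': 3}                     # Bulk Conversion dialog

    effective = {}
    for source in (defaults, per_book, bulk_dialog):
        effective.update(source)  # each update overrides earlier values
    print(effective)  # {'base_font_size': 12, 'margin_top': 3}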

.. note::
@ -772,9 +785,11 @@ size. By default, |app| uses a page size defined by the current
:guilabel:`Output profile`. So if your output profile is set to Kindle, |app|
will create a PDF with page size suitable for viewing on the small kindle
screen. However, if you view this PDF file on a computer screen, then it will
appear to have fonts that are too large. To create "normal" sized PDFs, use the
:guilabel:`Override page size` option under :guilabel:`PDF Output` in the conversion dialog.

Headers and Footers
^^^^^^^^^^^^^^^^^^^^

You can insert arbitrary headers and footers on each page of the PDF by
specifying header and footer templates. Templates are just snippets of HTML
code that get rendered in the header and footer locations. For example, to
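
A minimal sketch of such a footer template, assuming the ``_PAGENUM_`` placeholder that calibre substitutes when rendering (the styling is illustrative)::

    <p style="text-align:center">_PAGENUM_</p>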
@ -813,6 +828,9 @@ the page will be used.
bottom margins to large enough values, under the Page Setup section of the
conversion dialog.

Printable Table of Contents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You can also insert a printable Table of Contents at the end of the PDF that
lists the page numbers for every section. This is very useful if you intend to
print out the PDF to paper. If you wish to use the PDF on an electronic device,
@ -92,6 +92,11 @@ The first thing to note is that this zip file has a lot more files in it, explai
**about.txt**
    A text file with information about the plugin

**translations**
    A folder containing .mo files with the translations of the user
    interface of your plugin into different languages. See below for
    details.

Now let's look at the code.

__init__.py
@ -175,6 +180,42 @@ You can see the ``prefs`` object being used in main.py:
.. literalinclude:: plugin_examples/interface_demo/main.py
    :pyobject: DemoDialog.config

Adding translations to your plugin
--------------------------------------

You can have all the user interface strings in your plugin translated and
displayed in whatever language is set for the main calibre user interface.

The first step is to go through your plugin's source code and mark all user
visible strings as translatable, by surrounding them in _(). For example::

    action_spec = (_('My plugin'), None, _('My plugin is cool'), None)

Then use some program to generate .po files from your plugin source code. There
should be one .po file for every language you want to translate into. For
example: de.po for German, fr.po for French and so on. You can use the
`poedit <http://www.poedit.net/>`_ program for this.

Send these .po files to your translators. Once you get them back, compile them
into .mo files. You can again use poedit for that, or just do::

    calibre-debug -c "from calibre.translations.msgfmt import main; main()" filename.po

Put the .mo files into the ``translations`` folder in your plugin.

The last step is to simply call the function `load_translations()` at the top
of your plugin's .py files. For performance reasons you should only call this
function in those .py files that actually have translatable strings. So in a
typical User Interface plugin you would call it at the top of ``ui.py`` but not
``__init__.py``.

You can test the translations of your plugins by changing the user interface
language in calibre under Preferences->Look & Feel or by running calibre like
this::

    CALIBRE_OVERRIDE_LANG=de calibre

Replace ``de`` with the language code of the language you want to test.
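
As a sketch, the top of a translated plugin's ``ui.py`` might look like this (the class and strings are illustrative; ``load_translations()`` and ``_()`` are assumed to be injected by calibre's plugin loader)::

    load_translations()  # must run before the _() calls below are evaluated

    from calibre.gui2.actions import InterfaceAction

    class DemoInterfacePlugin(InterfaceAction):
        name = 'Demo Plugin'
        # these strings end up in the .po files and are translated at runtime
        action_spec = (_('My plugin'), None, _('My plugin is cool'), None)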

The plugin API
--------------------------------
@ -83,7 +83,6 @@ def generate_calibredb_help(preamble, info):

    global_options = '\n'.join(render_options('calibredb', groups, False, False))

    lines, toc = [], []
    for cmd in COMMANDS:
        args = []
@ -99,7 +98,7 @@ def generate_calibredb_help(preamble, info):
        usage = [i for i in usage.replace('%prog', 'calibredb').splitlines()]
        cmdline = '    '+usage[0]
        usage = usage[1:]
        usage = [re.sub(r'(%s)([^a-zA-Z0-9])'%cmd, r':command:`\1`\2', i) for i in usage]
        lines += ['.. code-block:: none', '', cmdline, '']
        lines += usage
        groups = [(None, None, parser.option_list)]
@ -152,7 +151,6 @@ def generate_ebook_convert_help(preamble, info):
        prog = 'ebook-convert-'+(pl.name.lower().replace(' ', '-'))
        raw += '\n\n' + '\n'.join(render_options(prog, groups, False, True))

    update_cli_doc(os.path.join('cli', 'ebook-convert.rst'), raw, info)

def update_cli_doc(path, raw, info):
@ -200,7 +198,8 @@ def cli_docs(app):
    for script in entry_points['console_scripts'] + entry_points['gui_scripts']:
        module = script[script.index('=')+1:script.index(':')].strip()
        cmd = script[:script.index('=')].strip()
        if cmd in ('calibre-complete', 'calibre-parallel'):
            continue
        module = __import__(module, fromlist=[module.split('.')[-1]])
        if hasattr(module, 'option_parser'):
            documented_cmds.append((cmd, getattr(module, 'option_parser')()))
@ -260,3 +259,4 @@ def setup(app):
def finished(app, exception):
    pass
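
To see why the ``re.sub`` change above matters, here is a small self-contained illustration (the input line is made up)::

    import re

    cmd, line = 'add', 'calibredb add file1 (books are added to the library)'
    # old approach: also rewrites 'add' inside longer words such as 'added'
    print(line.replace(cmd, ':command:`%s`' % cmd))
    # new approach: the ([^a-zA-Z0-9]) group requires a non-word character
    # right after the command name, so 'added' is left alone
    print(re.sub(r'(%s)([^a-zA-Z0-9])' % cmd, r':command:`\1`\2', line))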
@ -30,10 +30,13 @@ Environment variables
* ``CALIBRE_OVERRIDE_DATABASE_PATH`` - allows you to specify the full path to metadata.db. Using this variable you can have metadata.db be in a location other than the library folder. Useful if your library folder is on a networked drive that does not support file locking.
* ``CALIBRE_DEVELOP_FROM`` - Used to run from a calibre development environment. See :ref:`develop`.
* ``CALIBRE_OVERRIDE_LANG`` - Used to force the language used by the interface (ISO 639 language code)
* ``CALIBRE_NO_NATIVE_FILEDIALOGS`` - Causes calibre to not use native file dialogs for selecting files/directories. Set it to 1 to enable.
* ``SYSFS_PATH`` - Use if sysfs is mounted somewhere other than /sys
* ``http_proxy`` - Used on linux to specify an HTTP proxy

See `How to set environment variables in windows <http://www.computerhope.com/issues/ch000549.htm>`_ or
`How to set environment variables in OS X <http://blog.dowdandassociates.com/content/howto-set-an-environment-variable-in-mac-os-x-home-slash-dot-macosx-slash-environment-dot-plist/>`_.
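
For example, on linux or OS X a variable can be set for a single run from a terminal (the path is illustrative)::

    CALIBRE_OVERRIDE_DATABASE_PATH=/mnt/nas/calibre/metadata.db calibre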
Tweaks
------------
@ -46,17 +49,31 @@ The default values for the tweaks are reproduced below
Overriding icons, templates, et cetera
----------------------------------------

|app| allows you to override the static resources, like icons, javascript and
templates for the metadata jacket, catalogs, etc. with customized versions that
you like. All static resources are stored in the resources sub-folder of the
calibre install location. On Windows, this is usually :file:`C:/Program Files/Calibre2/resources`.
On OS X, :file:`/Applications/calibre.app/Contents/Resources/resources/`. On linux, if
you are using the binary installer from the calibre website it will be
:file:`/opt/calibre/resources`. These paths can change depending on where you
choose to install |app|.

You should not change the files in this resources folder, as your changes will
get overwritten the next time you update |app|. Instead, go to
:guilabel:`Preferences->Advanced->Miscellaneous` and click
:guilabel:`Open calibre configuration directory`. In this configuration directory, create a
sub-folder called resources and place the files you want to override in it.
Place the files in the appropriate sub folders, for example place images in
:file:`resources/images`, etc. |app| will automatically use your custom file
in preference to the built-in one the next time it is started.

For example, if you wanted to change the icon for the :guilabel:`Remove books`
action, you would first look in the built-in resources folder and see that the
relevant file is :file:`resources/images/trash.png`. Assuming you have an
alternate icon in PNG format called :file:`mytrash.png` you would save it in
the configuration directory as :file:`resources/images/trash.png`. All the
icons used by the calibre user interface are in :file:`resources/images` and
its sub-folders.

Customizing |app| with plugins
--------------------------------
@ -49,7 +49,7 @@ All the |app| python code is in the ``calibre`` package. This package contains t
* Metadata reading, writing, and downloading is all in ``ebooks.metadata``
* Conversion happens in a pipeline, for the structure of the pipeline,
  see :ref:`conversion-introduction`. The pipeline consists of an input
  plugin, various transforms and an output plugin. The code that constructs
  and drives the pipeline is in :file:`plumber.py`. The pipeline works on a
  representation of an ebook that is like an unzipped epub, with
  manifest, spine, toc, guide, html content, etc. The
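
As a toy sketch of that pipeline shape (this is not calibre's actual API, just the flow described above)::

    def input_plugin(path):  # parses the input format
        return {'spine': [path], 'manifest': [path]}  # stand-in for the book

    def flatten_css(book):  # one of the transforms
        book['css_flattened'] = True
        return book

    def output_plugin(book, outpath):  # serializes the output format
        print('writing', outpath, 'from', book)

    book = input_plugin('book.epub')
    for transform in (flatten_css,):
        book = transform(book)
    output_plugin(book, 'book.mobi')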
102
manual/faq.rst
@ -499,11 +499,17 @@ that allows you to create collections on your Kindle from the |app| metadata. It
I am getting an error when I try to use |app| with my Kobo Touch/Glo/etc.?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The Kobo has very buggy firmware. Connecting to it has been known to fail at
random. Certain combinations of motherboard, USB ports/cables/hubs can
exacerbate this tendency to fail. If you are getting an error when connecting
to your touch with |app| try the following, each of which has solved the
problem for *some* |app| users.

* Connect the Kobo directly to your computer, not via USB Hub
* Try a different USB cable and a different USB port on your computer
* Try a different computer, in particular the Kobo does not work well with
  some Windows XP machines. If you are on Windows XP, try a computer with a
  newer version of Windows.
* Try upgrading the firmware on your Kobo Touch to the latest
* Try resetting the Kobo (sometimes this cures the problem for a little while, but then it re-appears, in which case you have to reset again and again)
* Try only putting one or two books onto the Kobo at a time and do not keep large collections on the Kobo
@ -622,13 +628,29 @@ should fix by hand.
The list of books in |app| is blank!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In order to understand why that happened, you have to understand what a |app|
library is. At the most basic level, a |app| library is just a folder. Whenever
you add a book to |app|, that book's files are copied into this folder
(arranged into sub folders by author and title). Inside the |app| library
folder, at the top level, you will see a file called metadata.db. This file is
where |app| stores the metadata like title/author/rating/tags etc. for *every*
book in your |app| library. The list of books that |app| displays is created by
reading the contents of this metadata.db file.

There can be two reasons why |app| is showing an empty list of books:

* Your |app| library folder changed its location. This can happen if it was
  on an external disk and the drive letter for that disk changed. Or if you
  accidentally moved the folder. In this case, |app| cannot find its library
  and so starts up with an empty library instead. To remedy this, do a
  right-click on the |app| icon in the |app| toolbar and select Switch/create
  library. Click the little blue icon to select the new location of your
  |app| library and click OK.

* Your metadata.db file was deleted/corrupted. In this case, you can ask
  |app| to rebuild the metadata.db from its backups. Right click the |app|
  icon in the |app| toolbar and select Library maintenance->Restore database.
  |app| will automatically rebuild metadata.db.

I am getting errors with my calibre library on a networked drive/NAS?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -701,7 +723,13 @@ Take your pick:

Why does |app| show only some of my fonts on OS X?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

|app| embeds fonts in ebook files it creates. Ebook files support embedding
only TrueType and OpenType (.ttf and .otf) fonts. Most fonts on OS X systems
are in .dfont format, thus they cannot be embedded. |app| shows only TrueType
and OpenType fonts found on your system. You can obtain many such fonts on the
web. Simply download the .ttf/.otf files and add them to the Library/Fonts
directory in your home directory.

|app| is not starting on Windows?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -763,6 +791,13 @@ There are several possible things I know of, that can cause this:
  that prevent 64-bit |app| from working properly. If you are using the 64-bit
  version of |app| try switching to the 32-bit version.

* If the crashes happen specifically when you are using a file open dialog,
  like clicking on the Add Books button or the Save to Disk button, then
  you may have an issue with the Windows file open dialogs on your
  computer. You can tell calibre to use its own file open dialogs by
  setting the environment variable ``CALIBRE_NO_NATIVE_FILEDIALOGS=1``.
  See `How to set environment variables in windows <http://www.computerhope.com/issues/ch000549.htm>`_.

If none of the above apply to you, then there is some other program on your
computer that is interfering with |app|. First reboot your computer in safe
mode, to have as few running programs as possible, and see if the crashes still
@ -776,6 +811,31 @@ The only way to find the culprit is to eliminate the programs one by one and
see which one is causing the issue. Basically, stop a program, run calibre,
check for crashes. If they still happen, stop another program and repeat.

Using the viewer or doing any conversions results in a permission denied error on windows
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Something on your computer is preventing calibre from accessing its own
temporary files. Most likely the permissions on your Temp folder are incorrect.
Go to the folder :file:`C:\\Users\\USERNAME\\AppData\\Local` in Windows
Explorer and then right click on the :file:`Temp` folder, select Properties and go to
the Security tab. Make sure that your user account has full control for this
folder.

Some users have reported that running the following command in an Administrator
Command Prompt fixed their permissions. To get an Administrator Command Prompt
search for cmd.exe in the start menu, then right click on the command prompt
entry and select Run as Administrator. At the command prompt type the following
command and press Enter::

    icacls "%appdata%\..\Local\Temp" /reset /T

Alternately, you can run calibre as Administrator, but doing so will cause
some functionality, such as drag and drop, to not work.

Finally, some users have reported that disabling UAC fixes the problem.

|app| is not starting on OS X?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -816,9 +876,10 @@ My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The first thing to check is that you are downloading |app| from the official
website: `<http://calibre-ebook.com/download>`_. Make sure you are clicking the
download links on the left, not the advertisements on the right. |app| is a
very popular program and unscrupulous people try to set up websites offering it
for download to fool the unwary.

If you have the official download and your antivirus program is still claiming
|app| is a virus, then your antivirus program is wrong. Antivirus programs use
@ -880,10 +941,25 @@ Why doesn't |app| have an automatic update?

For many reasons:

* *There is no need to update every week*. If you are happy with how |app|
  works turn off the update notification and be on your merry way. Check back
  to see if you want to update once a year or so. There is a check box to
  turn off the update notification, on the update notification itself.

* |app| downloads currently use `about 100TB of bandwidth a month
  <http://status.calibre-ebook.com/downloads>`_. Implementing automatic
  updates would greatly increase that and end up costing thousands of dollars
  a month, which someone has to pay. And |app| is currently growing at `half
  a million new installs a month <https://status.calibre-ebook.com>`_.

* If I implement a dialog that downloads the update and launches it, instead
  of going to the website as it does now, that would save the most ardent
  |app| updater, *at most five clicks a week*. There are far higher priority
  things to do in |app| development.

* If you really, really hate downloading |app| every week but still want to
  be up to the latest, I encourage you to run from source, which makes
  updating trivial. Instructions are :ref:`available here <develop>`.

How is |app| licensed?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -62,7 +62,13 @@ Add books
The :guilabel:`Add books` action can read metadata from a wide variety of ebook formats. In addition, it tries to guess metadata from the filename.
See the :ref:`config_filename_metadata` section, to learn how to configure this.

To add an additional format for an existing book you can do any of three things:

1. Drag and drop the file onto the book details panel on the right side of the main window

2. Right click the Add books button and choose :guilabel:`Add files to selected books`.

3. Click the red add books button in the top right area of the :guilabel:`Edit Metadata` dialog, accessed by the :ref:`edit_meta_information` action.

.. _edit_meta_information:
@ -593,6 +599,8 @@ Calibre has several keyboard shortcuts to save you time and mouse movement. Thes
      - Toggle Book Details panel
    * - :kbd:`Alt+Shift+T`
      - Toggle Tag Browser
    * - :kbd:`Alt+Shift+G`
      - Toggle Cover Grid
    * - :kbd:`Alt+A`
      - Show books by the same author as the current book
    * - :kbd:`Alt+T`
@ -38,6 +38,8 @@ Sections
    glossary


.. REMOVE_IN_PDF

The main |app| user interface
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -1,4 +1,3 @@
.. include:: global.rst

.. _regexptutorial:
@ -26,7 +25,7 @@ There are a few places |app| uses regular expressions. There's the Search & Repl
What on earth *is* a regular expression?
------------------------------------------------

A regular expression is a way to describe sets of strings. A single regular expression can *match* a number of different strings. This is what makes regular expressions so powerful -- they are a concise way of describing a potentially large number of variations.
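
For instance, one expression can stand in for many concrete strings (an illustrative pattern, not one used later in this tutorial)::

    import re

    # one expression, many matching strings; re.IGNORECASE makes it case
    # insensitive, as discussed in the note below
    pattern = re.compile(r'chapter\s+\d+', re.IGNORECASE)
    for s in ('Chapter 1', 'chapter  42', 'CHAPTER 7'):
        print(bool(pattern.search(s)))  # True for all three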

.. note:: I'm using string here in the sense it is used in programming languages: a string of one or more characters, characters including actual characters, numbers, punctuation and so-called whitespace (linebreaks, tabulators etc.). Please note that generally, uppercase and lowercase characters are not considered the same, thus "a" being a different character from "A" and so forth. In |app|, regular expressions are case insensitive in the search bar, but not in the conversion options. There's a way to make every regular expression case insensitive, but we'll discuss that later. It gets complicated because regular expressions allow for variations in the strings they match, so one expression can match multiple strings, which is why people bother using them at all. More on that in a bit.
manual/resources/simple_donate_button.gif
Normal file
After Width: | Height: | Size: 2.1 KiB |
@ -104,7 +104,7 @@ Save this adapter as :file:`calibre-wsgi-adpater.py` somewhere your server will
|

Let's suppose that we want to use WSGI in Apache. First enable WSGI in Apache by adding the following to :file:`httpd.conf`::

    LoadModule wsgi_module modules/mod_wsgi.so

The exact technique for enabling the wsgi module will vary depending on your Apache installation. Once you have the proxy modules enabled, add the following rules to httpd.conf (or, if you are using virtual hosts, to the conf file for the virtual host in question)::
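
A minimal sketch of such a rule with mod_wsgi (the mount point and path are illustrative, not the manual's exact configuration)::

    WSGIScriptAlias /calibre /var/www/calibre-wsgi-adpater.py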
@ -16,16 +16,13 @@
<div class="body">
{% if not embedded %}
<div id="ad-container" style="text-align:center">
<script async="async" src="http://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<ins class="adsbygoogle"
     style="display:inline-block;width:728px;height:90px"
     data-ad-client="ca-pub-5939552585043235"
     data-ad-slot="7580893187"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
</div>
{% endif %}
@ -62,7 +59,7 @@
<form action="https://www.paypal.com/cgi-bin/webscr" method="post" title="Contribute to support calibre development">
<input type="hidden" name="cmd" value="_s-xclick" />
<input type="hidden" name="hosted_button_id" value="AF4H3B8QVDG6N" />
<input type="image" src="_static/simple_donate_button.gif" border="0" name="submit" alt="Contribute to support calibre development" style="border:0pt" />
<img alt="" border="0" src="https://www.paypalobjects.com/en_GB/i/scr/pixel.gif" width="1" height="1" />
</form>
<hr/>
@ -94,6 +94,13 @@ You can quickly use the current search as a temporary virtual library by
clicking the :guilabel:`Virtual Library` button and choosing the
:guilabel:`*current search` entry.

You can display all available virtual libraries as tabs above the book list.
This is particularly handy if you like switching between virtual libraries very
often. Click the :guilabel:`Virtual Library` button and select :guilabel:`Show
virtual libraries as tabs`. You can re-arrange the tabs by drag and drop and
close ones you do not want to see. Closed tabs can be restored by
right-clicking on the tab bar.

Using additional restrictions
-------------------------------
50
recipes/10minutos.recipe
Normal file
@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
10minutos.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    title = '10minutos'
    __author__ = 'Carlos Alves'
    description = 'Noticias de Salto - Uruguay'
    tags = 'news, sports'
    language = 'es_UY'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(name='div', attrs={'class':'post-content'})]

    remove_tags = [
        dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}),
        dict(name='p', attrs={'class':'post-meta'}),
        dict(name=['object','link'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''
    feeds = [
        (u'Articulos', u'http://10minutos.com.uy/feed/')
    ]

    def get_cover_url(self):
        return 'http://10minutos.com.uy/a/img/logo.png'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
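
To try a recipe like this locally, it can be passed straight to calibre's conversion tool (output name illustrative; ``--test`` fetches only a couple of articles per feed)::

    ebook-convert 10minutos.recipe 10minutos.epub --test -vv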
30
recipes/acrimed.recipe
Normal file
@ -0,0 +1,30 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2012'
'''
acrimed.org
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class Acrimed(BasicNewsRecipe):
    title = u'Acrimed'
    __author__ = 'Gaëtan Lehmann'
    oldest_article = 30
    max_articles_per_feed = 100
    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="crayon article-chapo-4112 chapo"]'
    language = 'fr'
    masthead_url = 'http://www.acrimed.org/IMG/siteon0.gif'
    feeds = [(u'Acrimed', u'http://www.acrimed.org/spip.php?page=backend')]

    preprocess_regexps = [
        (re.compile(r'<title>(.*) - Acrimed \| Action Critique M.*dias</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) - Acrimed \| Action Critique M.*dias</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>')]

    extra_css = """
        .chapo{font-style:italic; margin: 1em 0 0.5em}
        """
@ -3,10 +3,10 @@ from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''

'''
Change Log:
2013/09/28 -- update due to website redesign, add cover
2013/03/30 -- first version
'''

@ -15,7 +15,7 @@ from calibre.utils.date import now as nowf
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
@ -32,18 +32,17 @@ class AppleDaily(BasicNewsRecipe):
    encoding = 'utf-8'
    auto_cleanup = False
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    description = 'http://www.am730.com.hk'
    category = 'Chinese, News, Hong Kong'
    masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} photocaption {font-size:50%; margin-left:auto; margin-right:auto;}'
    keep_only_tags = [dict(name='div', attrs={'id':'articleHeader'}),
                      dict(name='div', attrs={'class':'thecontent wordsnap'}),
                      dict(name='a', attrs={'class':'lightboximg'})]
    remove_tags = [dict(name='img', attrs={'src':'/images/am730_article_logo.jpg'}),
                   dict(name='img', attrs={'src':'/images/am_endmark.gif'})]

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
@ -84,6 +83,16 @@ class AppleDaily(BasicNewsRecipe):
    def get_weekday(self):
        return self.get_dtlocal().weekday()

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.am730.com.hk')
        cover = 'http://www.am730.com.hk/' + soup.find(attrs={'id':'mini_news_img'}).find('img').get('src', False)
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
@ -93,48 +102,17 @@ class AppleDaily(BasicNewsRecipe):
    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.am730.com.hk/')
        optgroups = soup.findAll('optgroup')
        for optgroup in optgroups:
            sectitle = optgroup.get('label')
            articles = []
            for option in optgroup.findAll('option'):
                articlelink = "http://www.am730.com.hk/" + option.get('value')
                title = option.string
                articles.append({'title': title, 'url': articlelink})
            feeds.append((sectitle, articles))
        return feeds

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
@ -288,3 +266,4 @@ class AppleDaily(BasicNewsRecipe):
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
@@ -12,26 +12,30 @@ class anan(BasicNewsRecipe):

     title = 'Anandtech'
     description = 'comprehensive Hardware Tests'
-    __author__ = 'Oliver Niesner' # 2012-09-20 AGE: update
+    __author__ = 'Oliver Niesner, Armin Geller' # 2013-09-07 AGE: update
     use_embedded_content = False
     language = 'en'
     timefmt = ' [%d %b %Y]'
-    oldest_article = 7 # 2012-09-20 AGE: update
+    oldest_article = 7
     max_articles_per_feed = 40
     no_stylesheets = True
     remove_javascript = True
     encoding = 'utf-8'

-    cover_url = 'http://www.anandtech.com/content/images/globals/header_logo.png' # 2012-09-20 AGE: new
-    masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png' # 2012-09-20 AGE: update
+    cover_url = 'http://www.anandtech.com/content/images/globals/header_logo.png'
+    masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png'

+    keep_only_tags = [
+        dict(name='section', attrs={'class':['main_cont']}),
+    ]
+    remove_tags=[ # 2013-09-07 AGE: update
+        dict(name='div', attrs={'class':['print', # logo
+                                         'breadcrumb_area noprint',
+                                         'fl-rt noprint',
+                                         'blog_top_right',]})
+    ]

-    remove_tags=[
-        dict(name='a', attrs={'class': 'bluebutton noprint'}),
-        dict(name='img', attrs={'alt': 'header'}),
-    ] # 2012-09-20 AGE: update
-
-    feeds = [ ('Anandtech', 'http://www.anandtech.com/rss/')]
+    feeds = [('Anandtech', 'http://www.anandtech.com/rss/')]

     def print_version(self,url):
-        return url.replace('0Cshow0C', '0Cprint0C') # 2012-09-20 AGE: update
+        return url.replace("0Cshow0C", "0Cprint0C") # 2013-09-07 AGE: update
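calibre calls `print_version` once per article URL before downloading, so the whole printable-page redirect above is a one-line substring swap. A standalone check of the rewrite (the sample URL shape is illustrative, not a real feed path):

# Standalone check of the print_version rewrite above; the sample URL
# shape is illustrative only.
def print_version(url):
    return url.replace("0Cshow0C", "0Cprint0C")

assert print_version("http://rss.example/0Cshow0C7384.htm") == "http://rss.example/0Cprint0C7384.htm"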
@@ -21,21 +21,9 @@ class AntywebRecipe(BasicNewsRecipe):
     simultaneous_downloads = 3

     keep_only_tags =[]
-    keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'}))
-    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'}))
+    keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'entry-title '}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-content'}))
+    extra_css = '''body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}'''

-    remove_tags =[]
-    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'}))
-    remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'}))
-    remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'}))
-    remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'}))
-
-    extra_css = '''
-        body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
-    '''

     feeds = [
         (u'Artykuly', u'feed://feeds.feedburner.com/Antyweb?format=xml'),
63
recipes/blind_buch_de.recipe
Normal file
@@ -0,0 +1,63 @@
+#
+# Written:      July 2013
+# Last Edited:  2013-07-11
+# Version: 1.0
+# Last update: 2013-07-25
+#
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Armin Geller'
+
+'''
+Fetch blindenbuch.de
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class AdvancedUserRecipe1303841067(BasicNewsRecipe):
+
+    title = u'Blindbuch - Bücher neu entdecken'
+    __author__ = 'Armin Geller' # AGe 2013-07-11
+    description = u'Bücher blind präsentiert'
+    publisher = 'blindbuch.de'
+    publication_type = 'ebook news'
+    tags = 'Bücher, Literatur, E-Books, Germany'
+    timefmt = ' [%a, %d %b %Y]'
+    publication_type = 'Feed'
+    language = 'de-DE'
+    encoding = 'utf-8'
+
+    oldest_article = 14
+    max_articles_per_feed = 100
+
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_javascript = True
+
+    conversion_options = {'title' : title,
+                          'comments' : description,
+                          'tags' : tags,
+                          'language' : language,
+                          'publisher' : publisher,
+                          'authors' : publisher,
+                         }
+
+    cover_url = 'http://blindbuch.de/img/blindbuch_calibre.png'
+    masthead_url = 'http://www.blindbuch.de/img/Masterhead.JPG'
+
+    extra_css = '''
+        h1{font-weight:bold;font-size:large;}
+        .post-meta {font-size: 1em;text-align: left; font-style: italic}
+    '''
+
+    keep_only_tags = [
+        dict(name='article')
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['su-spoiler su-spoiler-style-1','post-comments comments',]}),
+        dict(name='span', attrs={'class':['post-comments comments',]}),
+        dict(name='div', attrs={'addthis':['title',]}),
+    ]
+
+    feeds = [(u'Blindbuch', u'http://www.blindbuch.de/feed/')]
92
recipes/caravan_magazine.recipe
Normal file
@@ -0,0 +1,92 @@
+import html5lib
+from lxml import etree
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.utils.cleantext import clean_xml_chars
+
+def is_title(tag):
+    return tag.name == 'h2' and tag.parent.name == 'div' and tag.parent['class'] == 'left-corner'
+
+class CaravanMagazine(BasicNewsRecipe):
+
+    title = 'Caravan Magazine'
+    __author__ = 'Kovid Goyal'
+    description = 'An Indian Journal of politics and culture'
+    language = 'en_IN'
+    timefmt = ' [%b, %Y]'
+
+    no_stylesheets = True
+
+    keep_only_tags = [
+        dict(name=is_title),
+        dict(attrs={'class':['subhheading', 'authorndate', 'full-image-view', 'fullpage-body']}),
+    ]
+    remove_tags = [
+        dict(attrs={'class':['share-with']}),
+        dict(attrs={'class':lambda x: x and 'thumb-image-view' in x}),
+    ]
+
+    def preprocess_raw_html(self, raw_html, url):
+        root = html5lib.parse(
+            clean_xml_chars(raw_html), treebuilder='lxml',
+            namespaceHTMLElements=False)
+        for s in root.xpath('//script'):
+            s.getparent().remove(s)
+        return etree.tostring(root, encoding=unicode)
+
+    def preprocess_html(self, soup):
+        # Handle the image thumbnails
+        for div in soup.findAll('div', attrs={'class':lambda x: x and x.startswith('show-image')}):
+            if div['class'] == 'show-image':
+                div.extract()
+            else:
+                div['style'] = 'page-break-inside:avoid'
+
+        return soup
+
+    # To parse article toc
+    def parse_index(self):
+        raw = self.index_to_soup(
+            'http://caravanmagazine.in/current-issue', raw=True)
+        raw = raw.decode('utf-8')
+        raw = self.preprocess_raw_html(raw, None)
+        soup = self.index_to_soup(raw)
+
+        a = soup.find('a', rel=lambda x:x and '[field_c_issues_image]' in x)
+        if a is not None:
+            self.cover_url = a['href']
+
+        ci = soup.find(attrs={'class': 'current-issue-block'})
+        current_section = 'Section'
+        current_articles = []
+        feeds = []
+        for div in ci.findAll(
+                attrs={'class': ['view-header', 'view-content']}):
+            if div['class'] == 'view-header':
+                if current_articles:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(div).replace('paging_filter', '')
+                current_articles = []
+                self.log('Section:', current_section)
+            else:
+                for art in div.findAll('div', attrs={'class': lambda x: x and 'views-row' in x.split()}):
+                    title = div.find(attrs={'class': 'views-field-title'})
+                    if title is not None:
+                        a = title.find('a', href=True)
+                        if a is not None:
+                            href = a['href']
+                            if href.startswith('/'):
+                                href = 'http://caravanmagazine.in' + href
+                            article = {
+                                'title': self.tag_to_string(title), 'url': href}
+                            title.extract()
+                            desc = self.tag_to_string(div).strip()
+                            if desc:
+                                article['description'] = desc
+                            current_articles.append(article)
+                            self.log('\t' + article['title'])
+                            self.log('\t\t' + article['url'])
+
+        if current_articles:
+            feeds.append((current_section, current_articles))
+
+        return feeds
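The `parse_index` above walks alternating `view-header`/`view-content` blocks to build the issue TOC. All calibre requires of the return value is a list of `(section_title, list_of_articles)` pairs, each article being a dict with at least `title` and `url`. A minimal sketch of that shape (the section name, URL and description below are placeholders, not real Caravan content):

# Minimal sketch of the structure parse_index must return; the section
# name, URL and description are placeholders.
def parse_index(self):
    return [
        ('Reportage', [
            {'title': 'Sample article',
             'url': 'http://caravanmagazine.in/sample-article',
             'description': 'Optional one-line teaser'},
        ]),
    ]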
@@ -12,7 +12,7 @@ class Carta(BasicNewsRecipe):

     title = u'Carta'
     description = 'News about electronic publishing'
-    __author__ = 'Oliver Niesner'
+    __author__ = 'Oliver Niesner' # AGe Update 2013-10-13
     use_embedded_content = False
     timefmt = ' [%a %d %b %Y]'
     oldest_article = 7
@@ -25,7 +25,7 @@ class Carta(BasicNewsRecipe):

-    remove_tags_after = [dict(name='p', attrs={'class':'tags-blog'})]
+    remove_tags_after = [dict(name='div', attrs={'id':'BlogContent'})] # AGe

     remove_tags = [dict(name='p', attrs={'class':'print'}),
                    dict(name='p', attrs={'class':'tags-blog'}),
@@ -1,23 +1,29 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
 from calibre.web.feeds.news import BasicNewsRecipe

-class AdvancedUserRecipe1312361378(BasicNewsRecipe):
-    title = u'Carta capital'
-    __author__ = 'Pablo Aldama'
+class AdvancedUserRecipe1380852962(BasicNewsRecipe):
+    title = u'Carta Capital'
+    __author__ = 'Erico Lisboa'
     language = 'pt_BR'
-    oldest_article = 9
+    oldest_article = 15
     max_articles_per_feed = 100
+    auto_cleanup = True
+    use_embedded_content = False

-    feeds = [(u'Politica', u'http://www.cartacapital.com.br/category/politica/feed')
-            ,(u'Economia', u'http://www.cartacapital.com.br/category/economia/feed')
-            ,(u'Cultura', u'http://www.cartacapital.com.br/category/cultura/feed')
-            ,(u'Internacional', u'http://www.cartacapital.com.br/category/internacional/feed')
-            ,(u'Saude', u'http://www.cartacapital.com.br/category/saude/feed')
-            ,(u'Sociedade', u'http://www.cartacapital.com.br/category/sociedade/feed')
-            ,(u'Tecnologia', u'http://www.cartacapital.com.br/category/tecnologia/feed')
-            ,(u'Carta na escola', u'http://www.cartacapital.com.br/category/carta-na-escola/feed')
-            ,(u'Carta fundamental', u'http://www.cartacapital.com.br/category/carta-fundamental/feed')
-            ,(u'Carta verde', u'http://www.cartacapital.com.br/category/carta-verde/feed')
-            ]
-    def print_version(self, url):
-        return url + '/print'
+    feeds = [(u'Pol\xedtica',
+              u'http://www.cartacapital.com.br/politica/politica/rss'), (u'Economia',
+              u'http://www.cartacapital.com.br/economia/economia/atom.xml'),
+             (u'Sociedade',
+              u'http://www.cartacapital.com.br/sociedade/sociedade/atom.xml'),
+             (u'Internacional',
+              u'http://www.cartacapital.com.br/internacional/internacional/atom.xml'),
+             (u'Tecnologia',
+              u'http://www.cartacapital.com.br/tecnologia/tecnologia/atom.xml'),
+             (u'Cultura',
+              u'http://www.cartacapital.com.br/cultura/cultura/atom.xml'),
+             (u'Sa\xfade', u'http://www.cartacapital.com.br/saude/saude/atom.xml'),
+             (u'Educa\xe7\xe3o',
+              u'http://www.cartacapital.com.br/educacao/educacao/atom.xml')]
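The rewrite above drops the old `print_version` hook entirely and leans on `auto_cleanup = True`, which applies calibre's readability-style heuristics to extract the article body with no hand-written selectors. The minimal shape of such a recipe, with a placeholder feed URL:

# Minimal shape of an auto_cleanup recipe like the rewrite above; the
# title and feed URL are placeholders.
from calibre.web.feeds.news import BasicNewsRecipe

class MinimalAutoCleanup(BasicNewsRecipe):
    title = u'Example'
    language = 'pt_BR'
    oldest_article = 15
    auto_cleanup = True          # readability-style body extraction
    use_embedded_content = False
    feeds = [(u'Feed', u'http://example.com/rss')]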
@@ -1,3 +1,5 @@
+## Last Edit: 2013-08-23
+## From: Armin Geller
 __license__ = 'GPL v3'
 __copyright__ = '2010, NA'
 '''
@@ -18,33 +20,30 @@ class Consumerist(BasicNewsRecipe):
     encoding = 'utf-8'
     use_embedded_content = False
     language = 'en'
-    masthead_url = 'http://consumerist.com/css/images/footer_man.gif'
+    masthead_url = 'http://consumermediallc.files.wordpress.com/2013/02/consumerist.png' # AGe 2013-08-23

     extra_css = '''
        body{font-family: "Lucida Grande",Helvetica,Arial,sans-serif}
        img{margin-bottom: 1em}
        h1{font-family :Arial,Helvetica,sans-serif; font-size:x-large}
        h2{font-family :Arial,Helvetica,sans-serif; font-size:large}
     '''
     conversion_options = {
-        'comment'   : description
-        , 'tags'      : category
-        , 'publisher' : publisher
-        , 'language'  : language
+        'comment'   : description,
+        'tags'      : category,
+        'publisher' : publisher,
+        'language'  : language,
     }

     remove_attributes = ['width','height']
-    #keep_only_tags = [dict(attrs={'class':['', 'category-breadcrumb']}),]
-    remove_tags_before = dict(name='h2')
-
-    remove_tags = [
-        #dict(name='iframe'),
-        dict(name='div', attrs={'class':['e-comments', 'more-about', 'entry-tags']}),
-        #dict(name='div', attrs={'id':['IEContainer', 'clickIncludeBox']}),
-        #dict(name='ul', attrs={'class':'article-tools'}),
-        #dict(name='ul', attrs={'class':'articleTools'}),
-    ]
-
-    remove_tags_after = dict(attrs={'class':'e-body'})
+    keep_only_tags = dict(name='div', attrs={'class':['hfeed',]}) # AGe 2013-08-23
+
+    remove_tags = [dict(name='div', attrs={'class':['navigation', # AGe 2013-08-23
+                                                    'wpcom-related-posts widget widget_related_posts', # AGe 2013-08-23
+                                                    'sharedaddy sd-like-enabled sd-sharing-enabled',]}), # AGe 2013-08-23
+                   dict(name='div', attrs={'id':['comments',]}), # AGe 2013-08-23
+                   ]

     feeds = [(u'Articles', u'http://consumerist.com/index.xml')]
@@ -20,27 +20,22 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     #article_already_exists = False
     #feed_hash = ''
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.countryfile.com/magazine')
-        cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px_wide')})#'width' : '160',
-        print '&&&&&&&& ',cov,' ***'
-        cov=str(cov)
-        #cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
-        cov2 = re.findall('/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
-
-        cov2 = str(cov2)
-        cov2= "http://www.countryfile.com"+cov2[2:len(cov2)-8]
-
-        print '******** ',cov2,' ***'
-        # try to get cover - if can't get known cover
-        br = browser()
-
-        br.set_handle_redirect(False)
-        try:
-            br.open_novisit(cov2)
-            cover_url = cov2
-        except:
-            cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
-        return cover_url
+        cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px')}) # 'width' : '160',
+        cov=str(cov)
+        cov=cov[10:]
+        cov=cov[:-135]
+        br = browser()
+        br.set_handle_redirect(False)
+        try:
+            br.open_novisit(cov)
+            cover_url = cov
+        except:
+            cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
+        return cover_url
+    preprocess_regexps = [
+        (re.compile(r' \| Countryfile.com', re.IGNORECASE | re.DOTALL), lambda match: '')]

     remove_tags = [
         # dict(attrs={'class' : ['player']}),

@@ -48,6 +43,5 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
     feeds = [
         (u'Homepage', u'http://www.countryfile.com/rss/home'),
         (u'Country News', u'http://www.countryfile.com/rss/news'),
         (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
     ]
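The new cover logic above still stringifies the matched tag and trims it with fixed offsets (`cov[10:]`, `cov[:-135]`), which breaks silently whenever the surrounding markup changes length. A hypothetical, sturdier variant, assuming the matched element is an `<img>` whose `src` attribute carries the cover URL (it reuses the recipe's existing `import re`):

# Hypothetical sturdier variant: read the src attribute instead of
# slicing str(cov) at fixed offsets. Assumes the match is an <img> tag;
# falls back to the known default cover.
def get_cover_url(self):
    soup = self.index_to_soup('http://www.countryfile.com/magazine')
    cov = soup.find('img', attrs={'class': re.compile('imagecache imagecache-250px')})
    if cov is not None and cov.get('src'):
        return cov['src']
    return 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'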
@@ -1,63 +1,51 @@
 from calibre.web.feeds.news import BasicNewsRecipe

-class Cracked(BasicNewsRecipe):
-    title = u'Cracked.com'
-    __author__ = 'UnWeave'
-    language = 'en'
-    description = "America's Only HumorSite since 1958"
-    publisher = 'Cracked'
-    category = 'comedy, lists'
-    oldest_article = 3 #days
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    encoding = 'ascii'
-    remove_javascript = True
-    use_embedded_content = False
-
-    feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
+class Cracked(BasicNewsRecipe):
+    title = u'Cracked.com'
+    __author__ = 'UnWeave'
+    language = 'en'
+    description = "America's Only HumorSite since 1958"
+    publisher = 'Cracked'
+    category = 'comedy, lists'
+    oldest_article = 3 # days
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    encoding = 'ascii'
+    remove_javascript = True
+    use_embedded_content = False
+    # auto_cleanup = True
+
+    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

     conversion_options = {
-        'comment'   : description
-        , 'tags'      : category
-        , 'publisher' : publisher
-        , 'language'  : language
-    }
+        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
+    }

-    remove_tags_before = dict(id='PrimaryContent')
-
-    remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
-
-    remove_tags = [ dict(name='div', attrs={'class':['social',
-                                                     'FacebookLike',
-                                                     'shareBar'
-                                                     ]}),
-
-                    dict(name='div', attrs={'id':['inline-share-buttons',
-                                                  ]}),
-
-                    dict(name='span', attrs={'class':['views',
-                                                      'KonaFilter'
-                                                      ]}),
-                    #dict(name='img'),
-                  ]
+    keep_only_tags = [dict(name='article', attrs={'class': 'module article dropShadowBottomCurved'}),
+                      dict(name='article', attrs={'class': 'module blog dropShadowBottomCurved'})]
+
+    remove_tags = [
+        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule']})]

     def appendPage(self, soup, appendTag, position):
         # Check if article has multiple pages
-        pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
+        pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
         if pageNav:
             # Check not at last page
-            nextPage = pageNav.find('a', attrs={'class':'next'})
+            nextPage = pageNav.find('a', attrs={'class': 'next'})
             if nextPage:
                 nextPageURL = nextPage['href']
                 nextPageSoup = self.index_to_soup(nextPageURL)
                 # 8th <section> tag contains article content
-                nextPageContent = nextPageSoup.findAll('section')[7]
+                nextPageContent = nextPageSoup.findAll('article')[0]
                 newPosition = len(nextPageContent.contents)
-                self.appendPage(nextPageSoup,nextPageContent,newPosition)
+                self.appendPage(nextPageSoup, nextPageContent, newPosition)
                 nextPageContent.extract()
                 pageNav.extract()
-                appendTag.insert(position,nextPageContent)
+                appendTag.insert(position, nextPageContent)

     def preprocess_html(self, soup):
         self.appendPage(soup, soup.body, 3)
         return soup
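The reworked `appendPage` follows "next" links recursively and splices each following page's first `<article>` into the current soup, so multi-page listicles download as one document; because it recurses before extracting, the deepest page is merged first and everything lands in reading order. A stripped-down sketch of the same stitching pattern, with the page fetch stubbed out as `fetch_soup` (a stand-in for `self.index_to_soup`):

# Stripped-down sketch of the recursive page-stitching pattern above.
# fetch_soup stands in for self.index_to_soup; tag/class names mirror
# the recipe.
def append_pages(fetch_soup, soup, append_tag, position):
    page_nav = soup.find('nav', attrs={'class': 'PaginationContent'})
    if not page_nav:
        return
    next_page = page_nav.find('a', attrs={'class': 'next'})
    if next_page:
        next_soup = fetch_soup(next_page['href'])
        content = next_soup.findAll('article')[0]
        # recurse first: later pages get appended inside `content`
        append_pages(fetch_soup, next_soup, content, len(content.contents))
        content.extract()
        page_nav.extract()
        append_tag.insert(position, content)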
88
recipes/daily_express.recipe
Normal file
@@ -0,0 +1,88 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+class AdvancedUserRecipe1376229553(BasicNewsRecipe):
+    title = u'Daily Express'
+    __author__ = 'Dave Asbury'
+    # 9-9-13 added article author and now use (re.compile(r'>[\w].+? News<'
+    encoding = 'utf-8'
+    remove_empty_feeds = True
+    #remove_javascript = True
+    no_stylesheets = True
+    oldest_article = 1
+    max_articles_per_feed = 10
+    #auto_cleanup = True
+    compress_news_images = True
+    compress_news_images_max_size = 30
+    ignore_duplicate_articles = {'title', 'url'}
+    masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png'
+
+    preprocess_regexps = [
+
+        (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
+        (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
+        (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        (re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'),
+        (re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'),
+        #(re.compile(r'Health News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
+        #(re.compile(r'<h3>More UK</h3>', re.IGNORECASE | re.DOTALL), lambda match: ''),
+
+    ]
+
+    remove_tags = [
+        dict(attrs={'class' : 'quote'}),
+        #dict(attrs={'class' : 'author'}),
+        dict(name='footer'),
+        dict(attrs={'id' : 'header_addons'}),
+        dict(attrs={'class' : 'hoverException'}),
+        dict(name='_li'),dict(name='li'),
+        dict(attrs={'class' : 'box related-articles clear'}),
+        dict(attrs={'class' : 'news-list'}),
+        dict(attrs={'class' : 'sponsored-section'}),
+        dict(attrs={'class' : 'pull-quote on-right'}),
+        dict(attrs={'class' : 'pull-quote on-left'}),
+
+    ]
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(attrs={'class' : 'publish-info'}),
+        dict(name='h3', limit=2),
+        dict(attrs={'class' : 'clearfix hR new-style'}),
+    ]
+
+    feeds = [(u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
+             (u'World News',u'http://www.express.co.uk/posts/rss/78/world'),
+             (u'Finance',u'http://www.express.co.uk/posts/rss/21/finance'),
+             (u'Sport',u'http://www.express.co.uk/posts/rss/65/sport'),
+             (u'Entertainment',u'http://www.express.co.uk/posts/rss/18/entertainment'),
+             (u'Lifestyle',u'http://www.express.co.uk/posts/rss/8/life&style'),
+             (u'Fun',u'http://www.express.co.uk/posts/rss/110/fun'),
+            ]
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
+        cov = soup.find(attrs={'src' : re.compile('http://images.dailyexpress.co.uk/img/covers/')})
+        cov=str(cov)
+        cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
+
+        cov=str(cov2)
+        cov=cov[2:len(cov)-2]
+        cover_url=cov
+        return cover_url
+
+    extra_css = '''
+        h1{font-weight:bold;font-size:175%;}
+        h2{font-weight:normal;font-size:75%;}
+        #p{font-size:14px;}
+        #body{font-size:14px;}
+        .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
+        .publish-info {font-size:50%;}
+        .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
+    '''
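`preprocess_regexps`, used heavily in the new recipe above, is a list of `(compiled_pattern, replacement_callable)` pairs that calibre runs over each article's raw HTML, in order, before parsing; that is why one-line lambdas suffice to blank the boilerplate headings. Roughly what calibre does with the list:

# Rough sketch of how preprocess_regexps is applied: each (pattern,
# func) pair is substituted over the raw HTML in list order.
import re

preprocess_regexps = [
    (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
    (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
]

def apply_regexps(raw_html):
    for pattern, func in preprocess_regexps:
        raw_html = pattern.sub(func, raw_html)
    return raw_html

print apply_regexps('<h3>Related articles</h3><p>Add Your Comment</p>')  # <h3></h3><p></p>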
@@ -7,50 +7,50 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     description = 'News as provided by The Daily Mirror -UK'

     __author__ = 'Dave Asbury'
-    # last updated 19/10/12
+    # last updated 27/8/13
     language = 'en_GB'
     #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'

     masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
+    #recursions = 10
     compress_news_images = True
-    oldest_article = 1
-    max_articles_per_feed = 12
+    compress_news_images_max_size = 30
+    oldest_article = 1.5
+    max_articles_per_feed = 10
     remove_empty_feeds = True
     remove_javascript = True
     no_stylesheets = True
-    ignore_duplicate_articles = {'title'}
+    ignore_duplicate_articles = {'url'}

-    # auto_cleanup = True
+    #auto_cleanup = True
     #conversion_options = { 'linearize_tables' : True }

-    keep_only_tags = [ dict(name='h1'),
+    keep_only_tags = [dict(name='h1'),
                        dict(name='div',attrs={'class' : 'lead-text'}),
-                       dict(name='div',attrs={'class' : 'styleGroup clearfix'}),
+                       dict(attrs={'class' : 'tools clearfix'}),
                        dict(name='div',attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
                        # dict(name='figure',attrs={'class' : 'clearfix'}),
                        dict(name='div',attrs={'class' :'body '}),
+                       dict(name='div',attrs={'class' :'thumb'}),
+                       dict(attrs={'img alt' : ['Perishers','Horace']}),
+                       #dict(attrs={'class' : 'tmRow span-15-5 col-1 article-page'}),
                        #dict(attrs={'class' : ['article-attr','byline append-1','published']}),
-                       #dict(name='p'),
+                       # dict(name='p'),
                      ]

     remove_tags = [
-        dict(attrs={'class' : ['article sa-teaser type-opinion','image-gallery','gallery-caption']}),
+        dict(attrs={'class' : ['article sa-teaser type-opinion','last','gallery-caption','gallery-data','ir btn-fullscreen','avatar']}), # ,'image-gallery'
         dict(attrs={'class' : 'comment'}),
         dict(name='title'),
         dict(name='ul',attrs={'class' : 'clearfix breadcrumbs '}),
         dict(name='ul',attrs={'id' : 'login-201109171215'}),
-        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),#'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
+        #'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
+        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
     ]

     preprocess_regexps = [
         (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]

     feeds = [
         (u'News',u'http://www.mirror.co.uk/news/rss.xml'),
         (u'Sports',u'http://www.mirror.co.uk/sport/rss.xml'),
@@ -63,26 +63,31 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
     # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
     ]
     extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:170%;}
+        .article figure figcaption {display: block;margin-left: auto;margin-right: auto;
+        width:100%;font-family:Arial,Helvetica,sans-serif;font-size:40%;}
+        #h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;}
+        p{font-family:Arial,Helvetica,sans-serif;}
+        body{font-family:Helvetica,Arial,sans-serif;}
+        .article figure{display: block;margin-left: auto;margin-right: auto;width:100%;}
+        .lead-text p {font-size:150%}
+    '''

     def get_cover_url(self):
         soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
         # look for the block containing the mirror button and url
         cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
         cov2 = str(cov)
         cov2='http://www.politicshome.com'+cov2[9:-142]
-        #cov2 now contains url of the page containing pic
+        # cov2 now contains url of the page containing pic
         soup = self.index_to_soup(cov2)
         cov = soup.find(attrs={'id' : 'large'})
         cov=str(cov)
         cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
         cov2 = str(cov2)
         cov2=cov2[2:len(cov2)-2]
-        #cov2 now is pic url, now go back to original function
+        # cov2 now is pic url, now go back to original function
         br = browser()
         br.set_handle_redirect(False)
         try:
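Both this Mirror cover code and the Daily Express recipe above locate the cover by stringifying the matched tag and pulling the first absolute URL out with the same character-class regex. Run standalone (the input markup below is illustrative only):

# Standalone run of the shared URL-extraction regex; the input markup
# is illustrative only.
import re

cov = str('<img id="large" src="http://example.com/covers/front_page.jpg"/>')
urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
print urls[0]  # http://example.com/covers/front_page.jpg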
45
recipes/democracy_now.recipe
Normal file
@@ -0,0 +1,45 @@
+# vim:fileencoding=utf-8
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DemocracyNowRecipe(BasicNewsRecipe):
+    title = u'Democracy now!'
+    __author__ = u'Antoine Beaupré'
+    description = 'A daily TV/radio news program, hosted by Amy Goodman and Juan Gonzalez, airing on over 1,100 stations, pioneering the largest community media collaboration in the United States.'  # noqa
+    language = 'en'
+    cover_url = 'http://www.democracynow.org/images/dn-logo-for-podcast.png'
+
+    oldest_article = 1
+    max_articles_per_feed = 10
+    publication_type = 'magazine'
+
+    auto_cleanup = False
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = [
+        (u'Daily news', u'http://www.democracynow.org/democracynow.rss')]
+
+    keep_only_tags = [dict(name='div', attrs={'id': 'page'}), ]
+    remove_tags = [dict(name='div', attrs={'id': 'topics_list'}),
+                   dict(name='div', attrs={'id': 'header'}),
+                   dict(name='div', attrs={'id': 'footer'}),
+                   dict(name='div', attrs={'id': 'right'}),
+                   dict(name='div', attrs={'id': 'left-panel'}),
+                   dict(name='div', attrs={'id': 'top-video-content'}),
+                   dict(name='div', attrs={'id': 'google-news-date'}),
+                   dict(name='div', attrs={'id': 'story-donate'}),
+                   dict(
+                       name='div', attrs={'id': 'transcript-expand-collapse'}),
+                   dict(name='span', attrs={'class': 'show-links'}),
+                   dict(name='span', attrs={'class': 'storyNav'}),
+                   dict(name='div', attrs={'class': 'headline_share'}),
+                   dict(name='div', attrs={'class': 'mediaBar'}),
+                   dict(name='div', attrs={'class': 'shareAndPrinterBar'}),
+                   dict(name='div', attrs={'class': 'utility-navigation'}),
+                   dict(name='div', attrs={'class': 'bottomContentNav'}),
+                   dict(name='div', attrs={'class': 'recentShows'}),
+                   dict(
+                       name='div', attrs={'class': 'printer-and-transcript-links'}),
+                   ]
@@ -1,72 +1,50 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
-elargentino.com
+diagonales.infonews.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class Diagonales(BasicNewsRecipe):
     title = 'Diagonales'
     __author__ = 'Darko Miletic'
-    description = 'El nuevo diario de La Plata'
-    publisher = 'ElArgentino.com'
+    description = 'Para estar bien informado sobre los temas de actualidad. Conoce sobre pais, economia, deportes, mundo, espectaculos, sociedad, entrevistas y tecnologia.'
+    publisher = 'INFOFIN S.A.'
     category = 'news, politics, Argentina, La Plata'
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
     encoding = 'utf-8'
     language = 'es_AR'
-    lang = 'es-AR'
-    direction = 'ltr'
-    INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html'
+    publication_type = 'newspaper'
+    delay = 1
+    remove_empty_feeds = True

     extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '

-    html2lrf_options = [
-        '--comment' , description
-        , '--category' , category
-        , '--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
+    conversion_options = {
+        'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }

     keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
     remove_tags = [dict(name='link')]

-    feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')]
+    feeds = [
+        (u'Pais'        , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=112&Content-Type=text/xml&ChannelDesc=Pa%C3%ADs')
+        ,(u'Deportes'    , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=106&Content-Type=text/xml&ChannelDesc=Deportes')
+        ,(u'Economia'    , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=107&Content-Type=text/xml&ChannelDesc=Econom%C3%ADa')
+        ,(u'Sociedad'    , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=109&Content-Type=text/xml&ChannelDesc=Sociedad')
+        ,(u'Mundo'       , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=113&Content-Type=text/xml&ChannelDesc=Mundo')
+        ,(u'Espectaculos', u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=114&Content-Type=text/xml&ChannelDesc=Espect%C3%A1culos')
+        ,(u'Entrevistas' , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=115&Content-Type=text/xml&ChannelDesc=Entrevistas')
+        ,(u'Tecnologia'  , u'http://diagonales.infonews.com/Highlights.aspx?ParentType=Section&ParentId=118&Content-Type=text/xml&ChannelDesc=Tecnolog%C3%ADa')
+    ]

     def print_version(self, url):
         main, sep, article_part = url.partition('/nota-')
         article_id, rsep, rrest = article_part.partition('-')
-        return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        soup.html['lang'] = self.lang
-        soup.html['dir' ] = self.direction
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
-        return soup
-
-    def get_cover_url(self):
-        cover_url = None
-        soup = self.index_to_soup(self.INDEX)
-        cover_item = soup.find('div',attrs={'class':'colder'})
-        if cover_item:
-            clean_url = self.image_url_processor(None,cover_item.div.img['src'])
-            cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
-        return cover_url
-
-    def image_url_processor(self, baseurl, url):
-        base, sep, rest = url.rpartition('?Id=')
-        img, sep2, rrest = rest.partition('&')
-        return base + sep + img
+        return u'http://diagonales.infonews.com/Impresion.aspx?Id=' + article_id
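The updated `print_version` keeps the old partition idiom: split the article URL around `/nota-`, peel the numeric id off the front of the remainder, and aim it at the new infonews print endpoint. A standalone check (the sample article path is illustrative, not a real Diagonales URL):

# Standalone check of the partition-based rewrite above; the sample
# article path is illustrative only.
def print_version(url):
    main, sep, article_part = url.partition('/nota-')
    article_id, rsep, rrest = article_part.partition('-')
    return u'http://diagonales.infonews.com/Impresion.aspx?Id=' + article_id

assert print_version(u'http://diagonales.infonews.com/nota-12345-un-titulo.html') == \
    u'http://diagonales.infonews.com/Impresion.aspx?Id=12345'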
51
recipes/diario_el_pueblo.recipe
Normal file
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
+'''
+diarioelpueblo.com.uy
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class General(BasicNewsRecipe):
+    title = 'Diario El Pueblo'
+    __author__ = 'Carlos Alves'
+    description = 'Noticias de Salto - Uruguay'
+    tags = 'news, sports'
+    language = 'es_UY'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 5
+    encoding = 'utf8'
+    remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(name='div', attrs={'class':'post-alt blog'})]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['hr', 'titlebar', 'volver-arriba-right','navigation']}),
+        dict(name='div', attrs={'id':'comment','id':'suckerfish','id':'crp_related'}),
+        dict(name='h3', attrs={'class':['post_date']}),
+        dict(name=['object','link'])
+    ]
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Articulos', u'http://www.diarioelpueblo.com.uy/feed')
+    ]
+
+    def get_cover_url(self):
+        return 'http://www.diarioelpueblo.com.uy/wp-content/uploads/2013/06/Cabezal_Web1.jpg'
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
50
recipes/diario_salto.recipe
Normal file
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
+'''
+diariosalto.com.uy
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class General(BasicNewsRecipe):
+    title = 'Diario Salto'
+    __author__ = 'Carlos Alves'
+    description = 'Noticias de Salto - Uruguay'
+    tags = 'news, sports'
+    language = 'es_UY'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 5
+    encoding = 'utf8'
+    remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}),
+        dict(name='div', attrs={'id':'comment'}),
+        dict(name=['object','link'])
+    ]
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Articulos', u'http://www.diariosalto.com.uy/feed/atom')
+    ]
+
+    def get_cover_url(self):
+        return 'http://diariosalto.com.uy/demo/wp-content/uploads/2011/12/diario-salto_logo-final-b-b.png'
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
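Both Uruguayan recipes above end on the same `preprocess_html` idiom: delete every inline `style` attribute so the site's inline formatting cannot override the recipe's `extra_css`. The idiom run standalone, assuming BeautifulSoup 3 (the parser calibre hands to recipes):

# The inline-style-stripping idiom from the two recipes above, run
# standalone; assumes BeautifulSoup 3.
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p style="color:red">texto</p>')
for item in soup.findAll(style=True):
    del item['style']
print soup  # <p>texto</p>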
@@ -6,6 +6,7 @@ DrMerry added cover Image 2011-11-12
 '''

 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 import re

 class DilbertBig(BasicNewsRecipe):
@@ -16,7 +17,7 @@ class DilbertBig(BasicNewsRecipe):
     oldest_article = 15
     max_articles_per_feed = 100
     no_stylesheets = True
-    use_embedded_content = True
+    use_embedded_content = False
     encoding = 'utf-8'
     publisher = 'UNITED FEATURE SYNDICATE, INC.'
     category = 'comic'
@@ -30,25 +31,14 @@ class DilbertBig(BasicNewsRecipe):
         ,'publisher' : publisher
     }

-    feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
+    feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip')]

-    def get_article_url(self, article):
-        return article.get('feedburner_origlink', None)
-
     preprocess_regexps = [
         (re.compile('strip\..*\.gif', re.DOTALL|re.IGNORECASE), lambda match: 'strip.zoom.gif')
     ]

     def preprocess_html(self, soup):
-        for tag in soup.findAll(name='a'):
-            if tag['href'].find('http://feedads') >= 0:
-                tag.extract()
-        return soup
-
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        img {max-width:100%; min-width:100%;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
+        for tag in soup.findAll(name='input'):
+            image = BeautifulSoup('<img src=' + tag['value'] + '></img>')
+            return image
@@ -6,46 +6,87 @@ __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
 dilemaveche.ro
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

 class DilemaVeche(BasicNewsRecipe):
-    title = u'Dilema Veche'
-    __author__ = u'Silviu Cotoar\u0103'
-    description = 'Sint vechi, domnule! (I.L. Caragiale)'
-    publisher = u'Adev\u0103rul Holding'
-    oldest_article = 5
-    language = 'ro'
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    use_embedded_content = False
-    category = 'Ziare'
-    encoding = 'utf-8'
-    cover_url = 'http://dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
-
-    conversion_options = {
-        'comments'  : description
-        ,'tags'     : category
-        ,'language' : language
-        ,'publisher': publisher
-    }
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class':'c_left_column'})
-    ]
-
-    remove_tags = [
-        dict(name='div', attrs={'id':['adshop_widget_428x60']}) ,
-        dict(name='div', attrs={'id':['gallery']})
-    ]
-
-    remove_tags_after = [
-        dict(name='div', attrs={'id':['adshop_widget_428x60']})
-    ]
-
-    feeds = [
-        (u'Feeds', u'http://dilemaveche.ro/rss.xml')
-    ]
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
+    # apare vinerea, mai pe dupa-masa,depinde de Luiza cred (care se semneaza ca fiind creatorul fiecarui articol in feed-ul RSS)
+    title = u'Dilema Veche'
+    __author__ = 'song2' # inspirat din scriptul pentru Le Monde. Inspired from the Le Monde script
+    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
+    publisher = 'Adevarul Holding'
+    oldest_article = 7
+    max_articles_per_feed = 200
+    encoding = 'utf8'
+    language = 'ro'
+    masthead_url = 'http://www.dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
+    publication_type = 'magazine'
+    feeds = [
+        ('Editoriale si opinii - Situatiunea', 'http://www.dilemaveche.ro/taxonomy/term/37/0/feed'),
+        ('Editoriale si opinii - Pe ce lume traim', 'http://www.dilemaveche.ro/taxonomy/term/38/0/feed'),
+        ('Editoriale si opinii - Bordeie si obiceie', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
+        ('Editoriale si opinii - Talc Show', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
+        ('Tema saptamanii', 'http://www.dilemaveche.ro/taxonomy/term/19/0/feed'),
+        ('La zi in cultura - Dilema va recomanda', 'http://www.dilemaveche.ro/taxonomy/term/58/0/feed'),
+        ('La zi in cultura - Carte', 'http://www.dilemaveche.ro/taxonomy/term/14/0/feed'),
+        ('La zi in cultura - Film', 'http://www.dilemaveche.ro/taxonomy/term/13/0/feed'),
+        ('La zi in cultura - Muzica', 'http://www.dilemaveche.ro/taxonomy/term/1341/0/feed'),
+        ('La zi in cultura - Arte performative', 'http://www.dilemaveche.ro/taxonomy/term/1342/0/feed'),
+        ('La zi in cultura - Arte vizuale', 'http://www.dilemaveche.ro/taxonomy/term/1512/0/feed'),
+        ('Societate - Ieri cu vedere spre azi', 'http://www.dilemaveche.ro/taxonomy/term/15/0/feed'),
+        ('Societate - Din polul opus', 'http://www.dilemaveche.ro/taxonomy/term/41/0/feed'),
+        ('Societate - Mass comedia', 'http://www.dilemaveche.ro/taxonomy/term/43/0/feed'),
+        ('Societate - La singular si la plural', 'http://www.dilemaveche.ro/taxonomy/term/42/0/feed'),
+        ('Oameni si idei - Educatie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
+        ('Oameni si idei - Polemici si dezbateri', 'http://www.dilemaveche.ro/taxonomy/term/48/0/feed'),
+        ('Oameni si idei - Stiinta si tehnologie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
+        ('Dileme on-line', 'http://www.dilemaveche.ro/taxonomy/term/005/0/feed')
+    ]
+    remove_tags_before = dict(name='div',attrs={'class':'spacer_10'})
+    remove_tags = [
+        dict(name='div', attrs={'class':'art_related_left'}),
+        dict(name='div', attrs={'class':'controale'}),
+        dict(name='div', attrs={'class':'simple_overlay'}),
+    ]
+    remove_tags_after = [dict(id='facebookLike')]
+    remove_javascript = True
+    no_stylesheets = True
+    remove_empty_feeds = True
+    extra_css = """
+        body{font-family: Georgia,Times,serif }
+        img{margin-bottom: 0.4em; display:block}
+    """
+    needs_subscription = 'optional'
+    cover_margins = (10, 15, '#ffffff')
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('http://dilemaveche.ro/user/login')
+            br.select_form(nr=0)
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
+    def get_cover_url(self):
+        cover_url = None
+        soup = self.index_to_soup('http://dilemaveche.ro')
+        link_item = soup.find('div',attrs={'class':'box_dr_pdf_picture'})
+        if link_item and link_item.a:
+            cover_url = link_item.a['href']
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover_url)
+        except: # daca nu gaseste pdf-ul
+            self.log("\nPDF indisponibil")
+            link_item = soup.find('div',attrs={'class':'box_dr_pdf_picture'})
+            if link_item and link_item.img:
+                cover_url = link_item.img['src']
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover_url)
+            except: # daca nu gaseste nici imaginea mica mica
+                print('Mama lor de nenorociti! nu este nici pdf nici imagine')
+                cover_url = 'http://www.dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
+        return cover_url
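With `needs_subscription = 'optional'` as above, calibre asks for credentials but still runs anonymously when none are given; the `get_browser` override then signs the shared browser in once, and every later fetch in the recipe reuses that session. The general shape of the pattern, as a recipe method, with a placeholder login URL; the form index (`nr=0`) and field names always depend on the site's login page:

# General shape of the optional-login pattern above; the login URL is a
# placeholder and nr=0 / the field names are site-specific assumptions.
def get_browser(self):
    br = BasicNewsRecipe.get_browser(self)
    if self.username is not None and self.password is not None:
        br.open('http://example.com/user/login')
        br.select_form(nr=0)  # first form on the login page
        br['username'] = self.username
        br['password'] = self.password
        br.submit()
    return br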
@@ -11,30 +11,31 @@ class dotnetMagazine (BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'utf8'
     use_embedded_content = False
+    auto_cleanup = True
     # recursion = 1
     language = 'en'
     remove_empty_feeds = True
     extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
     cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'

-    remove_tags_after = dict(name='footer', id=lambda x:not x)
-    remove_tags_before = dict(name='header', id=lambda x:not x)
-
-    remove_tags = [
-        dict(name='div', attrs={'class': 'item-list'}),
-        dict(name='h4', attrs={'class': 'std-hdr'}),
-        dict(name='div', attrs={'class': 'item-list share-links'}), # removes share links
-        dict(name=['script', 'noscript']),
-        dict(name='div', attrs={'id': 'comments-form'}), # comment these out if you want the comments to show
-        dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
-        dict(name='div', attrs={'id': 'right-col'}),
-        dict(name='div', attrs={'id': 'comments'}), # comment these out if you want the comments to show
-        dict(name='div', attrs={'class': 'item-list related-content'}),
-    ]
+    #remove_tags_after = dict(name='footer', id=lambda x:not x)
+    #remove_tags_before = dict(name='header', id=lambda x:not x)
+
+    #remove_tags = [
+    #    dict(name='div', attrs={'class': 'item-list'}),
+    #    dict(name='h4', attrs={'class': 'std-hdr'}),
+    #    dict(name='div', attrs={'class': 'item-list share-links'}), # removes share links
+    #    dict(name=['script', 'noscript']),
+    #    dict(name='div', attrs={'id': 'comments-form'}), # comment these out if you want the comments to show
+    #    dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
+    #    dict(name='div', attrs={'id': 'right-col'}),
+    #    dict(name='div', attrs={'id': 'comments'}), # comment these out if you want the comments to show
+    #    dict(name='div', attrs={'class': 'item-list related-content'}),
+    #]

     feeds = [
-        (u'net', u'http://feeds.feedburner.com/net/topstories?format=xml')
+        (u'net', u'http://feeds.feedburner.com/creativebloq/')
     ]

     def skip_ad_pages(self, soup):
@@ -3,10 +3,10 @@ __license__ = 'GPL v3'
 __copyright__ = '08 Januery 2011, desUBIKado'
 __author__ = 'desUBIKado'
 __description__ = 'Daily newspaper from Biscay'
-__version__ = 'v0.08'
-__date__ = '08, Januery 2011'
+__version__ = 'v0.10'
+__date__ = '07, August 2013'
 '''
-[url]http://www.elcorreo.com/[/url]
+http://www.elcorreo.com/
 '''
 
 import time
@@ -24,6 +24,7 @@ class heraldo(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
+    masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
     language = 'es'
     timefmt = '[%a, %d %b, %Y]'
     encoding = 'iso-8859-1'
@@ -33,15 +34,15 @@ class heraldo(BasicNewsRecipe):
     feeds = [
         (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
         (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
         (u'Internacional', u'hhttp://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
         (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
         (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
         (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
         (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
         (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
         (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
         (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
         (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
     ]
 
     keep_only_tags = [
@@ -54,14 +55,14 @@ class heraldo(BasicNewsRecipe):
         dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
         dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
         dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
-        dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
-        dict(name='div', attrs={'id':['articulopina']}),
+        dict(name='div', attrs={'class':['modulo-especial','publiEspecial','carruselNoticias','vj','modulocomun2']}),
+        dict(name='div', attrs={'id':['articulopina','webs_asociadas']}),
         dict(name='br', attrs={'class':'clear'}),
         dict(name='form', attrs={'name':'frm_conversor2'})
     ]
 
     remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
-    remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
+    remove_tags_after = dict(name='div' , attrs={'class':'robapaginas'})
 
     def get_cover_url(self):
         cover = None
@@ -69,10 +70,8 @@ class heraldo(BasicNewsRecipe):
         year = str(st.tm_year)
         month = "%.2d" % st.tm_mon
         day = "%.2d" % st.tm_mday
-        #[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
-        #[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
+        # http://info.elcorreo.com/pdf/07082013-viz.pdf
         cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
 
         br = BasicNewsRecipe.get_browser(self)
         try:
             br.open(cover)
@@ -92,29 +91,27 @@ class heraldo(BasicNewsRecipe):
         img{margin-bottom: 0.4em}
     '''
 
     preprocess_regexps = [
-        # To present the image of the embedded video
+        # Para presentar la imagen de los video incrustados
         (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
         (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
         (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
 
-        # To separate paragraphs with a blank line
+        # Para separar los parrafos con una linea en blanco
         (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
 
-        # To put a blank line between the subtitle and the date and time of the news
+        # Para poner una linea en blanco entre el subttulo y la fecha y hora de la noticia
         (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
 
-        # To put a blank line between the intro of the embedded videos and the previous text
+        # Para poner una linea en blanco entre la entradilla de los videos incrustados y el texto anterior
        (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
 
-        # To view photos from the first when these are presented as a gallery
+        # Para sacar las fotos a partir de la primera cuando se presentan como una galeria
         (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
         (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
 
-        # To remove the link of the title
+        # Para quitar el enlace del titulo
         (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
         (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
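Assembled from the get_cover_url hunk above, the v0.10 cover logic builds the day's front-page PDF URL from the current date (ddmmyyyy) and probes it with the recipe's browser. The st assignment sits just above the visible context (presumably time.localtime(), since the recipe imports time), and the except branch is cut off, so the fallback below is an assumption:

    def get_cover_url(self):
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        # e.g. http://info.elcorreo.com/pdf/07082013-viz.pdf
        cover = 'http://info.elcorreo.com/pdf/' + day + month + year + '-viz.pdf'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            # Hypothetical fallback if today's PDF is not up yet --
            # the real branch is not shown in the hunk.
            cover = None
        return cover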
@@ -1,18 +1,23 @@
 #!/usr/bin/env python
+##
+## Last Edited: 2013-09-29 Carlos Alves <carlosalves90@gmail.com>
+##
 
 __license__ = 'GPL v3'
 __author__ = '2010, Yuri Alvarez<me at yurialvarez.com>'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 
 '''
-observa.com.uy
+elobservador.com.uy
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class ObservaDigital(BasicNewsRecipe):
-    title = 'Observa Digital'
+class Noticias(BasicNewsRecipe):
+    title = 'El Observador'
     __author__ = 'yrvn'
-    description = 'Noticias de Uruguay'
+    description = 'Noticias desde Uruguay'
+    tags = 'news, sports, entretainment'
     language = 'es_UY'
     timefmt = '[%a, %d %b, %Y]'
     use_embedded_content = False
@@ -23,13 +28,18 @@ class ObservaDigital(BasicNewsRecipe):
 
     oldest_article = 2
     max_articles_per_feed = 100
-    keep_only_tags = [dict(id=['contenido'])]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'story collapsed'})
+    ]
     remove_tags = [
-        dict(name='div', attrs={'id':'contenedorVinculadas'}),
-        dict(name='p', attrs={'id':'nota_firma'}),
+        dict(name='div', attrs={'class':['fecha', 'copyright', 'story_right']}),
+        dict(name='div', attrs={'class':['photo', 'social']}),
+        dict(name='div', attrs={'id':'widget'}),
         dict(name=['object','link'])
     ]
 
+    remove_attributes = ['width','height', 'style', 'font', 'color']
+
     extra_css = '''
         h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
         h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
@@ -37,19 +47,9 @@ class ObservaDigital(BasicNewsRecipe):
         p {font-family:Arial,Helvetica,sans-serif;}
     '''
     feeds = [
-        (u'Actualidad', u'http://www.observa.com.uy/RSS/actualidad.xml'),
-        (u'Deportes', u'http://www.observa.com.uy/RSS/deportes.xml'),
-        (u'Vida', u'http://www.observa.com.uy/RSS/vida.xml'),
-        (u'Ciencia y Tecnologia', u'http://www.observa.com.uy/RSS/ciencia.xml')
+        (u'Portada', u'http://elobservador.com.uy/rss/portada/'),
     ]
 
-    def get_cover_url(self):
-        index = 'http://www.observa.com.uy/'
-        soup = self.index_to_soup(index)
-        for image in soup.findAll('img',alt=True):
-            if image['alt'].startswith('Tapa El Observador'):
-                return image['src'].rstrip('b.jpg') + '.jpg'
-        return None
-
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
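The preprocess_html hunk is truncated right after the findAll(style=True) loop header. The conventional completion -- the same two lines appear verbatim in the El Guardian and El Tribuno recipes elsewhere in this commit -- is:

    def preprocess_html(self, soup):
        # Drop inline style attributes so extra_css controls the rendering.
        for item in soup.findAll(style=True):
            del item['style']
        return soup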
@@ -5,8 +5,8 @@ __license__ = 'GPL v3'
 __copyright__ = '04 December 2010, desUBIKado'
 __author__ = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.08'
-__date__ = '13, November 2011'
+__version__ = 'v0.09'
+__date__ = '07, August 2013'
 '''
 elperiodicodearagon.com
 '''
@@ -25,11 +25,11 @@ class elperiodicodearagon(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = False
     language = 'es'
+    masthead_url = 'http://pdf.elperiodicodearagon.com/img/logotipo.gif'
     encoding = 'iso-8859-1'
     remove_empty_feeds = True
     remove_javascript = True
 
     conversion_options = {
         'comments' : description
         ,'tags' : category
@@ -56,23 +56,21 @@ class elperiodicodearagon(BasicNewsRecipe):
         (u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
     ]
 
     remove_attributes = ['height','width']
 
     keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]
 
     # Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion)
 
     def get_cover_url(self):
-        index = 'http://pdf.elperiodicodearagon.com/'
+        index = 'http://pdf.elperiodicodearagon.com/edicion.php'
         soup = self.index_to_soup(index)
         for image in soup.findAll('img',src=True):
-            if image['src'].startswith('http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='):
-                return image['src'].rstrip('format=2') + 'format=1'
+            if image['src'].startswith('/funciones/img-public.php?key='):
+                return 'http://pdf.elperiodicodearagon.com' + image['src']
         return None
 
     # Usamos la versión para móviles
 
     def print_version(self, url):
         return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
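Put together from the hunk above, the updated cover lookup scrapes edicion.php for the preview image and rebuilds an absolute URL, instead of string-trimming a format=2 query string:

    def get_cover_url(self):
        index = 'http://pdf.elperiodicodearagon.com/edicion.php'
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            if image['src'].startswith('/funciones/img-public.php?key='):
                # The scraped src is site-relative, so prefix the host.
                return 'http://pdf.elperiodicodearagon.com' + image['src']
        return None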
@@ -1,93 +0,0 @@
-__license__ = 'GPL v3'
-__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
-'''
-elguardian.com.ar
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class ElGuardian(BasicNewsRecipe):
-    title = 'El Guardian'
-    __author__ = 'Darko Miletic'
-    description = "Semanario con todas las tendencias de un pais"
-    publisher = 'Editorial Apache SA'
-    category = 'news,politics,Argentina'
-    oldest_article = 8
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'utf8'
-    use_embedded_content = False
-    language = 'es_AR'
-    remove_empty_feeds = True
-    publication_type = 'magazine'
-    issn = '1666-7476'
-    masthead_url = 'http://elguardian.com.ar/application/templates/frontend/images/home/logo.png'
-    extra_css = """
-        body{font-family: Arial,sans-serif}
-        img{margin-bottom: 0.4em; display:block}
-    """
-
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-        , 'series' : title
-        , 'isbn' : issn
-    }
-
-    keep_only_tags = [dict(attrs={'class':['fotos', 'header_nota', 'nota']})]
-    remove_tags = [dict(name=['meta','link','iframe','embed','object'])]
-    remove_attributes = ['lang']
-
-    feeds = [
-        (u'El Pais' , u'http://elguardian.com.ar/RSS/el-pais.xml' )
-        ,(u'Columnistas' , u'http://elguardian.com.ar/RSS/columnistas.xml' )
-        ,(u'Personajes' , u'http://elguardian.com.ar/RSS/personajes.xml' )
-        ,(u'Tinta roja' , u'http://elguardian.com.ar/RSS/tinta-roja.xml' )
-        ,(u'Yo fui' , u'http://elguardian.com.ar/RSS/yo-fui.xml' )
-        ,(u'Ciencia' , u'http://elguardian.com.ar/RSS/ciencia.xml' )
-        ,(u'Cronicas' , u'http://elguardian.com.ar/RSS/cronicas.xml' )
-        ,(u'Culturas' , u'http://elguardian.com.ar/RSS/culturas.xml' )
-        ,(u'DxT' , u'http://elguardian.com.ar/RSS/dxt.xml' )
-        ,(u'Fierros' , u'http://elguardian.com.ar/RSS/fierros.xml' )
-        ,(u'Frente fashion', u'http://elguardian.com.ar/RSS/frente-fashion.xml')
-        ,(u'Pan y vino' , u'http://elguardian.com.ar/RSS/pan-y-vino.xml' )
-        ,(u'Turismo' , u'http://elguardian.com.ar/RSS/turismo.xml' )
-    ]
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://elguardian.com.ar/')
-        udata = soup.find('div', attrs={'class':'datosNumero'})
-        if udata:
-            sdata = udata.find('div')
-            if sdata:
-                stra = re.findall(r'\d+', self.tag_to_string(sdata))
-                self.conversion_options.update({'series_index':int(stra[1])})
-        unumero = soup.find('div', attrs={'class':'ultimoNumero'})
-        if unumero:
-            img = unumero.find('img', src=True)
-            if img:
-                return img['src']
-        return None
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll('a'):
-            limg = item.find('img')
-            if item.string is not None:
-                str = item.string
-                item.replaceWith(str)
-            else:
-                if limg:
-                    item.name = 'div'
-                    item.attrs = []
-                else:
-                    str = self.tag_to_string(item)
-                    item.replaceWith(str)
-        for item in soup.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
-        return soup
recipes/eltribuno_jujuy_impreso.recipe (new file, 126 lines)
@@ -0,0 +1,126 @@
+__license__ = 'GPL v3'
+__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
+'''
+http://www.eltribuno.info/jujuy/edicion_impresa.aspx
+'''
+
+import urllib
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict
+
+class ElTribunoJujuyImpreso(BasicNewsRecipe):
+    title = 'El Tribuno Jujuy (Edición Impresa)'
+    __author__ = 'Darko Miletic'
+    description = "Diario principal de Jujuy"
+    publisher = 'Horizontes S.A.'
+    category = 'news, politics, Jujuy, Argentina, World'
+    oldest_article = 2
+    language = 'es_AR'
+    max_articles_per_feed = 250
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf8'
+    publication_type = 'newspaper'
+    delay = 1
+    articles_are_obfuscated = True
+    temp_files = []
+    PREFIX = 'http://www.eltribuno.info/jujuy/'
+    INDEX = PREFIX + 'edicion_impresa.aspx'
+    PRINTURL = PREFIX + 'nota_print.aspx?%s'
+
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        , 'linearize_tables' : True
+    }
+
+    keep_only_tags = [dict(name='div' , attrs={'class':['notaHead', 'notaContent']})]
+    remove_tags = [
+        dict(name=['meta','iframe','base','object','embed','link','img']),
+        dict(name='ul', attrs={'class':'Tabs'})
+    ]
+
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        .notaHead h4{text-transform: uppercase; color: gray}
+        img{margin-top: 0.8em; display: block}
+    """
+
+    def parse_index(self):
+        feeds = OrderedDict()
+        soup = None
+        count = 0
+        while (count < 5):
+            try:
+                soup = self.index_to_soup(self.INDEX)
+                count = 5
+            except:
+                print "Retrying download..."
+                count += 1
+        if not soup:
+            return []
+        alink = soup.find('a', href=True, attrs={'class':'ZoomTapa'})
+        if alink and 'href' in alink:
+            self.cover_url = alink['href']
+        sections = soup.findAll('div', attrs={'id':lambda x: x and x.startswith('Ediciones')})
+        for section in sections:
+            section_title = 'Sin titulo'
+            sectiont=section.find('h3', attrs={'class':'NombreSeccion'})
+            if sectiont:
+                section_title = self.tag_to_string(sectiont.span)
+
+            arts = section.findAll('div', attrs={'class':'Noticia NoticiaAB1'})
+            for article in arts:
+                articles = []
+                title=self.tag_to_string(article.div.h3.a)
+                url=article.div.h3.a['href']
+                description=self.tag_to_string(article.p)
+                articles.append({'title':title, 'url':url, 'description':description, 'date':''})
+
+                if articles:
+                    if section_title not in feeds:
+                        feeds[section_title] = []
+                    feeds[section_title] += articles
+
+        ans = [(key, val) for key, val in feeds.iteritems()]
+        return ans
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            if item.string is not None:
+                str = item.string
+                item.replaceWith(str)
+            else:
+                str = self.tag_to_string(item)
+                item.replaceWith(str)
+        return soup
+
+    def get_masthead_title(self):
+        return 'El Tribuno'
+
+    def get_obfuscated_article(self, url):
+        count = 0
+        while (count < 10):
+            try:
+                response = self.browser.open(url)
+                html = response.read()
+                count = 10
+            except:
+                print "Retrying download..."
+                count += 1
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name
+
+    def print_version(self, url):
+        right = url.rpartition('/')[2]
+        artid = right.partition('-')[0]
+        params = {'Note':artid}
+        return (self.PRINTURL % urllib.urlencode(params))
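Both parse_index and get_obfuscated_article in this new recipe inline the same bounded retry loop, driven by a counter that is forced past the limit on success. A hypothetical refactoring (not part of the commit) that factors it into a single helper:

    def open_with_retries(browser, url, attempts=5):
        # Try the download a fixed number of times before giving up;
        # returns the response, or None if every attempt raised.
        for i in range(attempts):
            try:
                return browser.open(url)
            except Exception:
                print "Retrying download..."
        return None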
recipes/eltribuno_salta_impreso.recipe (new file, 126 lines)
@@ -0,0 +1,126 @@
+__license__ = 'GPL v3'
+__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
+'''
+http://www.eltribuno.info/salta/edicion_impresa.aspx
+'''
+
+import urllib
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import OrderedDict
+
+class ElTribunoSaltaImpreso(BasicNewsRecipe):
+    title = 'El Tribuno Salta (Edición Impresa)'
+    __author__ = 'Darko Miletic'
+    description = "Diario principal de Salta"
+    publisher = 'Horizontes S.A.'
+    category = 'news, politics, Salta, Argentina, World'
+    oldest_article = 2
+    language = 'es_AR'
+    max_articles_per_feed = 250
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf8'
+    publication_type = 'newspaper'
+    delay = 1
+    articles_are_obfuscated = True
+    temp_files = []
+    PREFIX = 'http://www.eltribuno.info/salta/'
+    INDEX = PREFIX + 'edicion_impresa.aspx'
+    PRINTURL = PREFIX + 'nota_print.aspx?%s'
+
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        , 'linearize_tables' : True
+    }
+
+    keep_only_tags = [dict(name='div' , attrs={'class':['notaHead', 'notaContent']})]
+    remove_tags = [
+        dict(name=['meta','iframe','base','object','embed','link','img']),
+        dict(name='ul', attrs={'class':'Tabs'})
+    ]
+
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        .notaHead h4{text-transform: uppercase; color: gray}
+        img{margin-top: 0.8em; display: block}
+    """
+
+    def parse_index(self):
+        feeds = OrderedDict()
+        soup = None
+        count = 0
+        while (count < 5):
+            try:
+                soup = self.index_to_soup(self.INDEX)
+                count = 5
+            except:
+                print "Retrying download..."
+                count += 1
+        if not soup:
+            return []
+        alink = soup.find('a', href=True, attrs={'class':'ZoomTapa'})
+        if alink and 'href' in alink:
+            self.cover_url = alink['href']
+        sections = soup.findAll('div', attrs={'id':lambda x: x and x.startswith('Ediciones')})
+        for section in sections:
+            section_title = 'Sin titulo'
+            sectiont=section.find('h3', attrs={'class':'NombreSeccion'})
+            if sectiont:
+                section_title = self.tag_to_string(sectiont.span)
+
+            arts = section.findAll('div', attrs={'class':'Noticia NoticiaAB1'})
+            for article in arts:
+                articles = []
+                title=self.tag_to_string(article.div.h3.a)
+                url=article.div.h3.a['href']
+                description=self.tag_to_string(article.p)
+                articles.append({'title':title, 'url':url, 'description':description, 'date':''})
+
+                if articles:
+                    if section_title not in feeds:
+                        feeds[section_title] = []
+                    feeds[section_title] += articles
+
+        ans = [(key, val) for key, val in feeds.iteritems()]
+        return ans
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            if item.string is not None:
+                str = item.string
+                item.replaceWith(str)
+            else:
+                str = self.tag_to_string(item)
+                item.replaceWith(str)
+        return soup
+
+    def get_masthead_title(self):
+        return 'El Tribuno'
+
+    def get_obfuscated_article(self, url):
+        count = 0
+        while (count < 10):
+            try:
+                response = self.browser.open(url)
+                html = response.read()
+                count = 10
+            except:
+                print "Retrying download..."
+                count += 1
+        tfile = PersistentTemporaryFile('_fa.html')
+        tfile.write(html)
+        tfile.close()
+        self.temp_files.append(tfile)
+        return tfile.name
+
+    def print_version(self, url):
+        right = url.rpartition('/')[2]
+        artid = right.partition('-')[0]
+        params = {'Note':artid}
+        return (self.PRINTURL % urllib.urlencode(params))
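This Salta recipe is identical to the Jujuy one above except for title, description, category and PREFIX. A hypothetical shared base class (not in the commit) would shrink each edition to a handful of attributes:

    class ElTribunoImpreso(BasicNewsRecipe):
        # Common machinery (parse_index, get_obfuscated_article,
        # print_version, ...) would live here, parameterized on PREFIX.
        PREFIX = None

    class ElTribunoSaltaImpreso(ElTribunoImpreso):
        title = 'El Tribuno Salta (Edición Impresa)'
        description = "Diario principal de Salta"
        category = 'news, politics, Salta, Argentina, World'
        PREFIX = 'http://www.eltribuno.info/salta/'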
@@ -1,16 +1,15 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.eluniversal.com
 '''
 
-from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
 class ElUniversal(BasicNewsRecipe):
     title = 'El Universal'
     __author__ = 'Darko Miletic'
-    description = 'Noticias de Venezuela'
+    description = 'Noticias de Venezuela y el mundo. Avances informativos de ultimo minuto. Incluye secciones de politica, deportes, economia y mas.'
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
@@ -21,7 +20,9 @@ class ElUniversal(BasicNewsRecipe):
     category = 'news, Caracas, Venezuela, world'
     language = 'es_VE'
     publication_type = 'newspaper'
-    cover_url = strftime('http://static.eluniversal.com/%Y/%m/%d/portada.jpg')
+    masthead_url = 'http://cdn.eluniversal.com/images/eu4/back/logo-eluniversal.gif'
+    #cover_url = strftime('http://cdn.eluniversal.com/%Y/%m/%d/portada.jpg')
+    cover_url = 'http://images.eluniversal.com//pdf/primeraPlana.pdf'
     extra_css = """
         .txt60{font-family: Tahoma,Geneva,sans-serif; font-size: small}
         .txt29{font-family: Tahoma,Geneva,sans-serif; font-size: small; color: gray}
@@ -30,10 +31,10 @@ class ElUniversal(BasicNewsRecipe):
         body{font-family: Verdana,Arial,Helvetica,sans-serif}
     """
     conversion_options = {
         'comments' : description
         ,'tags' : category
         ,'language' : language
         ,'publisher' : publisher
     }
 
     remove_tags_before=dict(attrs={'class':'header-print MB10'})
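The change above swaps a date-templated JPEG cover, built with calibre's strftime helper (now unused, hence the dropped import), for a static front-page PDF URL. For reference, the old pattern, kept commented in the new code, expands the format directives to the current date at recipe run time:

    from calibre import strftime
    # %Y/%m/%d become today's year, month and day when the recipe runs:
    cover_url = strftime('http://static.eluniversal.com/%Y/%m/%d/portada.jpg')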
@@ -1,85 +1,51 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class FocusRecipe(BasicNewsRecipe):
-
-    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class NYTimes(BasicNewsRecipe):
+
+    title = 'Focus'
+    __author__ = 'Krittika Goyal'
     language = 'pl'
-    version = 1
-    title = u'Focus'
-    publisher = u'Gruner + Jahr Polska'
-    category = u'News'
-    description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
-    category = 'magazine'
-    cover_url = ''
-    remove_empty_feeds = True
-    no_stylesheets = True
-    oldest_article = 7
-    max_articles_per_feed = 100000
-    recursions = 0
+    description = 'Polish scientific monthly magazine'
+    timefmt = ' [%d %b, %Y]'
+    needs_subscription = False
 
     no_stylesheets = True
-    remove_javascript = True
-    encoding = 'utf-8'
-    # Seems to work best, but YMMV
-    simultaneous_downloads = 5
-
-    r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
-
-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
-
-    extra_css = '''
-        body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
-        h1{text-align: left;}
-        h2{font-size: medium; font-weight: bold;}
-        p.lead {font-weight: bold; text-align: left;}
-        .authordate {font-size: small; color: #696969;}
-        .fot{font-size: x-small; color: #666666;}
-    '''
-
-    feeds = [
-        ('Nauka', 'http://www.focus.pl/nauka/rss/'),
-        ('Historia', 'http://www.focus.pl/historia/rss/'),
-        ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
-        ('Sport', 'http://www.focus.pl/sport/rss/'),
-        ('Technika', 'http://www.focus.pl/technika/rss/'),
-        ('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
-        ('Technologie', 'http://www.focus.pl/gadzety/rss/')
+    keep_only_tags = dict(name='article', attrs={'class': 'content'})
+    remove_tags_after = dict(name='div', attrs={'class': 'inner_article'})
+    remove_tags = [
+        dict(name='div', attrs={'class': ['social_btns']}),
     ]
 
-    def skip_ad_pages(self, soup):
-        if ('advertisement' in soup.find('title').string.lower()):
-            href = soup.find('a').get('href')
-            return self.index_to_soup(href, raw=True)
-        else:
-            return None
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
-        tag = soup.find(name='div', attrs={'class': 'clr fl'})
-        if tag:
-            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
-        return getattr(self, 'cover_url', self.cover_url)
-
-    def print_version(self, url):
-        if url.count('focus.pl.feedsportal.com'):
-            u = url.find('focus0Bpl')
-            u = 'http://www.focus.pl/' + url[u + 11:]
-            u = u.replace('0C', '/')
-            u = u.replace('A', '')
-            u = u.replace('0E', '-')
-            u = u.replace('/nc/1//story01.htm', '/do-druku/1')
-        else:
-            u = url.replace('/nc/1', '/do-druku/1')
-        return u
+    # TO GET ARTICLE TOC
+    def nejm_get_index(self):
+        return self.index_to_soup('http://www.focus.pl/')
+
+    # To parse artice toc
+    def parse_index(self):
+        soup = self.nejm_get_index()
+
+        toc = soup.find('div', id='wrapper')
+
+        articles = []
+        feeds = []
+        section_title = 'Focus Articles'
+        for x in toc.findAll(True):
+            if x.name == 'h1':
+                # Article found
+                a = x.find('a')
+                if a is None:
+                    continue
+                title = self.tag_to_string(a)
+                url = a.get('href', False)
+                if not url or not title:
+                    continue
+                # if url.startswith('story'):
+                url = 'http://www.focus.pl' + url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                articles.append({'title': title, 'url': url,
+                    'description': '', 'date': ''})
+        feeds.append((section_title, articles))
+
+        return feeds
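For orientation, parse_index in a calibre recipe must return a list of (section title, articles) pairs, each article a dict with title, url, date and description keys. The rewritten Focus recipe therefore produces, schematically (headline and URL below are placeholders):

    [('Focus Articles', [
        {'title': 'Some headline',
         'url': 'http://www.focus.pl/...',
         'description': '', 'date': ''},
        # ... one dict per <h1> link found under div#wrapper
    ])]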
@@ -46,35 +46,34 @@ class Frontlineonnet(BasicNewsRecipe):
 
     keep_only_tags= [
         dict(name='div', attrs={'id':'content'})
-        #,dict(attrs={'class':'byline'})
     ]
-    #remove_attributes=['size','noshade','border']
-
-    #def preprocess_html(self, soup):
-        #for item in soup.findAll(style=True):
-            #del item['style']
-        #for item in soup.findAll('img'):
-            #if not item.has_key('alt'):
-                #item['alt'] = 'image'
-        #return soup
+    remove_attributes=['size','noshade','border']
 
     def parse_index(self):
         articles = []
+        current_section = None
+        feeds = []
         soup = self.index_to_soup(self.INDEX)
-        for feed_link in soup.findAll('div', id='headseccol'):
-            a = feed_link.find('a', href=True)
-            title = self.tag_to_string(a)
-            url = a['href']
-            articles.append({
-                'title' :title
-                ,'date' :''
-                ,'url' :url
-                ,'description':''
-            })
-        return [('Frontline', articles)]
-
-    #def print_version(self, url):
-        #return "http://www.hinduonnet.com/thehindu/thscrip/print.pl?prd=fline&file=" + url.rpartition('/')[2]
-
-    #def image_url_processor(self, baseurl, url):
-        #return url.replace('../images/', self.INDEX + 'images/').strip()
+        for h3 in soup.findAll('h3'):
+            if h3.get('class', None) == 'artListSec':
+                if articles:
+                    feeds.append((current_section, articles))
+                    articles = []
+                current_section = self.tag_to_string(h3).strip()
+                self.log(current_section)
+            elif h3.get('id', None) in {'headseccol', 'headsec'}:
+                a = h3.find('a', href=True)
+                if a is not None:
+                    title = self.tag_to_string(a)
+                    url = a['href']
+                    articles.append({
+                        'title' :title
+                        ,'date' :''
+                        ,'url' :url
+                        ,'description':''
+                    })
+                    self.log('\t', title, url)
+        if articles:
+            feeds.append((current_section, articles))
+        return feeds
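The new parse_index is a single-pass header-grouping scan: walk the h3 tags in document order, start a new section at each section marker, and flush the accumulated articles whenever a new section begins (and once more after the loop). The skeleton of the pattern, with the marker tests stripped to placeholders:

    feeds, articles, current_section = [], [], None
    for h3 in soup.findAll('h3'):
        if h3.get('class', None) == 'artListSec':           # section header
            if articles:
                feeds.append((current_section, articles))
                articles = []
            current_section = self.tag_to_string(h3).strip()
        elif h3.get('id', None) in {'headseccol', 'headsec'}:  # article heading
            a = h3.find('a', href=True)
            if a is not None:
                articles.append({'title': self.tag_to_string(a), 'url': a['href'],
                                 'date': '', 'description': ''})
    if articles:                                            # final flush
        feeds.append((current_section, articles))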
@@ -14,19 +14,12 @@ class GalaxyEdge(BasicNewsRecipe):
 
     auto_cleanup = True
 
-    #keep_only_tags = [dict(id='content')]
-    #remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
-        #dict(id=['email-section', 'right-column', 'printfooter', 'topover',
-            #'slidebox', 'th_footer'])]
-
     extra_css = '.photo-caption { font-size: smaller }'
 
     def parse_index(self):
         soup = self.index_to_soup('http://www.galaxysedge.com/')
-        main = soup.find('table', attrs={'width':'911'})
-        toc = main.find('td', attrs={'width':'225'})
+        main = soup.find('table', attrs={'width':'944'})
+        toc = main.find('td', attrs={'width':'204'})
 
         current_section = None
         current_articles = []
@@ -68,41 +61,7 @@ class GalaxyEdge(BasicNewsRecipe):
                     current_articles.append({'title': title, 'url':url,
                         'description':'', 'date':''})
         if current_articles and current_section:
             feeds.append((current_section, current_articles))
 
         return feeds
-
-    #def preprocess_raw_html(self, raw, url):
-        #return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
-
-    #def postprocess_html(self, soup, first_fetch):
-        #for t in soup.findAll(['table', 'tr', 'td','center']):
-            #t.name = 'div'
-        #return soup
-
-    #def parse_index(self):
-        #today = time.strftime('%Y-%m-%d')
-        #soup = self.index_to_soup(
-            #'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
-        #div = soup.find(id='left-column')
-        #feeds = []
-        #current_section = None
-        #current_articles = []
-        #for x in div.findAll(['h3', 'div']):
-            #if current_section and x.get('class', '') == 'tpaper':
-                #a = x.find('a', href=True)
-                #if a is not None:
-                    #current_articles.append({'url':a['href']+'?css=print',
-                        #'title':self.tag_to_string(a), 'date': '',
-                        #'description':''})
-            #if x.name == 'h3':
-                #if current_section and current_articles:
-                    #feeds.append((current_section, current_articles))
-                #current_section = self.tag_to_string(x)
-                #current_articles = []
-        #return feeds
recipes/gamekult.recipe (new file, 36 lines)
@@ -0,0 +1,36 @@
+from __future__ import unicode_literals
+__license__ = 'GPL v3'
+__copyright__ = '2013, Malah <malah at neuf dot fr>'
+'''
+Gamekult.com
+'''
+
+__author__ = '2013, Malah <malah at neuf dot fr>'
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GamekultCom(BasicNewsRecipe):
+    title = u'Gamekult.com'
+    __author__ = 'Malah'
+    description = u'Toute l`actualité du jeu vidéo PC, consoles, mobiles.'
+    oldest_article = 1.5
+    language = 'fr'
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+    use_embedded_content = False
+    no_stylesheets = True
+    ignore_duplicate_articles = {'title', 'url'}
+    keep_only_tags = [dict(id=['story-page','story-body'])]
+    remove_tags = [
+        dict(name='div', attrs={'class':'sharebar'}),
+        dict(name='object', attrs={'type':'application/x-shockwave-flash'}),
+        dict(name='span', attrs={'class':'share'}),
+        dict(name='div', attrs={'class':'story-pagination'}),
+        dict(name='div', attrs={'class':'pagination pagination-centered'}),
+    ]
+
+    masthead_url = u'https://upload.wikimedia.org/wikipedia/fr/9/9c/Logo_-_GAMEKULT.png'
+
+    feeds = [
+        ('Test', u'http://www.gamekult.com/feeds/test.html'),
+        ('Actu', u'http://www.gamekult.com/feeds/actu.html'),
+    ]
recipes/glenn_greenwald.recipe (new file, 10 lines)
@@ -0,0 +1,10 @@
+from calibre.web.feeds.news import AutomaticNewsRecipe
+class BasicUserRecipe1373130920(AutomaticNewsRecipe):
+    title = u'Glenn Greenwald | guardian.co.uk'
+    language = 'en_GB'
+    __author__ = 'anywho'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    auto_cleanup = True
+
+    feeds = [(u'Latest', u'http://www.guardian.co.uk/profile/glenn-greenwald/rss')]
@@ -4,44 +4,29 @@ __copyright__ = 'Copyright 2010 Starson17'
 www.gocomics.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
-import re
 
 class GoComics(BasicNewsRecipe):
     title = 'Go Comics'
     __author__ = 'Starson17'
     __version__ = '1.06'
     __date__ = '07 June 2011'
-    description = u'200+ Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
+    description = u'200+ Comics - Customize for more days/comics: Defaults to 1 day, 25 comics - 20 general, 5 editorial.'
     category = 'news, comics'
     language = 'en'
-    use_embedded_content= False
     no_stylesheets = True
     remove_javascript = True
     remove_attributes = ['style']
 
-    ####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ########
+    # USER PREFERENCES - COMICS AND NUMBER OF COMICS TO RETRIEVE ########
     # num_comics_to_get - I've tried up to 99 on Calvin&Hobbes
     num_comics_to_get = 1
-    # comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large
-    comic_size = 900
     # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
     # Please do not overload their servers by selecting all comics and 1000 strips from each!
 
-    conversion_options = {'linearize_tables' : True
-        , 'comment' : description
-        , 'tags' : category
-        , 'language' : language
-    }
-
-    keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}),
-    ]
-
-    remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}),
-        dict(name='div', attrs={'class':['tag-wrapper']}),
-        dict(name='a', attrs={'href':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
-        dict(name='img', attrs={'src':re.compile(r'.*mutable_[0-9]+', re.IGNORECASE)}),
-        dict(name='ul', attrs={'class':['share-nav','feature-nav']}),
-    ]
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='div', id=lambda x: x and x.startswith('mutable_')),
+    ]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
@@ -50,7 +35,7 @@ class GoComics(BasicNewsRecipe):
 
     def parse_index(self):
         feeds = []
-        for title, url in [
+        for i, (title, url) in enumerate([ # {{{
            #(u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"),
            #(u"9 Chickweed Lane", u"http://www.gocomics.com/9chickweedlane"),
            #(u"Adam At Home", u"http://www.gocomics.com/adamathome"),
@@ -271,7 +256,7 @@ class GoComics(BasicNewsRecipe):
            (u"Strange Brew", u"http://www.gocomics.com/strangebrew"),
            (u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"),
            #
-           ######## EDITORIAL CARTOONS #####################
+           # EDITORIAL CARTOONS #####################
            #(u"Adam Zyglis", u"http://www.gocomics.com/adamzyglis"),
            #(u"Andy Singer", u"http://www.gocomics.com/andysinger"),
            #(u"Ben Sargent",u"http://www.gocomics.com/bensargent"),
@@ -363,81 +348,65 @@ class GoComics(BasicNewsRecipe):
            #(u"Walt Handelsman",u"http://www.gocomics.com/walthandelsman"),
            #(u"Wayne Stayskal",u"http://www.gocomics.com/waynestayskal"),
            #(u"Wit of the World",u"http://www.gocomics.com/witoftheworld"),
-           ]:
-            print 'Working on: ', title
+           ]): # }}}
+            self.log('Working on: ', title, url)
             articles = self.make_links(url)
             if articles:
                 feeds.append((title, articles))
+            if self.test and i > 0:
+                break
         return feeds
 
     def make_links(self, url):
         title = 'Temp'
         current_articles = []
-        pages = range(1, self.num_comics_to_get+1)
-        for page in pages:
+        if self.test:
+            self.num_comics_to_get = 2
+        num = self.num_comics_to_get
+        while num > 0:
+            num -= 1
             page_soup = self.index_to_soup(url)
-            if page_soup:
-                try:
-                    strip_title = page_soup.find(name='div', attrs={'class':'top'}).h1.a.string
-                except:
-                    strip_title = 'Error - no Title found'
-                try:
-                    date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
-                    if not date_title:
-                        date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
-                except:
-                    date_title = 'Error - no Date found'
-                title = strip_title + ' - ' + date_title
-                for i in range(2):
-                    try:
-                        strip_url_date = page_soup.find(name='div', attrs={'class':'top'}).h1.a['href']
-                        break # success - this is normal exit
-                    except:
-                        strip_url_date = None
-                        continue # try to get strip_url_date again
-                for i in range(2):
-                    try:
-                        prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href']
-                        break # success - this is normal exit
-                    except:
-                        prev_strip_url_date = None
-                        continue # try to get prev_strip_url_date again
-                if strip_url_date:
-                    page_url = 'http://www.gocomics.com' + strip_url_date
-                else:
-                    continue
-                if prev_strip_url_date:
-                    prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date
-                else:
-                    continue
+            if not page_soup:
+                break
+            content = page_soup.find(id='content')
+            if content is None:
+                break
+            feature = content.find(name='div', attrs={'class':'feature'})
+            feature_nav = content.find(name='ul', attrs={'class':'feature-nav'})
+            if feature is None or feature_nav is None:
+                break
+            try:
+                a = feature.find('h1').find('a', href=True)
+            except:
+                self.log.exception('Failed to find current page link')
+                break
+            page_url = a['href']
+            if page_url.startswith('/'):
+                page_url = 'http://www.gocomics.com' + page_url
+            try:
+                strip_title = self.tag_to_string(feature.find('h1').find('a', href=True))
+            except:
+                strip_title = 'Error - no Title found'
+            try:
+                date_title = self.tag_to_string(feature_nav.find('li'))
+            except:
+                date_title = 'Error - no Date found'
+            title = strip_title + ' - ' + date_title
             current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
-            url = prev_page_url
+            a = feature_nav.find('a', href=True, attrs={'class':'prev'})
+            if a is None:
+                break
+            url = a['href']
+            if url.startswith('/'):
+                url = 'http://www.gocomics.com' + url
         current_articles.reverse()
         return current_articles
 
     def preprocess_html(self, soup):
-        if soup.title:
-            title_string = soup.title.string.strip()
-            _cd = title_string.split(',',1)[1]
-            comic_date = ' '.join(_cd.split(' ', 4)[0:-1])
-        if soup.h1.span:
-            artist = soup.h1.span.string
-            soup.h1.span.string.replaceWith(comic_date + artist)
-        feature_item = soup.find('p',attrs={'class':'feature_item'})
-        if feature_item.a:
-            a_tag = feature_item.a
-            a_href = a_tag["href"]
-            img_tag = a_tag.img
-            img_tag["src"] = a_href
-            img_tag["width"] = self.comic_size
-            img_tag["height"] = None
-        return self.adeify_images(soup)
-
-    extra_css = '''
-        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-        img {max-width:100%; min-width:100%;}
-        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
+        headings = soup.findAll('h1')
+        for h1 in headings[1:]:
+            h1.extract()
+        self.adeify_images(soup)
+        return soup
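The rewritten make_links walks backwards through the strip archive by following the 'prev' link num_comics_to_get times. A hypothetical standalone helper (not part of the commit) showing just that navigation skeleton:

    def walk_prev_links(self, start_url, count):
        # Collect one archive page per iteration, then follow the
        # 'prev' link until it runs out or count is exhausted.
        urls, url = [], start_url
        for _ in range(count):
            soup = self.index_to_soup(url)
            urls.append(url)
            a = soup.find('a', href=True, attrs={'class': 'prev'})
            if a is None:
                break  # reached the oldest strip
            url = a['href']
            if url.startswith('/'):
                url = 'http://www.gocomics.com' + url
        urls.reverse()  # oldest first, matching current_articles.reverse()
        return urls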
@@ -47,13 +47,7 @@ class GN(BasicNewsRecipe):
         return feeds
 
     def find_articles(self, main_block):
-        for a in main_block.findAll('div', attrs={'class':'prev_doc2'}):
-            art = a.find('a')
-            yield {
-                'title' : self.tag_to_string(art),
-                'url' : 'http://www.gosc.pl' + art['href']
-            }
-        for a in main_block.findAll('div', attrs={'class':'sr-document'}):
+        for a in main_block.findAll('div', attrs={'class':['prev_doc2', 'sr-document']}):
             art = a.find('a')
             yield {
                 'title' : self.tag_to_string(art),
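The simplification above relies on BeautifulSoup's attribute matching: passing a list as an attribute value matches an element whose attribute equals any of the listed values, so the two identical loops collapse into one. For example:

    # Matches both <div class="prev_doc2"> and <div class="sr-document">:
    main_block.findAll('div', attrs={'class': ['prev_doc2', 'sr-document']})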
@@ -39,10 +39,10 @@ class HBR(BasicNewsRecipe):
             br.visit('https://hbr.org/login?request_url=/', timeout=20)
         except Timeout:
             pass
-        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
-        f = br.select_form('#signin-form')
-        f['signin-form:username'] = username
-        f['signin-form:password'] = password
+        br.click('#form-wrapper h3[tabindex="0"]', wait_for_load=False)
+        f = br.select_form('#login-form')
+        f['username'] = username
+        f['password'] = password
         br.submit(wait_for_load=False)
         br.run_for_a_time(30)
 
@@ -56,7 +56,8 @@ class HBR(BasicNewsRecipe):
         articles = []
         for x in soup.find(id='issueFeaturesContent').findAll(['li', 'h4']):
             if x.name == 'h4':
-                if x.get('class', None) == 'basic':continue
+                if x.get('class', None) == 'basic':
+                    continue
                 if current_section is not None and articles:
                     feeds.append((current_section, articles))
                 current_section = self.tag_to_string(x).capitalize()
@@ -64,7 +65,8 @@ class HBR(BasicNewsRecipe):
                 self.log('\tFound section:', current_section)
             else:
                 a = x.find('a', href=True)
-                if a is None: continue
+                if a is None:
+                    continue
                 title = self.tag_to_string(a)
                 url = a['href']
                 if '/ar/' not in url:
@@ -90,11 +92,11 @@ class HBR(BasicNewsRecipe):
     def parse_index(self):
         soup0 = self.index_to_soup('http://hbr.org/magazine')
         datencover = soup0.find('ul', attrs={'id':'magazineArchiveCarousel'}).findAll('li')[-1]
-        #find date & cover
+        # find date & cover
         self.cover_url=datencover.img['src']
         dates=self.tag_to_string(datencover.img['alt'])
         self.timefmt = u' [%s]'%dates
-        soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs = {'class':'magazine_page'}).a['href'])
+        soup = self.index_to_soup(self.INDEX + soup0.find('div', attrs={'class':'magazine_page'}).a['href'])
         feeds = self.hbr_parse_toc(soup)
         return feeds
@@ -1,44 +0,0 @@
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
-
-'''
-Fetch High Country News - Blogs
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
-class HighCountryNewsBlogs(BasicNewsRecipe):
-
-    title = u'High Country News - Blogs'
-    description = u'High Country News - Blogs (RSS Version)'
-    __author__ = 'Armin Geller' # 2012-08-01
-    publisher = 'High Country News'
-    category = 'news, politics, Germany'
-    timefmt = ' [%a, %d %b %Y]'
-    language = 'en'
-    encoding = 'UTF-8'
-    publication_type = 'newspaper'
-    oldest_article = 7
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    auto_cleanup = True
-    remove_javascript = True
-    use_embedded_content = False
-    masthead_url = 'http://www.hcn.org/logo.jpg'
-    cover_source = 'http://www.hcn.org'
-
-    def get_cover_url(self):
-        cover_source_soup = self.index_to_soup(self.cover_source)
-        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
-        return preview_image_div.div.img['src']
-
-    feeds = [
-        (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
-
-        (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
-        (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
-        (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),
-    ]
-
-    def print_version(self, url):
-        return url
@@ -1,6 +1,12 @@
 # -*- coding: utf-8 -*-
+#
+# Written: 2012-01-28
+# Last Edited: 2013-09-06
+# Remark: Version 1.3
+# Update cleanup for new web article design
+#
 __license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
+__copyright__ = '2013, Armin Geller'

 '''
 Fetch High Country News
@@ -9,35 +15,77 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class HighCountryNews(BasicNewsRecipe):

     title = u'High Country News'
-    description = u'News from the American West'
-    __author__ = 'Armin Geller' # 2012-01-31
+    description = u'High Country News (RSS Version)'
+    __author__ = 'Armin Geller'
     publisher = 'High Country News'
+    category = 'news, politics'
     timefmt = ' [%a, %d %b %Y]'
     language = 'en'
     encoding = 'UTF-8'
     publication_type = 'newspaper'
-    oldest_article = 7
+    oldest_article = 14
     max_articles_per_feed = 100
     no_stylesheets = True
-    auto_cleanup = True
+    auto_cleanup = False
     remove_javascript = True
+    remove_empty_feeds = True
     use_embedded_content = False
-    masthead_url = 'http://www.hcn.org/logo.jpg' # 2012-01-31 AGe add
-    cover_source = 'http://www.hcn.org' # 2012-01-31 AGe add
-
-    def get_cover_url(self): # 2012-01-31 AGe add
-        cover_source_soup = self.index_to_soup(self.cover_source)
-        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
-        return preview_image_div.div.img['src']
+    masthead_url = 'http://www.hcn.org/logo.jpg'
+    cover_source = 'http://www.hcn.org'
+
+    def get_cover_url(self):
+        cover_source_soup = self.index_to_soup(self.cover_source)
+        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
+        return preview_image_div.div.img['src']

     feeds = [
-        (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
-        (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),
+        (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent?format=xml'),
+        (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue?format=xml'),

+        (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
+        (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
+        (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
+        (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),

         (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
         (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
     ]

-    def print_version(self, url):
-        return url + '/print_view'
+    # 2013-07-23 AGe New coding w/o using print_version

+    keep_only_tags = [
+        dict(name='div', attrs={'id':['content']}),
+    ]

+    remove_tags = [
+        dict(name='div', attrs={'class':['documentActions supercedeDocumentActions editorialDocumentActions',
+            'documentActions supercedeDocumentActions editorialDocumentActions editorialFooterDocumentActions',
+            'article-sidebar',
+            'image-viewer-controls nojs',
+            'protectedArticleWrapper',
+            'visualClear',
+            'feed-icons', # 2013-09-06 AGe add
+            'PayWallEmail', # 2013-09-06 AGe add
+        ]}),
+        dict(name='div', attrs={'id':['offer-below-locked-article']}), # 2013-09-06 AGe add
+    ]

+    INDEX = ''
+    def append_page(self, soup, appendtag, position):
+        pager = soup.find('span',attrs={'class':'next'})
+        if pager:
+            nexturl = self.INDEX + pager.a['href']
+            soup2 = self.index_to_soup(nexturl)
+            texttag = soup2.find('div', attrs={'class':'article-text'})
+            newpos = len(texttag.contents)
+            self.append_page(soup2,texttag,newpos)
+            texttag.extract()
+            appendtag.insert(position,texttag)

+    def preprocess_html(self, soup):
+        self.append_page(soup, soup.body, 3)
+        pager = soup.find('div',attrs={'class':'listingBar listingBar-article'})
+        if pager:
+            pager.extract()
+        return self.adeify_images(soup)
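Note on the hunk above: the updated High Country News recipe drops print_view URLs in favor of in-place pagination. The pattern recurs in several calibre recipes: follow the pager's "next" link, recurse, and splice each page's body back into the first page. A minimal sketch of that pattern (the class name and base URL are illustrative, not part of this commit; the selector names are the ones this recipe uses):

    from calibre.web.feeds.news import BasicNewsRecipe

    class PaginatedExample(BasicNewsRecipe):
        # Hypothetical recipe, for illustration only.
        title = 'Paginated example'
        INDEX = 'http://example.com'  # placeholder base URL

        def append_page(self, soup, appendtag, position):
            # Follow the 'next' pager link, if any, and fetch the next page.
            pager = soup.find('span', attrs={'class': 'next'})
            if pager:
                soup2 = self.index_to_soup(self.INDEX + pager.a['href'])
                texttag = soup2.find('div', attrs={'class': 'article-text'})
                # Recurse first so deeper pages land in reading order.
                self.append_page(soup2, texttag, len(texttag.contents))
                texttag.extract()
                appendtag.insert(position, texttag)

        def preprocess_html(self, soup):
            self.append_page(soup, soup.body, 3)
            return soup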
@@ -1,41 +1,206 @@
 #!/usr/bin/env python
-# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
+'''
+chron.com
+'''
+import re, time
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.utils.date import dt_factory, local_tz
+from datetime import datetime, timedelta, date
+from lxml import html
-
-from calibre.web.feeds.news import BasicNewsRecipe

 class HoustonChronicle(BasicNewsRecipe):

     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Dale Furrow'
     language = 'en'
-    timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
-    use_embedded_content = False
+    # use_embedded_content = False
     remove_attributes = ['style']
-    auto_cleanup = True
-    oldest_article = 3.0
-    #keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
-    #'hst-articletext' in x or 'hst-galleryitem' in x)}
+    remove_empty_feeds = True
+    timefmt = '[%a, %d %b %Y]'
+    timestampfmt = '%Y%m%d%H%M%S'
+    ignore_duplicate_articles = {'url'}
     remove_attributes = ['xmlns']

-    feeds = [
-        ('News', "http://www.chron.com/rss/feed/News-270.php"),
-        ('Sports',
-         'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'),
-        ('Neighborhood',
-         'http://www.chron.com/rss/feed/Neighborhood-305.php'),
-        ('Business', 'http://www.chron.com/rss/feed/Business-287.php'),
-        ('Entertainment',
-         'http://www.chron.com/rss/feed/Entertainment-293.php'),
-        ('Editorials',
-         'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'),
-        ('Life', 'http://www.chron.com/rss/feed/Life-297.php'),
-        ('Science & Tech',
-         'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'),
-    ]
+    remove_tags = [dict(name='div', attrs={'class':'socialBar'}),
+                   dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
+                   dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}),
+                   dict(name='div', attrs={'class':'entry-summary'}),
+                   dict(name='a', attrs={'rel':'item-license'})]

+    baseUrl = 'http://www.chron.com'

+    oldest_web_article = 7.0

+    if oldest_web_article is None:
+        earliest_date = date.today()
+    else:
+        earliest_date = date.today() - timedelta(days=oldest_web_article)

+    pages = [('news' , '/news/houston-texas/'),
+             ('business' , '/business/'),
+             ('opinion', '/opinion/'),
+             ('sports', '/sports/')]

+    def getLinksFromSectionPage(self, sectionUrl):
+        pageDoc = html.parse(sectionUrl)
+        els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
+            or @class='scp-feature' or contains(@class, 'simplelist')
+            or contains(@class, 'scp-blogpromo')]
+            //a[@href and not(@target) and not(child::img)]""")
+        elList = []
+        for el in els:
+            link = el.get('href')
+            title = el.text
+            if link[:4] != 'http':
+                link = self.baseUrl + link
+            if title is not None:
+                elList.append((link, el.text))
+        return elList

+    def getArticleDescriptionFromDoc(self, pageDoc):
+        descriptionCharsBreak = 140
+        descriptionMaxChars = 300
+        descXpath = """//div[contains(@class, 'article-body') or
+            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
+        sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)")

+        def stringify_children(node):
+            return ''.join([x for x in node.itertext()])
+        try:
+            els = pageDoc.xpath(descXpath)
+            outText = ""
+            ellipsis = ""
+            for el in els:
+                sentences = re.findall(sentenceRegex, stringify_children(el))
+                for sentence in sentences:
+                    if len(outText) < descriptionCharsBreak:
+                        outText += sentence + " "
+                    else:
+                        if len(outText) > descriptionMaxChars:
+                            ellipsis = "..."
+                        return outText[:descriptionMaxChars] + ellipsis
+            return outText
+        except:
+            self.log('Error on Article Description')
+            return ""

+    def getPublishedTimeFromDoc(self, pageDoc):
+        regexDateOnly = re.compile("""(?:January|February|March|April|
+            May|June|July|August|September|October|November|
+            December)\s[0-9]{1,2},\s20[01][0-9]""")
+        regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+        def getRegularTimestamp(dateString):
+            try:
+                outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
+                return outDate
+            except:
+                return None
+        def getDateFromString(inText):
+            match = re.findall(regexDateOnly, inText)
+            if match:
+                try:
+                    outDate = datetime.strptime(match[0], "%B %d, %Y")
+                    match = re.findall(regextTimeOnly, inText)
+                    if match:
+                        outTime = datetime.strptime(match[0], "%I:%M %p")
+                        return datetime.combine(outDate.date(), outTime.time())
+                    return outDate
+                except:
+                    return None
+            else:
+                return None
+        el = pageDoc.xpath("//*[@class='timestamp'][1]")
+        if len(el) == 1:
+            return getRegularTimestamp(el[0].get('title'))
+        else:
+            el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
+            if len(el) == 1:
+                return getDateFromString(el[0].text_content())
+            else:
+                return None

+    def getAllFeedDataFromPage(self, page):
+        articles = []
+        linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
+        self.log('from section: ', page[0], " found ", len(linkList), " links")
+        for link in linkList:
+            try:
+                articleDoc = html.parse(link[0])
+                description = self.getArticleDescriptionFromDoc(articleDoc)
+                articleDate = self.getPublishedTimeFromDoc(articleDoc)
+                if articleDate is not None and description is not None and articleDate.date() > self.earliest_date:
+                    dateText = articleDate.strftime('%a, %d %b')
+                    author = articleDate.strftime(self.timestampfmt)
+                    articles.append({'title':link[1], 'url':link[0],
+                        'description':description, 'date':dateText, 'author':author})
+                    self.log(page[0] + ": " + link[1] + ', from ' + dateText +
+                        " description of " + str(len(description)) + ' characters at ' + link[0])
+                else:
+                    msg = ""
+                    if articleDate is None:
+                        msg = " No Timestamp Found"
+                    else:
+                        msg = " article older than " + str(self.oldest_web_article) + ' days...'
+                    self.log("Skipping article: ", link[0], msg)
+            except:
+                print 'error on fetching ' + link[0]
+                continue
+        return articles

+    def parse_index(self):

+        self.timefmt = ' [%a, %d %b, %Y]'
+        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
+        feeds = []
+        for page in self.pages:
+            articles = []
+            articles = self.getAllFeedDataFromPage(page)
+            if articles:
+                feeds.append((page[0], articles))
+        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
+        return feeds

+    def preprocess_html(self, thisSoup):
+        baseTags = []
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')}))
+        baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'}))
+        allTags = []
+        allTags.extend(baseTags)
+        if len(baseTags) > 0:
+            for tag in baseTags:
+                allTags.extend(tag.findAll(True))
+        paragraphs = thisSoup.findAll(name='p')
+        for paragraph in paragraphs:
+            if paragraph not in allTags:
+                allTags.append(paragraph)
+        for tag in baseTags:
+            while tag.parent is not None:
+                allTags.append(tag)
+                tag = tag.parent
+        for tag in thisSoup.findAll(True):
+            if tag not in allTags:
+                tag.extract()
+        return thisSoup

+    def populate_article_metadata(self, article, soup, first):
+        if not first:
+            return
+        try:
+            article.date = time.strptime(article.author, self.timestampfmt)
+            article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
+            article.localtime = article.utctime.astimezone(local_tz)
+        except Exception as inst: # remove after debug
+            self.log('Exception: ', article.title) # remove after debug
+            self.log(type(inst)) # remove after debug
+            self.log(inst) # remove after debug
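One detail of the Houston Chronicle rewrite above deserves a gloss: the article dictionaries built in parse_index carry no machine-readable date field of their own, so the recipe packs a formatted timestamp into each article's 'author' slot and unpacks it again in populate_article_metadata. A condensed sketch of just that hand-off (the helper names here are illustrative, not from the commit):

    import time
    from calibre.utils.date import dt_factory, local_tz

    timestampfmt = '%Y%m%d%H%M%S'  # same format string the recipe uses

    def pack_timestamp(article_dict, when):
        # Stash the datetime where the feed framework will carry it verbatim.
        article_dict['author'] = when.strftime(timestampfmt)

    def unpack_timestamp(article):
        # Recover the struct_time and rebuild calibre's timezone-aware dates.
        parsed = time.strptime(article.author, timestampfmt)
        article.utctime = dt_factory(parsed, assume_utc=False, as_utc=False)
        article.localtime = article.utctime.astimezone(local_tz)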
New binary files (recipe icons):

  recipes/icons/acrimed.png (709 B)
  recipes/icons/diagonales.png (4.8 KiB)
  recipes/icons/eltribuno_jujuy_impreso.png (592 B)
  recipes/icons/eltribuno_salta_impreso.png (592 B)
  recipes/icons/lacapital.png (1.1 KiB)
  recipes/icons/le_monde_diplomatique_fr.png (446 B)
  recipes/icons/le_monde_sub.png (510 B)
  recipes/icons/miradasalsur.png (1.1 KiB)
recipes/il_cambiamento.recipe (new file, 12 lines)
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IC(BasicNewsRecipe):
+    title = u'il Cambiamento'
+    oldest_article = 12
+    max_articles_per_feed = 50
+    language = 'it'
+    __author__ = 'ghib9'
+    auto_cleanup = True
+    use_embedded_content = False
+
+    feeds = [(u'il Cambiamento', u'http://www.ilcambiamento.it/rss.xml')]
recipes/il_foglio.recipe (new file, 16 lines)
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1373969939(BasicNewsRecipe):
+    title = u'Il Foglio - Editoriali'
+    oldest_article = 1
+    max_articles_per_feed = 10
+    auto_cleanup = False
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'sec_item'})
+    ]
+    feeds = [(u'Il Foglio - Editoriali', u'http://feed43.com/8814237344800115.xml')]
+    no_stylesheets = True
+    __author__ = 'faber1971'
+    description = 'Leading articles from an Italian newspaper - v1.00 (16 July, 2013)'
+    language = 'it'
+    masthead_url = 'http://www.ilfoglio.it/media/img/interface/logo_testata_small.gif'
@@ -1,504 +1,34 @@
 # adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString


 class TheIndependentNew(BasicNewsRecipe):

-    # flag to enable/disable article graphics on business pages/some others
-    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
-    # -max dimensions can be altered using the .pictureContainer img selector in the css
-    _FETCH_ARTICLE_GRAPHICS = True
-
-    #Flag to enable/disable image fetching (not business)
-    _FETCH_IMAGES = True
-
-    #Set max gallery images here (respects _FETCH_IMAGES)
-    # -1 for infinite
-    _MAX_GALLERY_IMAGES = -1
-
-    #used for converting rating to stars
+    # used for converting rating to stars
     _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
     _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'

     title = u'The Independent'
-    __author__ = 'Will'
+    __author__ = 'Krittika Goyal'
     description = 'The latest in UK News and World News from The \
         Independent. Wide range of international and local news, sports \
         news, commentary and opinion pieces.Independent News - Breaking news \
         that matters. Your daily comprehensive news source - The \
         Independent Newspaper'
     publisher = 'The Independent'
+    oldest_article = 2.0
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
     category = 'news, UK'
     no_stylesheets = True
     use_embedded_content = False
     remove_empty_feeds = True
+    auto_cleanup = True
     language = 'en_GB'
     publication_type = 'newspaper'
     masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
     encoding = 'utf-8'
     compress_news_images = True
-    remove_tags =[
-        dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
-        dict(attrs={'class' : ['autoplay','openBiogPopup']}),
-        dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
-        dict(name='img',attrs={'alt' : ['view gallery']}),
-        dict(attrs={'style' : re.compile('.*')}),
-        dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
-        ]
-
-    keep_only_tags =[dict(attrs={'id':['main','top']})]
-    recursions = 0
-
-    # fixes non compliant html nesting and 'marks' article graphics links
-    preprocess_regexps = [
-        (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
-         lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
-        (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
-         lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
-        ]
-
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-        }
-
-    extra_css = """
-        h1{font-family: Georgia,serif ; font-size: x-large; }
-        body{font-family: Verdana,Arial,Helvetica,sans-serif}
-        img{margin-bottom: 0.4em; display:block}
-        .starRating img {float: left}
-        .starRating {margin-top:0.4em; display: block}
-        .image {clear:left; font-size: x-small; color:#888888;}
-        .articleByTimeLocation {font-size: x-small; color:#888888;
-            margin-bottom:0.2em ; margin-top:0.2em ; display:block}
-        .subtitle {clear:left ;}
-        .column-1 h1 { color: #191919}
-        .column-1 h2 { color: #333333}
-        .column-1 h3 { color: #444444}
-        .subtitle { color: #777777; font-size: medium;}
-        .column-1 a,h1,h2,h3 { margin: 0; }
-        .column-1 div{margin: 0;}
-        .articleContent {display: block; clear:left;}
-        .articleContent {color: #000000; font-size: medium;}
-        .ivDrip-section {color: #000000; font-size: medium;}
-        .datetime {color: #888888}
-        .title {font-weight:bold;}
-        .storyTop{}
-        .pictureContainer img { max-width: 400px; max-height: 400px;}
-        .image img { max-width: 400px; max-height: 400px;}
-        """
-
-    oldest_article = 1
-    max_articles_per_feed = 100
-
-    _processed_urls = []
-
-    def get_article_url(self, article):
-        url = super(self.__class__,self).get_article_url(article)
-
-        title = article.get('title', None)
-        if title and re.search("^Video:",title):
-            return None
-
-        #remove duplicates
-        if not (url in self._processed_urls):
-            self._processed_urls.append(url)
-        else:
-            url = None
-        return url
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
-
-    def preprocess_html(self, soup):
-
-        #remove 'advertorial articles'
-        strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
-        if strapline:
-            for para in strapline.findAll('p'):
-                if len(para.contents) and isinstance(para.contents[0],NavigableString) \
-                    and para.contents[0] == 'ADVERTORIAL FEATURE':
-                    return None
-
-        # remove Suggested Topics
-        items_to_extract = []
-
-        for item in soup.findAll('div',attrs={'class' : re.compile('.*RelatedArtTag.*')}):
-            items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-        slideshow_elements = []
-
-        for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
-            remove = True
-            pattern = re.compile('((articleContent)|(title))$')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            # corrections
-            # story content always good
-            pattern = re.compile('storyContent')
-            if (pattern.search(item['class'])) is not None:
-                remove = False
-
-            #images
-            pattern = re.compile('slideshow')
-            if (pattern.search(item['class'])) is not None:
-                if self._FETCH_IMAGES:
-                    remove = False
-                    slideshow_elements.append(item)
-                else:
-                    remove = True
-
-            #social widgets always bad
-            pattern = re.compile('socialwidget')
-            if (pattern.search(item['class'])) is not None:
-                remove = True
-
-            if remove:
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-        if self._FETCH_IMAGES:
-            for element in slideshow_elements:
-                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
-                    if item.img is not None:
-                        #use full size image
-                        images = []
-
-                        img = item.findNext('img')
-
-                        if not '?action=gallery' in item['href']:
-                            img['src'] = item['href']
-                            tag = Tag(soup,'h3')
-                            text = ''
-                            try:
-                                text = img['data-title']
-                            except:
-                                pass
-
-                            if img.get('title') and (len(img['title']) > 1):
-                                text = NavigableString(img['title'])
-                            tag.insert(0,text)
-                            images.append((img, tag))
-                        else:
-                            gallery_images, remove_link = self._get_gallery_images(item['href'])
-                            images = images + gallery_images
-                            if remove_link:
-                                gal_link = soup.find('a',attrs={'id' : 'view-gallery'})
-                                if gal_link:
-                                    gal_link.extract()
-                        img.extract()
-                        for (img, title) in images:
-                            #insert caption if available
-                            if title:
-                                #picture before text
-                                img.extract()
-                                item.insert(0,img)
-                                item.insert(1,title)
-
-                        # remove link
-                        item.name = "div"
-                        item["class"]='image'
-                        del item["href"]
-
-        #remove empty subtitles
-        """
-        currently the subtitle is located in first paragraph after
-        sibling <h3 class="subtitle"> tag. This may be 'fixed' at
-        some point.
-        """
-        subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
-        if subtitle is not None:
-            subtitleText = subtitle.findNext('p')
-            if subtitleText is not None:
-                if len(subtitleText.contents[0]) <= 1 :
-                    subtitleText.extract()
-                    subtitle.extract()
-
-        #replace rating numbers with stars
-        for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
-            if item is not None:
-                soup2 = self._insertRatingStars(soup,item)
-                if soup2 is not None:
-                    soup = soup2
-
-        #remove empty paragraph tags in storyTop which can leave a space
-        #between first paragraph and rest of story
-        nested_content = False
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        for item in storyTop.findAll('p'):
-            for nested in item:
-                if isinstance(nested, Tag):
-                    nested_content = True
-                    break
-            if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
-                items_to_extract.append(item)
-
-        for item in items_to_extract:
-            item.extract()
-
-        items_to_extract = []
-
-        #remove line breaks immediately next to tags with default margins
-        #to prevent double line spacing and narrow columns of text
-        storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
-        self._remove_undesired_line_breaks_from_tag(storyTop,soup)
-
-        #replace article graphics link with the graphics themselves
-        if self._FETCH_ARTICLE_GRAPHICS:
-            items_to_insert = []
-            for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
-                strong = item.find('strong')
-                if not strong:
-                    continue
-                for child in strong:
-                    if isinstance(child,Tag):
-                        if str(child.name) == 'a':
-                            items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
-
-            for item in items_to_insert:
-                item[0].replaceWith(item[1])
-
-        for item in items_to_extract:
-            item.extract()
-
-        return soup
-
-    def _get_article_graphic(self,old_item,url,soup):
-
-        items_to_insert = []
-
-        if re.search('\.jpg$',str(url)):
-            div = Tag(soup,'div')
-            div['class'] = 'pictureContainer'
-            img = Tag(soup,'img')
-            img['src'] = url
-            img['alt'] = 'article graphic'
-            div.insert(0,img)
-            items_to_insert.append((old_item,div,))
-            return items_to_insert
-
-        soup2 = self.index_to_soup(url)
-        for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
-            items_to_insert.append((old_item,item),)
-        return items_to_insert
-
-    def _insertRatingStars(self,soup,item):
-        if item.contents is None or len(item.contents) < 1:
-            return
-        rating = item.contents[0]
-
-        try:
-            rating = float(item.contents[0])
-        except:
-            print 'Could not convert decimal rating to star: malformatted float.'
-            return
-        for i in range(1,6):
-            star = Tag(soup,'img')
-            if i <= rating:
-                star['src'] = self._STAR_URL
-            else:
-                star['src'] = self._NO_STAR_URL
-            star['alt'] = 'star number ' + str(i)
-            item.insert(i,star)
-        #item.contents[0] = NavigableString('(' + str(rating) + ')')
-        item.contents[0] = ''
-
-    def postprocess_html(self,soup, first_fetch):
-
-        #mark subtitle parent as non-compliant nesting causes
-        # p's to be 'popped out' of the h3 tag they are nested in.
-        subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
-        subtitle_div = None
-        if subtitle:
-            subtitle_div = subtitle.parent
-        if subtitle_div:
-            clazz = ''
-            if 'class' in subtitle_div:
-                clazz = subtitle_div['class'] + ' '
-            clazz = clazz + 'subtitle'
-            subtitle_div['class'] = clazz
-
-        #find broken images and remove captions
-        items_to_extract = []
-        for item in soup.findAll('div', attrs={'class' : 'image'}):
-            img = item.findNext('img')
-            if img and img.get('src'):
-                # broken images still point to remote url
-                pattern = re.compile('http://www.independent.co.uk.*')
-                if pattern.match(img["src"]) is not None:
-                    caption = img.findNextSibling('h3')
-                    if caption is not None:
-                        items_to_extract.append(caption)
-                    items_to_extract.append(img)
-
-        for item in items_to_extract:
-            item.extract()
-
-        # nickredding's fix for non-justified text
-        for ptag in soup.findAll('p',attrs={'align':'left'}):
-            del(ptag['align'])
-
-        return soup
-
-    def _get_gallery_images(self,url):
-        gallery_soup = self.index_to_soup(url)
-        images = []
-        remove_link = True
-        total = 1
-        try:
-            counter = gallery_soup.find('div',attrs={'id' : ['counter']})
-            total = counter.contents[0].split('/')
-            total = int(total[1].rstrip())
-        except:
-            total = 1
-
-        if self._MAX_GALLERY_IMAGES >= 0 and total > self._MAX_GALLERY_IMAGES:
-            total = self._MAX_GALLERY_IMAGES
-            remove_link = False
-
-        for i in range(1, total +1):
-            image, title = self._get_image_from_gallery(gallery_soup)
-            if image:
-                images.append((image,title))
-            next = url + '&ino=' + str(i + 1)
-            gallery_soup = self.index_to_soup(next)
-        images.reverse()
-        return images, remove_link
-
-    def _get_image_from_gallery(self,soup):
-        try:
-            container = soup.find('div',attrs={'id' : ['main-image']})
-            image = container.find('img')
-            if image:
-                title = soup.find('div',attrs={'id' : ['image-title']})
-                return image, title
-        except:
-            print 'error fetching gallery image'
-            return None
-
-    def _recurisvely_linearise_tag_tree(
-        self,
-        item,
-        linearised= None,
-        count=0,
-        limit = 100
-        ):
-        linearised = linearised or []
-        count = count + 1
-        if count > limit:
-            return linearised
-        if not (isinstance(item,Tag)):
-            return linearised
-        for nested in item:
-            linearised.append(nested)
-            linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
-        return linearised
-
-    def _get_previous_tag(self,current_index, tag_tree):
-        if current_index == 0:
-            return None
-        else:
-            return tag_tree[current_index - 1]
-
-    def _get_next_tag(self,current_index, tag_tree):
-        if current_index < len(tag_tree) - 1:
-            return tag_tree[current_index + 1]
-        else:
-            return None
-
-    def _list_match(self,test_str, list_regex):
-        for regex in list_regex:
-            match = re.match(regex, test_str)
-            if match is not None:
-                return True
-        return False
-
-    def _remove_undesired_line_breaks_from_tag(self,parent,soup):
-
-        if parent is None:
-            return
-
-        tag_tree = self._recurisvely_linearise_tag_tree(parent)
-        items_to_remove = []
-
-        for item in tag_tree:
-            if item == u'\n':
-                items_to_remove.append(item)
-                continue;
-
-        for item in items_to_remove:
-            tag_tree.remove(item)
-
-        spaced_tags = [r'p', r'h\d', r'blockquote']
-        tags_to_extract = []
-        tags_to_replace = []
-        for (i, tag) in enumerate(tag_tree):
-            if isinstance(tag, Tag):
-                if str(tag) == '<br />':
-                    previous_tag = self._get_previous_tag(i, tag_tree)
-
-                    if isinstance(previous_tag, Tag):
-                        previous_tag_is_spaced = previous_tag is not None\
-                            and self._list_match(str(previous_tag.name),
-                            spaced_tags)
-                    else:
-                        previous_tag_is_spaced = False
-
-                    next_tag = self._get_next_tag(i, tag_tree)
-
-                    if isinstance(next_tag, Tag):
-                        next_tag_is_spaced = next_tag is not None\
-                            and self._list_match(str(next_tag.name), spaced_tags)
-                    else:
-                        next_tag_is_spaced = False
-
-                    if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
-                        or i == len(tag_tree) - 1:
-                        tags_to_extract.append(tag)
-                    else:
-                        tags_to_replace.append((tag,NavigableString(' '),))
-
-        for pair in tags_to_replace:
-            pair[0].replaceWith(pair[1])
-        for tag in tags_to_extract:
-            tag.extract()

     feeds = [
         (u'News - UK',
@@ -610,3 +140,4 @@ class TheIndependentNew(BasicNewsRecipe):
          u'http://www.independent.co.uk/extras/indybest/?service=rss'),
     ]
+
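The Independent hunk above removes roughly 470 lines of hand-written soup surgery (gallery scraping, star-rating images, line-break pruning) in favor of calibre's built-in heuristics. The declarative style it moves to looks roughly like this sketch (the class name and feed URL are placeholders, not from this commit):

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class MinimalCleanupExample(BasicNewsRecipe):
        # Hypothetical recipe illustrating the declarative style.
        title = 'Minimal cleanup example'
        oldest_article = 2.0
        ignore_duplicate_articles = {'title', 'url'}
        remove_empty_feeds = True
        # auto_cleanup runs readability-style content extraction, replacing
        # hand-maintained keep_only_tags/remove_tags and preprocess_html hooks.
        auto_cleanup = True
        feeds = [('Example section', 'http://example.com/rss')]  # placeholder feed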
@@ -33,8 +33,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
         (u'Instapaper Starred', u'http://www.instapaper.com/starred')
     ]

-    #Adds the title tag to the body of the recipe. Use this if your articles miss headings.
-    add_title_tag = False;
+    # Adds the title tag to the body of the recipe. Use this if your articles miss headings.
+    add_title_tag = False

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
@@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
             br.select_form(nr=0)
             br['username'] = self.username
             if self.password is not None:
                 br['password'] = self.password
             br.submit()
         return br

@@ -55,7 +55,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
             self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
             articles = []
             soup = self.index_to_soup(feedurl)
-            for item in soup.findAll('div', attrs={'class':'cornerControls'}):
+            for item in soup.findAll('div', attrs={'class':'title_row'}):
                 #description = self.tag_to_string(item.div)
                 atag = item.a
                 if atag and atag.has_key('href'):
@@ -73,10 +73,10 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
         article.title = soup.find('title').contents[0].strip()

     def postprocess_html(self, soup, first_fetch):
-        #adds the title to each story, as it is not always included
+        # adds the title to each story, as it is not always included
         if self.add_title_tag:
             for link_tag in soup.findAll(attrs={"id" : "story"}):
                 link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')

-        #print repr(soup)
+        # print repr(soup)
         return soup
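The get_browser hunk above follows calibre's standard authenticated-recipe idiom: start from the stock mechanize browser, open the login page, fill the first form, submit. A self-contained sketch of the idiom (the class name and login URL are assumed for illustration; the hunk itself does not show them):

    from calibre.web.feeds.news import BasicNewsRecipe

    class LoginExample(BasicNewsRecipe):
        # Hypothetical recipe, for illustration only.
        title = 'Login example'
        needs_subscription = True
        LOGIN = 'http://example.com/user/login'  # assumed login URL

        def get_browser(self):
            br = BasicNewsRecipe.get_browser(self)
            if self.username is not None and self.password is not None:
                br.open(self.LOGIN)
                br.select_form(nr=0)  # first form on the page
                br['username'] = self.username
                br['password'] = self.password
                br.submit()
            return br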
@@ -1,5 +1,4 @@
-__license__ = 'GPL v3'
-__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2011-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.iprofesional.com
 '''
@@ -19,13 +18,15 @@ class iProfesional(BasicNewsRecipe):
     use_embedded_content = False
     language = 'es_AR'
     remove_empty_feeds = True
-    publication_type = 'nesportal'
-    masthead_url = 'http://www.iprofesional.com/img/logo-iprofesional.png'
+    publication_type = 'newsportal'
+    masthead_url = 'http://www.iprofesional.com/img/header/logoiprofesional.png'
     extra_css = """
-        body{font-family: Arial,Helvetica,sans-serif }
+        body{font-family: 'Droid Sans',Arial,sans-serif }
         img{margin-bottom: 0.4em; display:block}
-        .titulo-interior{font-family: Georgia,"Times New Roman",Times,serif}
-        .autor-nota{font-size: small; font-weight: bold; font-style: italic; color: gray}
+        .titulo{font-family: WhitneyBoldWhitneyBold,Arial,Helvetica,sans-serif; color: blue}
+        .fecha-archivo{font-weight: bold; color: rgb(205, 150, 24)}
+        .description{font-weight: bold; color: gray }
+        .firma{font-size: small}
     """

     conversion_options = {
@@ -35,27 +36,21 @@ class iProfesional(BasicNewsRecipe):
     , 'language' : language
     }

-    keep_only_tags = [dict(attrs={'class':['fecha','interior-nota']})]
-    remove_tags = [
-        dict(name=['meta','link','base','embed','object','iframe'])
-        ,dict(attrs={'class':['menu-imprimir','guardarNota','IN-widget','fin','permalink']})
-    ]
-    remove_attributes=['lang','xmlns:og','xmlns:fb']
+    keep_only_tags = [dict(attrs={'class':'desarrollo'})]
+    remove_tags = [dict(name=['meta','link','base','embed','object','iframe'])]

     feeds = [
         (u'Ultimas noticias' , u'http://feeds.feedburner.com/iprofesional-principales-noticias')
-        ,(u'Finanzas' , u'http://feeds.feedburner.com/iprofesional-finanzas' )
-        ,(u'Impuestos' , u'http://feeds.feedburner.com/iprofesional-impuestos' )
-        ,(u'Negocios' , u'http://feeds.feedburner.com/iprofesional-economia' )
-        ,(u'Comercio Exterior' , u'http://feeds.feedburner.com/iprofesional-comercio-exterior' )
-        ,(u'Tecnologia' , u'http://feeds.feedburner.com/iprofesional-tecnologia' )
-        ,(u'Management' , u'http://feeds.feedburner.com/iprofesional-managment' )
-        ,(u'Marketing' , u'http://feeds.feedburner.com/iprofesional-marketing' )
-        ,(u'Legales' , u'http://feeds.feedburner.com/iprofesional-legales' )
-        ,(u'Autos' , u'http://feeds.feedburner.com/iprofesional-autos' )
-        ,(u'Vinos' , u'http://feeds.feedburner.com/iprofesional-vinos-bodegas' )
+        ,(u'Finanzas' , u'http://feeds.feedburner.com/iprofesional-finanzas')
+        ,(u'Impuestos' , u'http://feeds.feedburner.com/iprofesional-impuestos')
+        ,(u'Negocios' , u'http://feeds.feedburner.com/iprofesional-economia')
+        ,(u'Comercio Exterior' , u'http://feeds.feedburner.com/iprofesional-comercio-exterior')
+        ,(u'Tecnologia' , u'http://feeds.feedburner.com/iprofesional-tecnologia')
+        ,(u'Management' , u'http://feeds.feedburner.com/iprofesional-managment')
+        ,(u'Marketing' , u'http://feeds.feedburner.com/iprofesional-marketing')
+        ,(u'Legales' , u'http://feeds.feedburner.com/iprofesional-legales')
+        ,(u'Autos' , u'http://feeds.feedburner.com/iprofesional-autos')
+        ,(u'Vinos' , u'http://feeds.feedburner.com/iprofesional-vinos-bodegas')
     ]

     def preprocess_html(self, soup):
@@ -64,16 +59,17 @@ class iProfesional(BasicNewsRecipe):
         for item in soup.findAll('a'):
             limg = item.find('img')
             if item.string is not None:
                 str = item.string
                 item.replaceWith(str)
             else:
                 if limg:
                     item.name = 'div'
                     item.attrs = []
                 else:
                     str = self.tag_to_string(item)
                     item.replaceWith(str)
         for item in soup.findAll('img'):
-            if not item.has_key('alt'):
+            if 'alt' not in item:
                 item['alt'] = 'image'
         return soup
@@ -6,29 +6,23 @@ class JakartaGlobe(BasicNewsRecipe):
     max_articles_per_feed = 100

     feeds = [
-        (u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
-        (u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
-        (u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
-        (u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
-        (u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
-        (u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
+        (u'News', u'http://www.thejakartaglobe.com/news/feed/'),
+        (u'Business', u'http://www.thejakartaglobe.com/business/feed/'),
+        (u'Opinion', u'http://www.thejakartaglobe.com/opinion/feed/'),
+        (u'Count me in', u'http://www.thejakartaglobe.com/count-me-in/feed/'),
+        (u'International', u'http://www.thejakartaglobe.com/international/feed/'),
+        (u'Sports', u'http://www.thejakartaglobe.com/sports/feed/'),
     ]
     __author__ = 'rty'
     pubisher = 'JakartaGlobe.com'
     description = 'JakartaGlobe, Indonesia, Newspaper'
     category = 'News, Indonesia'

     remove_javascript = True
     use_embedded_content = False
     no_stylesheets = True
+    auto_cleanup = True
     language = 'en_ID'
     encoding = 'utf-8'
     conversion_options = {'linearize_tables':True}
     masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
-    keep_only_tags = [
-        dict(name='div', attrs={'class':'story'}),
-        dict(name='span', attrs={'class':'headline'}),
-        dict(name='div', attrs={'class':'story'}),
-        dict(name='p', attrs={'id':'bodytext'})
-    ]
@@ -27,12 +27,11 @@ class JakartaPost(BasicNewsRecipe):
     use_embedded_content = False
     no_javascript = True
     remove_empty_feeds = True
+    auto_cleanup = True

     timefmt = ' [%A, %d %B, %Y]'
     encoding = 'utf-8'

-    keep_only_tags = [dict(name='div', attrs ={'id':'news-main'})]
-
     extra_css = '''
         h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
         .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
@@ -51,10 +50,6 @@ class JakartaPost(BasicNewsRecipe):
         body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
     '''

-    remove_tags = [
-        dict(name='div', attrs ={'class':['text-size']}),
-    ]
-
     feeds = [
         (u'Breaking News', u'http://www.thejakartapost.com/breaking/feed'),
recipes/jeuxvideo.recipe (new file, 47 lines)
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+__license__ = 'GPL v3'
+__copyright__ = '2013, Malah <malah at neuf dot fr>'
+'''
+JeuxVideo.com
+'''
+
+__author__ = '2013, Malah <malah at neuf dot fr>'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JeuxVideoCom(BasicNewsRecipe):
+    title = 'JeuxVideo.com'
+    __author__ = 'Malah'
+    description = 'La Référence des Jeux Vidéo sur PC et Consoles !'
+    oldest_article = 1.5
+    language = 'fr'
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+    use_embedded_content = False
+    no_stylesheets = True
+    ignore_duplicate_articles = {'title', 'url'}
+    keep_only_tags = [dict(id=['news_detail','test_txt','test_avis'])]
+    remove_tags = [
+        dict(name='div', attrs={'id':'player_video_article'}),
+        dict(name='div', attrs={'class':'liste-fiches'})
+    ]
+    masthead_url = u'https://upload.wikimedia.org/wikipedia/commons/3/39/Jeuxvideocom.png'
+    feeds = [
+        (u'Section PC',u'http://www.jeuxvideo.com/rss/rss-pc.xml'),
+        (u'Section Xbox One',u'http://www.jeuxvideo.com/rss/rss-xo.xml'),
+        (u'Section PlayStation 4',u'http://www.jeuxvideo.com/rss/rss-ps4.xml'),
+        (u'Section Xbox 360',u'http://www.jeuxvideo.com/rss/rss-360.xml'),
+        (u'Section PlayStation 3',u'http://www.jeuxvideo.com/rss/rss-ps3.xml'),
+        (u'Section Wii U',u'http://www.jeuxvideo.com/rss/rss-wiiu.xml'),
+        (u'Section Wii',u'http://www.jeuxvideo.com/rss/rss-wii.xml'),
+        (u'Section Nintendo 3DS',u'http://www.jeuxvideo.com/rss/rss-3ds.xml'),
+        (u'Section Nintendo DS',u'http://www.jeuxvideo.com/rss/rss-ds.xml'),
+        (u'Section PlayStation Vita',u'http://www.jeuxvideo.com/rss/rss-vita.xml'),
+        (u'Section PlayStation Protable',u'http://www.jeuxvideo.com/rss/rss-psp.xml'),
+        (u'Section Android',u'http://www.jeuxvideo.com/rss/rss-android.xml'),
+        (u'Section Iphone',u'http://www.jeuxvideo.com/rss/rss-iphone.xml'),
+        (u'Section Web',u'http://www.jeuxvideo.com/rss/rss-wb.xml'),
+        (u'Autres news', u'http://www.jeuxvideo.com/rss/rss-news.xml'),
+        (u'Autres vidéos', u'http://www.jeuxvideo.com/rss/rss-videos.xml'),
+        (u'Autres articles', u'http://www.jeuxvideo.com/rss/rss.xml'),
+    ]
recipes/jot_down.recipe (new file, 69 lines)
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals
+__license__ = 'GPL v3'
+__copyright__ = '23 June 2013, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Contemporary Culture Magazine'
+__version__ = 'v0.01'
+__date__ = '23, June 2013'
+'''
+http://www.jotdown.es/
+'''
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class jotdown(BasicNewsRecipe):
+    author = 'desUBIKado'
+    description = 'Revista digital con magníficos y extensos artículos'
+    title = u'Jot Down - Contemporary Culture Magazine'
+    publisher = 'Wabi Sabi Investments, S.C.'
+    category = 'Opinion, culture, science, movies, TV shows, music, blogs'
+    language = 'es'
+    timefmt = '[%a, %d %b, %Y]'
+    oldest_article = 7
+    delay = 1
+    max_articles_per_feed = 20
+    masthead_url = 'http://www.jotdown.es/wp-content/uploads/2011/04/logoJotDown.png'
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+
+    feeds = [
+        (u'Portada', u'http://www.jotdown.es/feed/')
+    ]
+
+    keep_only_tags = [dict(name='div', attrs={'class':['single']}),
+        dict(name='div', attrs={'id':['comments']}),
+    ]
+
+    remove_tags = [dict(name='a', attrs={'href':['http://alternativaseconomicas.coop/']}),
+        dict(name='div', attrs={'class':['reply','after-meta','comment-author vcard']}),
+        dict(name='div', attrs={'align':['center']}),
+        dict(name='span', attrs={'class':['fbreplace']}),
+        dict(name='div', attrs={'id':'respond'})
+    ]
+
+    remove_tags_after = dict(name='div' , attrs={'id':'respond'})
+
+    extra_css = '''
+        .comment-list {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;}
+    '''
+
+    preprocess_regexps = [
+        # To present the image of the embedded video
+        (re.compile(r'<object type="application/x-shockwave-flash" data="http://www.youtube.com/v',
+            re.DOTALL|re.IGNORECASE), lambda match: '<img src="http://img.youtube.com/vi'),
+        (re.compile(r'&rel=0&fs=1"', re.DOTALL|re.IGNORECASE), lambda match: '/0.jpg"><object'),
+        # To remove the link of the category
+        (re.compile(r'<div class="meta">', re.DOTALL|re.IGNORECASE), lambda match: '<div class="meta"><!-- '),
+        (re.compile(r'</a>, <a href="http://www.jotdown.es/category', re.DOTALL|re.IGNORECASE), lambda match: ', <!--'),
+        (re.compile(r'"category tag">', re.DOTALL|re.IGNORECASE), lambda match: '--> '),
+        (re.compile(r'</a> —', re.DOTALL|re.IGNORECASE), lambda match: ''),
+        # To remove the link of the title
+        (re.compile(r'<h1> <a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1> <div class="'),
+        (re.compile(r'</a> </h1>', re.DOTALL|re.IGNORECASE), lambda match: '</div> </h1>')
+    ]
@@ -20,7 +20,7 @@ class crnews(BasicNewsRecipe):
     no_stylesheets = True

-    feeds = [(u'Portada', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=portada'), (u'Ultima Hora', u'http://www.nacion.com/Generales/RSS/UltimaHoraRss.aspx'), (u'Nacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=elpais'), (u'Entretenimiento', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=entretenimiento'), (u'Sucesos', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=sucesos'), (u'Deportes', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=deportes'), (u'Internacionales', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=mundo'), (u'Economia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=economia'), (u'Aldea Global', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=aldeaglobal'), (u'Tecnologia', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=tecnologia'), (u'Opinion', u'http://www.nacion.com/Generales/RSS/EdicionRss.aspx?section=opinion')]
+    feeds = [(u'Portada', u'http://www.nacion.com/rss/'), (u'Ultima Hora', u'http://www.nacion.com/rss/latest/'), (u'Nacionales', u'http://www.nacion.com/rss/nacional/'), (u'Entretenimiento', u'http://www.nacion.com/rss/ocio/'), (u'Sucesos', u'http://www.nacion.com/rss/sucesos/'), (u'Deportes', u'http://www.nacion.com/rss/deportes/'), (u'Internacionales', u'http://www.nacion.com/rss/mundo/'), (u'Economia', u'http://www.nacion.com/rss/economia/'), (u'Vivir', u'http://www.nacion.com/rss/vivir/'), (u'Tecnologia', u'http://www.nacion.com/rss/tecnologia/'), (u'Opinion', u'http://www.nacion.com/rss/opinion/')]

     def get_cover_url(self):
         index = 'http://kiosko.net/cr/np/cr_nacion.html'
76
recipes/lacapital.recipe
Normal file
@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
www.lacapital.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe

class LaCapital(BasicNewsRecipe):
    title = 'La Capital de Rosario'
    __author__ = 'Darko Miletic'
    description = 'Noticias, actualidad y toda la informacion de Rosario y la region'
    publisher = 'Diario La Capital S. A.'
    category = 'news, politics, Rosario, Santa Fe, Argentina'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'es_AR'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.lacapital.com.ar/system/modules/com.tfsla.diario.core/resources/images/logoLaCapital_noCom.png'
    extra_css = """
        body{font-family: Georgia,"Times New Roman",Times,serif }
        img{margin-bottom: 0.4em; display:block}
    """

    conversion_options = {
        'comment'   : description,
        'tags'      : category,
        'publisher' : publisher,
        'language'  : language
    }

    keep_only_tags = [dict(attrs={'class':'leer'})]
    remove_tags_after = dict(attrs={'class':'notaA'})
    remove_tags = [
        dict(name=['meta','link','iframe','object']),
        dict(name='div', attrs={'class':['herramientas','almargen','relacionadas']})
    ]

    # 'Economia' appeared twice in the feed list; the duplicate has been dropped
    feeds = [
        (u'Portada',             u'http://www.lacapital.com.ar/rss/home.xml'),
        (u'La Ciudad',           u'http://www.lacapital.com.ar/rss/laciudad.xml'),
        (u'Politica',            u'http://www.lacapital.com.ar/rss/politica.xml'),
        (u'Economia',            u'http://www.lacapital.com.ar/rss/economia.xml'),
        (u'La Region',           u'http://www.lacapital.com.ar/rss/laregion.xml'),
        (u'Informacion General', u'http://www.lacapital.com.ar/rss/informaciongral.xml'),
        (u'El Mundo',            u'http://www.lacapital.com.ar/rss/elmundo.xml'),
        (u'Opinion',             u'http://www.lacapital.com.ar/rss/opinion.xml'),
        (u'Cartas de lectores',  u'http://www.lacapital.com.ar/rss/cartasdelectores.xml'),
        (u'Escenario',           u'http://www.lacapital.com.ar/rss/escenario.xml'),
        (u'Policiales',          u'http://www.lacapital.com.ar/rss/policiales.xml'),
        (u'Ovacion',             u'http://www.lacapital.com.ar/rss/ovacion.xml'),
        (u'Turismo',             u'http://www.lacapital.com.ar/rss/turismo.xml'),
        (u'Señales',             u'http://www.lacapital.com.ar/rss/senales.xml'),
        (u'Educacion',           u'http://www.lacapital.com.ar/rss/educacion.xml'),
        (u'Estilo',              u'http://www.lacapital.com.ar/rss/estilo.xml'),
        (u'Salud',               u'http://www.lacapital.com.ar/rss/salud.xml'),
        (u'Tecnologia',          u'http://www.lacapital.com.ar/rss/tecnologia.xml')
    ]

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.lacapital.com.ar/impresa/tapa.html')
        for image in soup.findAll('img', alt=True):
            if image['alt'].startswith('Tapa de papel'):
                return image['src']
        return None

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@ -13,6 +13,8 @@ class LamebookRecipe(BasicNewsRecipe):
     language = 'en'
     use_embedded_content = False
     publication_type = 'blog'
+    reverse_article_order = True
+    encoding = 'utf-8'

     keep_only_tags = [
         dict(name='div', attrs={'class':'entry'})
34
recipes/le_gorafi.recipe
Normal file
@ -0,0 +1,34 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
Le GORAFI.fr
'''

__author__ = '2013, Malah <malah at neuf dot fr>'
from calibre.web.feeds.news import BasicNewsRecipe

class legorafi(BasicNewsRecipe):
    title = u'Le GORAFI.fr'
    __author__ = 'Malah'
    description = u'Depuis 1826, toute l\'information de sources contradictoires'
    oldest_article = 7
    language = 'fr'
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    keep_only_tags = [
        dict(name='div', attrs={'class':'entry-content'}),
        dict(name='h3', attrs={'id':'comments-title'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'id':'soshake-sharebox'}),
        dict(name='div', attrs={'class':'social-ring'}),
        dict(name='div', attrs={'class':'entry-utility'}),
        dict(name='div', attrs={'id':'respond'}),
    ]
    masthead_url = u'http://web.gweno.free.fr/img/logositeter.png'
    couverture_url = u'http://www.legorafi.fr/wp-content/uploads/2013/02/iconegorafi.png'
    feeds = [
        (u'Articles', u'http://www.legorafi.fr/feed/'),
    ]
111
recipes/le_monde_diplomatique_fr.recipe
Normal file
@ -0,0 +1,111 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2013'
'''
monde-diplomatique.fr
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index

class LeMondeDiplomatiqueSiteWeb(BasicNewsRecipe):
    title = u'Le Monde diplomatique.fr'
    __author__ = 'Gaëtan Lehmann'
    description = "Le Monde diplomatique est un mensuel français d’information et d’opinion à la ligne éditoriale nettement engagée en faveur d'une gauche de rupture avec le capitalisme. Il aborde de nombreux sujets — géopolitique, relations internationales, économie, questions sociales, écologie, culture, médias, …"  # noqa
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    publisher = 'monde-diplomatique.fr'
    category = 'news, France, world'
    language = 'fr'
    masthead_url = 'http://www.monde-diplomatique.fr/squelettes/images/logotyfa.png'
    timefmt = ' [%d %b %Y]'
    no_stylesheets = True

    feeds = [(u'Blogs', u'http://blog.mondediplo.net/spip.php?page=backend'), (u'Archives', u'http://www.monde-diplomatique.fr/rss/')]

    preprocess_regexps = [
        (re.compile(r'<title>(.*) - Les blogs du Diplo</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) - Les blogs du Diplo</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<title>(.*) \(Le Monde diplomatique\)</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) \(Le Monde diplomatique\)</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<h3>Grand format</h3>'), lambda m: '')]

    remove_tags = [dict(name='div', attrs={'class':'voiraussi liste'}),
                   dict(name='ul', attrs={'class':'hermetique carto hombre_demi_inverse'}),
                   dict(name='a', attrs={'class':'tousles'}),
                   dict(name='h3', attrs={'class':'cat'}),
                   dict(name='div', attrs={'class':'logodiplo'}),
                   dict(name='img', attrs={'class':'spip_logos'}),
                   dict(name='p', attrs={'id':'hierarchie'}),
                   dict(name='div', attrs={'class':'espace'})]

    conversion_options = {
        'comments'  : description,
        'tags'      : category,
        'language'  : language,
        'publisher' : publisher,
        'linearize_tables': True
    }

    remove_empty_feeds = True

    filterDuplicates = True

    # Don't override parse_index: it must raise an exception when there is no
    # index, so that the RSS feeds are still used. Instead, the results of the
    # two index parsers below are mixed with the feed results in parse_feeds().
    def parse_index_valise(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/carnet/')
        cnt = soup.find('ul', attrs={'class':'hermetique liste'})
        for item in cnt.findAll('li'):
            description = ''
            feed_link = item.find('a')
            desc = item.find('div', attrs={'class':'intro'})
            date = item.find('div', attrs={'class':'dates_auteurs'})
            if desc:
                description = desc.string
            if feed_link and feed_link.has_key('href'):
                url = 'http://www.monde-diplomatique.fr' + feed_link['href']
                title = self.tag_to_string(feed_link)
                articles.append({
                    'title'      : title,
                    'date'       : date.string.strip(),
                    'url'        : url,
                    'description': description,
                })
        return [("La valise diplomatique", articles)]

    def parse_index_cartes(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/cartes/')
        cnt = soup.find('div', attrs={'class':'decale hermetique'})
        for item in cnt.findAll('div', attrs={'class':re.compile('grid_3 filet hombre_demi')}):
            feed_link = item.find('a', attrs={'class':'couve'})
            h3 = item.find('h3')
            authorAndDate = item.find('div', attrs={'class':'dates_auteurs'})
            author, date = authorAndDate.string.strip().split(', ')
            if feed_link and feed_link.has_key('href'):
                url = 'http://www.monde-diplomatique.fr' + feed_link['href']
                title = self.tag_to_string(h3)
                articles.append({
                    'title'      : title,
                    'date'       : date,
                    'url'        : url,
                    'description': author,
                })
        return [("Cartes", articles)]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        valise = feeds_from_index(self.parse_index_valise(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        cartes = feeds_from_index(self.parse_index_cartes(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        feeds = valise + feeds + cartes
        return feeds
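The two parse_index_* helpers above rely on calibre's feeds_from_index, which turns their (section title, list of article dicts) pairs into the same Feed objects that parse_feeds obtains from the RSS URLs, so the three sources can simply be concatenated, with "La valise diplomatique" first and "Cartes" last.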
@ -2,7 +2,7 @@
 __author__ = 'Sylvain Durand <sylvain.durand@ponts.org>'
 __license__ = 'GPL v3'

-import time
+import time, re

 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
@ -13,7 +13,7 @@ class LeMonde(BasicNewsRecipe):

     title = u'Le Monde: Édition abonnés'
     __author__ = 'Sylvain Durand'
-    description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
+    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
     language = 'fr'
     encoding = 'utf8'
@ -65,26 +65,38 @@ class LeMonde(BasicNewsRecipe):
         url = time.strftime(self.journal_url,self.date)
         soup = self.index_to_soup(url).sommaire
         sections = []
-        for sec in soup.findAll("section"):
-            articles = []
-            if sec['cahier'] != "Le Monde":
-                for col in sec.findAll("fnts"):
-                    col.extract()
-                if sec['cahier']=="Le Monde Magazine":
-                    continue
-            for art in sec.findAll("art"):
-                if art.txt.string and art.ttr.string:
-                    if art.find(['url']):
-                        art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>')
-                    if art.find(['lgd']) and art.find(['lgd']).string:
-                        art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>')
-                    article = "<html><head></head><body>"+unicode(art)+"</body></html>"
-                    article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ')
-                    article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>')
-                    f = PersistentTemporaryFile()
-                    f.write(article)
-                    articles.append({'title':art.ttr.string,'url':"file:///"+f.name})
-            sections.append((sec['nom'], articles))
+        try:
+            for sec in soup.findAll("section"):
+                articles = []
+                if sec['cahier'] != "Le Monde":
+                    for col in sec.findAll("fnts"):
+                        col.extract()
+                    if sec['cahier']=="Le Monde Magazine":
+                        continue
+                for art in sec.findAll("art"):
+                    if art.txt.string and art.ttr.string:
+                        if art.find(['url']):
+                            art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>')
+                        if art.find(['lgd']) and art.find(['lgd']).string:
+                            art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>')
+
+                        def guillemets(match):
+                            if match.group(1) == u"=":
+                                return match.group(0)
+                            return u'%s« %s »' % (match.group(1), match.group(2))
+
+                        article = "<html><head></head><body>"+unicode(art)+"</body></html>"
+                        article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ')
+                        article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>')
+                        article = article.replace("'" , u'\u2019')
+                        article = re.sub('(.|^)"([^"]+)"', guillemets, article)
+
+                        f = PersistentTemporaryFile()
+                        f.write(article)
+                        articles.append({'title':art.ttr.string,'url':"file:///"+f.name})
+                sections.append((sec['nom'], articles))
+        except AttributeError:
+            self.log("Vos identifiants sont incorrects, ou votre abonnement LeMonde.fr ne vous permet pas de télécharger le journal.")
         return sections

     def preprocess_html(self, soup):
@ -92,3 +104,4 @@ class LeMonde(BasicNewsRecipe):
             lgd.contents[-1].extract()
         return soup
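A quick check of the guillemets helper added to the Le Monde recipe above: re.sub('(.|^)"([^"]+)"', guillemets, u'He said "hi"') yields u'He said « hi »', while an HTML attribute such as ="foo" is left untouched because the character captured before the opening quote is '='.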
49
recipes/le_nouvel_observateur.recipe
Normal file
@ -0,0 +1,49 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Malah <malah at neuf dot fr>'
'''
Le Nouvel Observateur
'''

__author__ = '2013, Malah <malah at neuf dot fr>'

from calibre.web.feeds.news import BasicNewsRecipe

class LeNouvelObs(BasicNewsRecipe):
    title = u'Le Nouvel Observateur'
    __author__ = 'Malah'
    description = u'Actualités en temps réel, Info à la Une'
    oldest_article = 1
    language = 'fr'
    max_articles_per_feed = 25
    use_embedded_content = False
    ignore_duplicate_articles = ('title', 'url')
    remove_empty_feeds = True
    no_stylesheets = True
    masthead_url = u'https://upload.wikimedia.org/wikipedia/fr/f/f9/Le_Nouvel_observateur.png'
    feeds = [
        (u'Politique', u'http://tempsreel.nouvelobs.com/politique/rss.xml'),
        (u'Société', u'http://tempsreel.nouvelobs.com/societe/rss.xml'),
        (u'Monde', u'http://tempsreel.nouvelobs.com/monde/rss.xml'),
        (u'Economie', u'http://tempsreel.nouvelobs.com/economie/rss.xml'),
        (u'Culture', u'http://tempsreel.nouvelobs.com/culture/rss.xml'),
        (u'High Tech', u'http://obsession.nouvelobs.com/high-tech/rss.xml'),
        (u'Education', u'http://tempsreel.nouvelobs.com/education/rss.xml'),
        (u'Services', u'http://tempsreel.nouvelobs.com/services/rss.xml'),
        (u'Sport', u'http://tempsreel.nouvelobs.com/sport/rss.xml'),
        (u'CinéObs', u'http://cinema.nouvelobs.com/articles.rss'),
        (u'TéléObs', u'http://teleobs.nouvelobs.com/rss.xml'),
        (u'Autres Actualités', u'http://tempsreel.nouvelobs.com/rss.xml'),
    ]
    keep_only_tags = [
        dict(name='h1', attrs={'id':'obs-article-title'}),
        dict(name='div', attrs={'class':'obs-date'}),
        dict(name='div', attrs={'class':'art-auteur'}),
        dict(name='h2', attrs={'class':'obs-article-intro'}),
        dict(name='div', attrs={'id':'obs-article-keywords'}),
        dict(name='div', attrs={'id':'obs-article-mainpic'}),
        dict(name='div', attrs={'itemprop':'articleBody'}),
        dict(name='img', attrs={'id':'ObsImg'}),
        dict(name='p', attrs={'class':'date-media'}),
        dict(name='p', attrs={'id':'ObsDesc'}),
    ]
@ -21,42 +21,10 @@ class Liberation(BasicNewsRecipe):
     max_articles_per_feed = 15
     no_stylesheets = True
     remove_empty_feeds = True
-    filterDuplicates = True
+    needs_subscription = 'optional'

-    extra_css = '''
-        h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
-        p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
-        h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
-        .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
-        .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
-    '''
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class':'article'})
-        ,dict(name='div', attrs={'class':'text-article m-bot-s1'})
-        ,dict(name='div', attrs={'class':'entry'})
-        ,dict(name='div', attrs={'class':'col_contenu'})
-    ]
-
-    remove_tags_after = [
-        dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
-        ,dict(name='p',attrs={'class':['chapo']})
-        ,dict(id='_twitter_facebook')
-    ]
-
-    remove_tags = [
-        dict(name='iframe')
-        ,dict(name='a', attrs={'class':'lnk-comments'})
-        ,dict(name='div', attrs={'class':'toolbox'})
-        ,dict(name='ul', attrs={'class':'share-box'})
-        ,dict(name='ul', attrs={'class':'tool-box'})
-        ,dict(name='ul', attrs={'class':'rub'})
-        ,dict(name='p',attrs={'class':['chapo']})
-        ,dict(name='p',attrs={'class':['tag']})
-        ,dict(name='div',attrs={'class':['blokLies']})
-        ,dict(name='div',attrs={'class':['alire']})
-        ,dict(id='_twitter_facebook')
-    ]
+    keep_only_tags = [dict(name='article')]
+    remove_tags = [dict(attrs={'class':['tool-bar']})]

     feeds = [
         (u'La une', u'http://rss.liberation.fr/rss/9/')
@ -69,6 +37,16 @@ class Liberation(BasicNewsRecipe):
         ,(u'Sports', u'http://www.liberation.fr/rss/12/')
     ]

+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('http://token.liberation.fr/accounts/login/')
+            br.select_form(nr=0)
+            br['email'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
     def get_masthead_url(self):
         masthead = 'http://s0.libe.com/libe/img/common/logo-liberation-150.png'
         br = BasicNewsRecipe.get_browser(self)
@ -78,3 +56,15 @@ class Liberation(BasicNewsRecipe):
             self.log("\nCover unavailable")
             masthead = None
         return masthead
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        url = url.split('/')[-2]
+        encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+                    '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
+                    'www.', '0I': '_'}
+        for k, v in encoding.iteritems():
+            url = url.replace(k, v)
+        return url.partition('?')[0]
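A sketch of what the new get_article_url undoes (the token below is invented for illustration; the substitution mapping is the recipe's own):

    token = '0L0Sliberation0Bfr0Cjour'
    for k, v in {'0L': 'http://', '0S': 'www.', '0B': '.', '0C': '/'}.items():
        token = token.replace(k, v)
    # token is now 'http://www.liberation.fr/jour'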
@ -1,103 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
'''
liberation.fr
'''
# The cleaning is from the Liberation recipe, by Darko Miletic

from calibre.web.feeds.news import BasicNewsRecipe

class Liberation(BasicNewsRecipe):

    title = u'Libération: Édition abonnés'
    __author__ = 'Rémi Vanicat'
    description = u'Actualités'
    category = 'Actualités, France, Monde'
    language = 'fr'
    needs_subscription = True

    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    extra_css = '''
        h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
        p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
        h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
        .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
    '''

    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
        ,dict(name='div', attrs={'class':'text-article m-bot-s1'})
        ,dict(name='div', attrs={'class':'entry'})
        ,dict(name='div', attrs={'class':'col_contenu'})
    ]

    remove_tags_after = [
        dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(id='_twitter_facebook')
    ]

    remove_tags = [
        dict(name='iframe')
        ,dict(name='a', attrs={'class':'lnk-comments'})
        ,dict(name='div', attrs={'class':'toolbox'})
        ,dict(name='ul', attrs={'class':'share-box'})
        ,dict(name='ul', attrs={'class':'tool-box'})
        ,dict(name='ul', attrs={'class':'rub'})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(name='p',attrs={'class':['tag']})
        ,dict(name='div',attrs={'class':['blokLies']})
        ,dict(name='div',attrs={'class':['alire']})
        ,dict(id='_twitter_facebook')
    ]

    index = 'http://www.liberation.fr/abonnes/'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.liberation.fr/jogger/login/')
            br.select_form(nr=0)
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        soup = self.index_to_soup(self.index)

        content = soup.find('div', { 'class':'block-content' })

        articles = []
        cat_articles = []

        for tag in content.findAll(recursive=False):
            if(tag['class']=='headrest headrest-basic-rounded'):
                cat_articles = []
                articles.append((tag.find('h5').contents[0],cat_articles))
            else:
                title = tag.find('h3').contents[0]
                url = tag.find('a')['href']
                print(url)
                descripion = tag.find('p',{ 'class':'subtitle' }).contents[0]
                article = {
                    'title': title,
                    'url': url,
                    'descripion': descripion,
                    'content': ''
                }
                cat_articles.append(article)
        return articles

# Local Variables:
# mode: python
# End:
@ -1,23 +1,30 @@
 # vim:fileencoding=UTF-8
 from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe
+import re

 class AListApart (BasicNewsRecipe):
     __author__ = 'Marc Busqué <marc@lamarciana.com>'
     __url__ = 'http://www.lamarciana.com'
-    __version__ = '2.0'
+    __version__ = '2.0.1'
     __license__ = 'GPL v3'
     __copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
     title = u'A List Apart'
     description = u'A List Apart Magazine (ISSN: 1534-0295) explores the design, development, and meaning of web content, with a special focus on web standards and best practices. This recipe retrieves articles and columns.'
     language = 'en'
     tags = 'web development, software'
     oldest_article = 120
     remove_empty_feeds = True
     encoding = 'utf8'
     cover_url = u'http://alistapart.com/pix/alalogo.gif'
     extra_css = u'img {max-width: 100%; display: block; margin: auto;}'

     feeds = [
         (u'A List Apart', u'http://feeds.feedburner.com/alistapart/abridged'),
     ]
+
+    def image_url_processor(self, baseurl, url):
+        if re.findall('alistapart\.com', url):
+            return 'http:'+url
+        else:
+            return url
@ -1,16 +0,0 @@
from calibre.web.feeds.news import CalibrePeriodical

class LivingDigital(CalibrePeriodical):

    title = 'Living Digital'
    calibre_periodicals_slug = 'living-digital'

    description = '''
    Catch the latest buzz in the digital world with Living Digital. Enjoy
    reviews, news, features and recommendations on a wide range of consumer
    technology products - from smartphones to flat panel TVs, netbooks to
    cameras, and many more consumer lifestyle gadgets. To subscribe, visit
    <a href="http://news.calibre-ebook.com/periodical/living-digital">calibre
    Periodicals</a>.
    '''
    language = 'en_IN'
14
recipes/ludwig_mises.recipe
Normal file
@ -0,0 +1,14 @@
from calibre.web.feeds.news import AutomaticNewsRecipe

class BasicUserRecipe1373130372(AutomaticNewsRecipe):
    title = u'Ludwig von Mises Institute'
    __author__ = 'anywho'
    language = 'en'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [(u'Daily Articles (Full text version)',
              u'http://feed.mises.org/MisesFullTextArticles'),
             (u'Mises Blog Posts',
              u'http://mises.org/blog/index.rdf')]
@ -1,3 +1,6 @@
+# -*- mode:python -*-
+from __future__ import unicode_literals
+
 __license__ = 'GPL v3'
 __copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
 '''
@ -6,57 +9,164 @@ Mediapart

 __author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+import re
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds import feeds_from_index
+from datetime import date,timedelta

 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
     __author__ = 'Mathieu Godlewski, Louis Gesbert'
     description = 'Global news in french from news site Mediapart'
-    oldest_article = 7
+    publication_type = 'newspaper'
     language = 'fr'
     needs_subscription = True
-    max_articles_per_feed = 50
+    oldest_article = 2

     use_embedded_content = False
     no_stylesheets = True

-    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
+    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
+
+    # --
+    oldest_article_date = date.today() - timedelta(days=oldest_article)
+
+    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
+    # the 10 last elements :/)

     feeds = [
-        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
+        ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]

+    def parse_feeds(self):
+        feeds = super(Mediapart, self).parse_feeds()
+        feeds += feeds_from_index(self.my_parse_index(feeds))
+        return feeds
+
+    def my_parse_index(self, la_une):
+        articles = []
+
+        breves = []
+        liens = []
+        confidentiels = []
+
+        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
+        page = soup.find('div', {'id':'pageFirstContent'})
+        fils = page.find('div', {'class':re.compile(r'\bcontent-journal\b')})
+
+        for article in fils.findAll('div'):
+            try:
+                title = article.find('h2',recursive=False)
+                if title is None or title['class'] == 'title-specific':
+                    continue
+
+                # print "found fil ",title
+                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
+                # print "kind: ",article_type
+
+                for s in title('span'):
+                    s.replaceWith(s.renderContents() + "\n")
+                url = title.find('a', href=True)['href']
+
+                article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
+
+                if article_date < self.oldest_article_date:
+                    # print "too old"
+                    continue
+
+                authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
+                authors = [self.tag_to_string(a) for a in authors]
+
+                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
+
+                # print "fil ",title," by ",authors," : ",description
+
+                summary = {
+                    'title': self.tag_to_string(title).strip(),
+                    'author': ', '.join(authors),
+                    'url': url,
+                    'date': u'' + article_date.strftime("%A %d %b %Y"),
+                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
+                }
+                {
+                    "Brève": breves,
+                    "Lien": liens,
+                    "Confidentiel": confidentiels,
+                }.get(article_type).append(summary)
+            except:
+                pass
+
+        # print 'La Une: ', len(la_une), ' articles'
+        # for a in la_une: print a["title"]
+        # print 'Brèves: ', len(breves), ' articles'
+        # print 'Revue web: ', len(liens), ' articles'
+        # print 'Confidentiel: ', len(confidentiels), ' articles'
+
+        articles += [('Brèves', breves)] if breves else []
+        articles += [('Revue du Web', liens)] if liens else []
+        articles += [('Confidentiel', confidentiels)] if confidentiels else []
+        return articles
+
     # -- print-version

-    conversion_options = { 'smarten_punctuation' : True }
+    conversion_options = {'smarten_punctuation' : True}

-    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]
+    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]
+
+    # non-locale specific date parse (strptime("%d %b %Y",s) would work with a french locale)
+    def parse_french_date(self, date_str):
+        date_arr = date_str.lower().split()
+        return date(day=int(date_arr[0]),
+                    year=int(date_arr[2]),
+                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
+                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
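Because parse_french_date simply indexes the list of French month names, a call such as parse_french_date('25 octobre 2013') returns datetime.date(2013, 10, 25) without ever touching the process locale.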
     def print_version(self, url):
         raw = self.browser.open(url).read()
         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        link = soup.find('a', {'title':'Imprimer'})
-        if link is None:
+
+        # Filter old articles
+        article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
+        if article_date < self.oldest_article_date:
             return None
-        return link['href']
+
+        tools = soup.find('div', {'class':'menu-tools'})
+        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
+        if link is None:
+            print 'Error: print link not found'
+            return None
+        return 'https://mediapart.fr/' + link['href']

     # -- Handle login
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://www.mediapart.fr/')
-            br.select_form(nr=0)
+            br.open('https://www.mediapart.fr/user')
+            br.select_form(nr=1)
             br['name'] = self.username
             br['pass'] = self.password
             br.submit()
         return br

-    def preprocess_html(self, soup):
-        for title in soup.findAll('p', {'class':'titre_page'}):
-            title.name = 'h3'
-        for legend in soup.findAll('span', {'class':'legend'}):
-            legend.insert(0, Tag(soup, 'br', []))
-            legend.name = 'small'
-        return soup
+    # This is a workaround for articles with embedded scribd content, which
+    # include <body></body> tags _within_ the body
+    preprocess_regexps = [
+        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
+         lambda match:
+            match.group(1)
+            + re.sub(re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'',
+                     match.group(2))
+            + '</body>')
+    ]
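On a toy input such as '<body><p>a</p><body>x</body><p>b</p></body>', the workaround regex keeps the opening tag as group(1), strips every stray <body>/</body> from the greedy group(2), and re-appends the closing tag, giving '<body><p>a</p>x<p>b</p></body>'.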
+    # def preprocess_html(self, soup):
+    #     for title in soup.findAll('p', {'class':'titre_page'}):
+    #         title.name = 'h3'
+    #     for legend in soup.findAll('span', {'class':'legend'}):
+    #         legend.insert(0, Tag(soup, 'br', []))
+    #         legend.name = 'em'
+    #     return soup
@ -7,71 +7,75 @@ import time
 class AdvancedUserRecipe1306097511(BasicNewsRecipe):
     title = u'Metro UK'
     description = 'News from The Metro, UK'
-    #timefmt = ''
-    __author__ = 'Dave Asbury'
-    #last update 4/4/13
-    #cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'

     cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
     remove_empty_feeds = True
     remove_javascript = True
+    no_stylesheets = True
     auto_cleanup = True
     max_articles_per_feed = 12
     ignore_duplicate_articles = {'title', 'url'}
-    encoding = 'UTF-8'
+    #encoding = 'UTF-8'

     language = 'en_GB'
     masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
     compress_news_images = True
+    compress_news_images_max_size = 30
+    remove_attributes = ['style', 'font']
+    preprocess_regexps = [
+        (re.compile(r'\| Metro News', re.IGNORECASE | re.DOTALL), lambda match: ''),
+    ]

     def parse_index(self):
         articles = {}
         key = None
         ans = []
-        feeds = [ ('UK', 'http://metro.co.uk/news/uk/'),
+        feeds = [('UK', 'http://metro.co.uk/news/uk/'),
                  ('World', 'http://metro.co.uk/news/world/'),
                  ('Weird', 'http://metro.co.uk/news/weird/'),
                  ('Money', 'http://metro.co.uk/news/money/'),
                  ('Sport', 'http://metro.co.uk/sport/'),
                  ('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
                  ]
         for key, feed in feeds:
             soup = self.index_to_soup(feed)
             articles[key] = []
             ans.append(key)

             today = datetime.date.today()
             today = time.mktime(today.timetuple())-60*60*24

             for a in soup.findAll('a'):
                 for name, value in a.attrs:
                     if name == "class" and value=="post":
                         url = a['href']
                         title = a['title']
                         print title
                         description = ''
                         m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
                         skip = 1
                         if len(m.groups()) == 3:
                             g = m.groups()
                             dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
                             pubdate = time.strftime('%a, %d %b', dt.timetuple())

                             dt = time.mktime(dt.timetuple())
                             if dt >= today:
                                 print pubdate
                                 skip = 0
                         else:
                             pubdate = strftime('%a, %d %b')

                         summary = a.find(True, attrs={'class':'excerpt'})
                         if summary:
                             description = self.tag_to_string(summary, use_alt=False)

                         if skip == 0:
                             articles[key].append(
                                 dict(title=title, url=url, date=pubdate,
                                      description=description,
                                      content=''))
         #ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        ans = [(key, articles[key]) for key in ans if key in articles]
         return ans
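Note the cutoff in parse_index above: time.mktime(today.timetuple()) - 60*60*24 is the Unix timestamp for the start of the previous day, so only articles whose URL encodes yesterday's or today's date clear the skip flag.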
@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010-2011, Eddie Lau'
+__copyright__ = '2010-2013, Eddie Lau'

 # Region - Hong Kong, Vancouver, Toronto
 __Region__ = 'Hong Kong'
@ -32,6 +32,7 @@ __Date__ = ''

 '''
 Change Log:
+2013/09/28: allow thumbnails even with hi-res images
 2012/04/24: improved parsing of news.mingpao.com content
 2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
             from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
@ -846,8 +847,7 @@ class MPRecipe(BasicNewsRecipe):
         return soup

     def populate_article_metadata(self, article, soup, first):
-        # thumbnails shouldn't be available if using hi-res images
-        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
+        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
             img = soup.find('img')
             if img is not None:
                 self.add_toc_thumbnail(article, img['src'])
@ -1071,3 +1071,4 @@ class MPRecipe(BasicNewsRecipe):
@ -1,18 +1,15 @@
-#!/usr/bin/env python
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2013, Darko Miletic <darko.miletic at gmail.com>'
 '''
-elargentino.com
+sur.infonews.com
 '''

+import datetime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class MiradasAlSur(BasicNewsRecipe):
     title = 'Miradas al Sur'
     __author__ = 'Darko Miletic'
-    description = 'Revista Argentina'
+    description = 'Semanario Argentino'
     publisher = 'ElArgentino.com'
     category = 'news, politics, Argentina'
     oldest_article = 7
@ -20,53 +17,51 @@ class MiradasAlSur(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = False
     encoding = 'utf-8'
     language = 'es_AR'
+    remove_empty_feeds = True
+    masthead_url = 'http://sur.infonews.com/sites/default/files/www_miradas_al_sur_com_logo.gif'
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif}
+        h1{font-family: Georgia,Times,serif}
+        .field-field-story-author{color: gray; font-size: small}
+    """
+    conversion_options = {
+        'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+        , 'series'    : title
+    }

-    lang = 'es-AR'
-    direction = 'ltr'
-    INDEX = 'http://www.elargentino.com/medios/123/Miradas-al-Sur.html'
-    extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
-
-    html2lrf_options = [
-        '--comment' , description
-        , '--category' , category
-        , '--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
-
-    keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
-
-    remove_tags = [dict(name='link')]
-
-    feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=123&Content-Type=text/xml&ChannelDesc=Miradas%20al%20Sur')]
-
-    def print_version(self, url):
-        main, sep, article_part = url.partition('/nota-')
-        article_id, rsep, rrest = article_part.partition('-')
-        return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        soup.html['lang'] = self.lang
-        soup.html['dir' ] = self.direction
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
-        return soup
+    keep_only_tags = [dict(name='div', attrs={'id':['content-header', 'content-area']})]
+    remove_tags = [
+        dict(name=['link','meta','iframe','embed','object']),
+        dict(name='form', attrs={'class':'fivestar-widget'}),
+        dict(attrs={'class':lambda x: x and 'terms-inline' in x.split()})
+    ]
+
+    feeds = [
+        (u'Politica'           , u'http://sur.infonews.com/taxonomy/term/1/0/feed'),
+        (u'Internacional'      , u'http://sur.infonews.com/taxonomy/term/2/0/feed'),
+        (u'Informe Especial'   , u'http://sur.infonews.com/taxonomy/term/14/0/feed'),
+        (u'Delitos y pesquisas', u'http://sur.infonews.com/taxonomy/term/6/0/feed'),
+        (u'Lesa Humanidad'     , u'http://sur.infonews.com/taxonomy/term/7/0/feed'),
+        (u'Cultura'            , u'http://sur.infonews.com/taxonomy/term/8/0/feed'),
+        (u'Deportes'           , u'http://sur.infonews.com/taxonomy/term/9/0/feed'),
+        (u'Contratapa'         , u'http://sur.infonews.com/taxonomy/term/10/0/feed'),
+    ]

     def get_cover_url(self):
+        # determine the series number, unfortunately not gonna happen now
+        #self.conversion_options.update({'series_index':seriesnr})
         cover_url = None
-        soup = self.index_to_soup(self.INDEX)
-        cover_item = soup.find('div',attrs={'class':'colder'})
+        cdate = datetime.date.today()
+        todayweekday = cdate.isoweekday()
+        if (todayweekday != 7):
+            cdate -= datetime.timedelta(days=todayweekday)
+        cover_page_url = cdate.strftime('http://sur.infonews.com/ediciones/%Y-%m-%d/tapa')
+        soup = self.index_to_soup(cover_page_url)
+        cover_item = soup.find('img', attrs={'class':lambda x: x and 'imagecache-tapa_edicion_full' in x.split()})
         if cover_item:
-            clean_url = self.image_url_processor(None,cover_item.div.img['src'])
-            cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
+            cover_url = cover_item['src']
         return cover_url
-
-    def image_url_processor(self, baseurl, url):
-        base, sep, rest = url.rpartition('?Id=')
-        img, sep2, rrest = rest.partition('&')
-        return base + sep + img
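Miradas al Sur appears to be a Sunday weekly, so the rewritten get_cover_url rolls the date back to the most recent Sunday (isoweekday() is 7 on Sundays) before building the cover URL; on Wednesday 2013-10-23, for example, it would fetch http://sur.infonews.com/ediciones/2013-10-20/tapa.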
@ -1,46 +1,49 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe

-class NatGeoMag(BasicNewsRecipe):
-    title = 'National Geographic Mag'
-    __author__ = 'Terminal Veracity'
-    description = 'The National Geographic Magazine'
-    publisher = 'National Geographic'
-    oldest_article = 31
-    max_articles_per_feed = 50
-    category = 'geography, magazine'
-    language = 'en'
-    publication_type = 'magazine'
-    cover_url = 'http://www.yourlogoresources.com/wp-content/uploads/2011/09/national-geographic-logo.jpg'
-    use_embedded_content = False
-    no_stylesheets = True
-    remove_javascript = True
-    recursions = 1
-    remove_empty_feeds = True
-    feeds = [('National Geographic Magazine', 'http://feeds.nationalgeographic.com/ng/NGM/NGM_Magazine')]
-    remove_tags = [dict(name='div', attrs={'class':['nextpage_continue', 'subscribe']})]
-    keep_only_tags = [dict(attrs={'class':'main_3narrow'})]
-    extra_css = """
-        h1 {font-size: large; font-weight: bold; margin: .5em 0; }
-        h2 {font-size: large; font-weight: bold; margin: .5em 0; }
-        h3 {font-size: medium; font-weight: bold; margin: 0 0; }
-        .article_credits_author {font-size: small; font-style: italic; }
-        .article_credits_photographer {font-size: small; font-style: italic; display: inline }
-        """
-
-    def parse_feeds(self):
-        feeds = BasicNewsRecipe.parse_feeds(self)
-        for feed in feeds:
-            for article in feed.articles[:]:
-                if 'Flashback' in article.title:
-                    feed.articles.remove(article)
-                elif 'Desktop Wallpaper' in article.title:
-                    feed.articles.remove(article)
-                elif 'Visions of Earth' in article.title:
-                    feed.articles.remove(article)
-                elif 'Your Shot' in article.title:
-                    feed.articles.remove(article)
-                elif 'MyShot' in article.title:
-                    feed.articles.remove(article)
-                elif 'Field Test' in article.title:
-                    feed.articles.remove(article)
-        return feeds
+class NGM(BasicNewsRecipe):
+    title = 'National Geographic Magazine'
+    __author__ = 'Krittika Goyal'
+    description = 'National Geographic Magazine'
+    timefmt = ' [%d %b, %Y]'
+
+    no_stylesheets = True
+    auto_cleanup = True
+    auto_cleanup_keep = '//div[@class="featurepic"]'
+
+    def nejm_get_index(self):
+        return self.index_to_soup('http://ngm.nationalgeographic.com/2013/10/table-of-contents')
+
+    # To parse the article TOC
+    def parse_index(self):
+        soup = self.nejm_get_index()
+        tocfull = soup.find('div', attrs={'class':'coltoc'})
+        toc = tocfull.find('div', attrs={'class':'more_section'})
+
+        articles = []
+        feeds = []
+        section_title = 'Features'
+        for x in toc.findAll(True):
+            if x.name == 'a':
+                # Article found
+                title = self.tag_to_string(x)
+                url = x.get('href', False)
+                if not url or not title:
+                    continue
+                url = 'http://ngm.nationalgeographic.com' + url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                articles.append({'title': title, 'url':url,
+                                 'description':'', 'date':''})
+        feeds.append((section_title, articles))
+
+        art1 = tocfull.findAll('a')[1]
+        art1_title = self.tag_to_string(art1.find('div', attrs={'class': 'toched'}))
+        art1_url = art1.get('href', False)
+        art1_url = 'http://ngm.nationalgeographic.com' + art1_url
+        art1feed = {'title': art1_title, 'url':art1_url,
+                    'description':'', 'date':''}
+        feeds.append(('Cover Story', [art1feed]))
+
+        return feeds
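The rewritten NGM recipe drops the RSS-feed-plus-title-blacklist approach entirely: parse_index returns (section, article-list) pairs built from the issue's table-of-contents page, with every link in the 'more_section' div collected under 'Features' and the TOC's second anchor promoted to a one-article 'Cover Story' section.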
@@ -1,49 +1,108 @@
 # vim:fileencoding=utf-8
 from calibre.web.feeds.news import BasicNewsRecipe

-class AdvancedUserRecipe1344926684(BasicNewsRecipe):
+class AdvancedUserRecipe1380105782(BasicNewsRecipe):
     title = u'Neue Osnabrücker Zeitung'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 7
-    max_articles_per_feed = 100
-    # auto_cleanup = True
-    no_stylesheets = True
-    use_embedded_content = False
-    language = 'de'
+    __author__ = 'vo_he'
+    description = 'Online auch ohne IPhone'
+    encoding = 'utf-8'
+    language = 'de'
     remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    cover_url = 'http://www.noz.de/bundles/nozplatform/images/logos/osnabruecker-zeitung.png'
+
+    remove_tags_before = dict(id='feedContent')
+    remove_tags_before = dict(id='headline')
+    remove_tags_after = dict(id='article-authorbox')
+    remove_tags_after = dict(id='footer-start')
-    remove_tags_after = dict(name='div', attrs={'class':'morelinks'})
-
-    keep_only_tags = [
-        dict(name='div', attrs={'class':'article'}),
-        dict(name='span', attrs={'id':'articletext'})
-    ]
     remove_tags = [
-        dict(name='div', attrs={'id':'retresco-title'}),
-        dict(name='div', attrs={'class':'retresco-item s1 relative'}),
-        dict(name='a', attrs={'class':'medium2 largeSpaceTop icon'}),
-        dict(name='div', attrs={'class':'articleFunctions inlineTeaserRight'}),
-        dict(name='div', attrs={'class':'imageContainer '}),
-        dict(name='div', attrs={'class':'imageContainer centerContainer'}),
-        dict(name='div', attrs={'class':'grid singleCol articleTeaser'}),
-        dict(name='h3', attrs={'class':'teaserRow'}),
-        dict(name='div', attrs={'class':'related-comments'}),
-        dict(name='a', attrs={'class':' icon'}),
-        dict(name='a', attrs={'class':'right small'}),
-        dict(name='span', attrs={'class':'small block spaceBottom rectangleAd'}),
+        dict(name='div', attrs={'id':'ui-datepicker-div'}),
+        dict(name='div', attrs={'class':'nav-second'}),
+        dict(name='div', attrs={'class':'nav-first'}),
+        dict(name='div', attrs={'class':'icon-print'}),
+        dict(name='div', attrs={'class':'social-button'}),
+        dict(name='div', attrs={'class':'social-media-bar'}),
+        dict(name='div', attrs={'class':'pull-right'}),
+        dict(name='div', attrs={'class':'btn btn-primary flat-button'}),
+        dict(name='div', attrs={'class':'carousel-wrapper'}),
+        dict(name='a', attrs={'class':'right-content merchandising hidden-tablet'}),
+        dict(name='div', attrs={'class':'border-circle pull-left'}),
+        dict(name='div', attrs={'class':'row show-grid general-infoimageContainer '}),
+        dict(name='div', attrs={'class':'location-list'}),
+        dict(name='div', attrs={'class':'block'}),
         dict(name='div', attrs={'class':'furtherGalleries largeSpaceTop'})
     ]

-    feeds = [(u'Lokales', u'http://www.noz.de/rss/Lokales'),
-             (u'Vermischtes', u'http://www.noz.de/rss/Vermischtes'),
-             (u'Politik', u'http://www.noz.de/rss/Politik'),
-             (u'Wirtschaft', u'http://www.noz.de/rss/Wirtschaft'),
-             (u'Kultur', u'http://www.noz.de/rss/Kultur'),
-             (u'Medien', u'http://www.noz.de/rss/Medien'),
-             (u'Wissenschaft', u'http://www.noz.de/rss/wissenschaft'),
-             (u'Sport', u'http://www.noz.de/rss/Sport'),
-             (u'Computer', u'http://www.noz.de/rss/Computer'),
-             (u'Musik', u'http://www.noz.de/rss/Musik'),
-             (u'Szene', u'http://www.noz.de/rss/Szene'),
-             (u'Niedersachsen', u'http://www.noz.de/rss/Niedersachsen'),
-             (u'Kino', u'http://www.noz.de/rss/Kino')]
+    feeds = [(u'Melle Mitte', u'http://www.noz.de/rss/ressort/Melle%20Mitte'),
+             (u'Melle Nord', u'http://www.noz.de/rss/ressort/Melle%20Nord'),
+             (u'Melle Sued', u'http://www.noz.de/rss/ressort/Melle%20S%C3%BCd'),
+             (u'Nordrhein Westfalen', u'http://www.noz.de/rss/ressort/Nordrhein-Westfalen'),
+             (u'Niedersachsen', u'http://www.noz.de/rss/ressort/Niedersachsen'),
+             (u'Vermischtes', u'http://www.noz.de/rss/ressort/Vermischtes'),
+             (u'GutzuWissen', u'http://www.noz.de/rss/ressort/Gut%20zu%20Wissen'),
+             (u'Sport', u'http://www.noz.de/rss/ressort/Sport'),
+             (u'Kultur', u'http://www.noz.de/rss/ressort/Kultur'),
+             (u'Medien', u'http://www.noz.de/rss/ressort/Medien'),
+             (u'Belm', u'http://www.noz.de/rss/ressort/Belm'),
+             (u'Bissendorf', u'http://www.noz.de/rss/ressort/Bissendorf'),
+             (u'Osnabrueck', u'http://www.noz.de/rss/ressort/Osnabr%C3%BCck'),
+             (u'Bad Essen', u'http://www.noz.de/rss/ressort/Bad%20Essen'),
+             (u'Politik', u'http://www.noz.de/rss/ressort/Politik'),
+             (u'Wirtschaft', u'http://www.noz.de/rss/ressort/Wirtschaft'),
+             #(u'Fussball', u'http://www.noz.de/rss/ressort/Fußball'),
+             #(u'VfL Osnabrueck', u'http://www.noz.de/rss/ressort/VfL%20Osnabr%C3%BCck'),
+             #(u'SF Lotte', u'http://www.noz.de/rss/ressort/SF%20Lotte'),
+             #(u'SV Meppen', u'http://www.noz.de/rss/ressort/SV%20Meppen'),
+             #(u'Artland Dragons', u'http://www.noz.de/rss/ressort/Artland%20Dragons'),
+             #(u'Panthers', u'http://www.noz.de/rss/ressort/Panthers'),
+             (u'OS-Sport', u'http://www.noz.de/rss/ressort/OS-Sport'),
+             #(u'Emsland Sport', u'http://www.noz.de/rss/ressort/EL-Sport'),
+             #(u'Lingen', u'http://www.noz.de/rss/ressort/Lingen'),
+             #(u'Lohne', u'http://www.noz.de/rss/ressort/Lohne'),
+             #(u'Emsbueren', u'http://www.noz.de/rss/ressort/Emsb%C3%BCren'),
+             #(u'Salzbergen', u'http://www.noz.de/rss/ressort/Salzbergen'),
+             #(u'Spelle', u'http://www.noz.de/rss/ressort/Spelle'),
+             #(u'Freren', u'http://www.noz.de/rss/ressort/Freren'),
+             #(u'Lengerich', u'http://www.noz.de/rss/ressort/Lengerich'),
+             #(u'Bad Iburg', u'http://www.noz.de/rss/ressort/Bad%20Iburg'),
+             #(u'Bad Laer', u'http://www.noz.de/rss/ressort/Bad%20Laer'),
+             #(u'Bad Rothenfelde', u'http://www.noz.de/rss/ressort/Bad%20Rothenfelde'),
+             #(u'GMHütte', u'http://www.noz.de/rss/ressort/Georgsmarienh%C3%BCtte'),
+             #(u'Glandorf', u'http://www.noz.de/rss/ressort/Glandorf'),
+             #(u'Hagen', u'http://www.noz.de/rss/ressort/Hagen'),
+             #(u'Hasbergen', u'http://www.noz.de/rss/ressort/Hasbergen'),
+             #(u'Hilter', u'http://www.noz.de/rss/ressort/Hilter'),
+             #(u'Lotte', u'http://www.noz.de/rss/ressort/Lotte'),
+             #(u'Wallenhorst', u'http://www.noz.de/rss/ressort/Wallenhorst'),
+             #(u'Westerkappeln', u'http://www.noz.de/rss/ressort/Westerkappeln'),
+             #(u'Artland', u'http://www.noz.de/rss/ressort/Artland'),
+             #(u'Bersenbrück', u'http://www.noz.de/rss/ressort/Bersenbr%C3%BCck'),
+             #(u'Fürstenau', u'http://www.noz.de/rss/ressort/F%C3%BCrstenau'),
+             #(u'Neuenkirchen', u'http://www.noz.de/rss/ressort/Neuenkirchen'),
+             #(u'Lokalsport', u'http://www.noz.de/rss/ressort/Lokalsport%20Nordkreis'),
+             #(u'Bramsche', u'http://www.noz.de/rss/ressort/Bramsche'),
+             #(u'Bramsche Ortsteile', u'http://www.noz.de/rss/ressort/Bramscher%20Ortsteile'),
+             #(u'Neuenkirchen Vörden', u'http://www.noz.de/rss/ressort/Neuenkirchen-V%C3%B6rden'),
+             #(u'Papenburg', u'http://www.noz.de/rss/ressort/Papenburg'),
+             #(u'Dörpen', u'http://www.noz.de/rss/ressort/D%C3%B6rpen'),
+             #(u'Rhede', u'http://www.noz.de/rss/ressort/Rhede'),
+             #(u'Lathen', u'http://www.noz.de/rss/ressort/Lathen'),
+             #(u'Sögel', u'http://www.noz.de/rss/ressort/S%C3%B6gel'),
+             #(u'Nordhümmling', u'http://www.noz.de/rss/ressort/Nordh%C3%BCmmling'),
+             #(u'Werlte', u'http://www.noz.de/rss/ressort/Werlte'),
+             #(u'Westoverledingen', u'http://www.noz.de/rss/ressort/Westoverledingen'),
+             #(u'Geeste', u'http://www.noz.de/rss/ressort/Geeste'),
+             #(u'Haren', u'http://www.noz.de/rss/ressort/Haren'),
+             #(u'Haselünne', u'http://www.noz.de/rss/ressort/Hasel%C3%BCnne'),
+             #(u'Herzlake', u'http://www.noz.de/rss/ressort/Herzlake'),
+             #(u'Meppen', u'http://www.noz.de/rss/ressort/Meppen'),
+             #(u'Twist', u'http://www.noz.de/rss/ressort/Twist'),
+             #(u'Bohmte', u'http://www.noz.de/rss/ressort/Bohmte'),
+             #(u'Ostercappeln', u'http://www.noz.de/rss/ressort/Ostercappeln')
+             ]
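The remove_tags entries above are BeautifulSoup-style matchers: each dict(name=..., attrs=...) describes tags to strip from the fetched page before conversion. A minimal standalone illustration of the same matching idea, written against the separately installable bs4 package (an assumption made for illustration; calibre bundles its own copy of BeautifulSoup and does the deletion itself):

    from bs4 import BeautifulSoup  # stand-in for calibre's bundled parser

    html = '<div class="nav-first">menu</div><div class="article">text</div>'
    soup = BeautifulSoup(html, 'html.parser')

    # Same spirit as remove_tags = [dict(name='div', attrs={'class':'nav-first'})]
    for tag in soup.find_all('div', attrs={'class': 'nav-first'}):
        tag.decompose()  # delete the tag and everything inside it

    print soup  # <div class="article">text</div>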
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
@@ -11,6 +10,9 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe

+def find_header(tag):
+    return tag.name == 'header' and tag.parent['class'] == 'article'
+
 class NewYorkReviewOfBooks(BasicNewsRecipe):

     title = u'New York Review of Books'
@@ -23,65 +25,70 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
     no_javascript = True
     needs_subscription = True

-    keep_only_tags = [dict(id=['article-body','page-title'])]
-    remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
-        'center advertisement']})]
+    keep_only_tags = [
+        dict(name='section', attrs={'class':'article_body'}),
+        dict(name=find_header),
+        dict(name='div', attrs={'class':'for-subscribers-only'}),
+    ]

     preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
         m:'<head></head>')]

+    def print_version(self, url):
+        return url+'?pagination=false'
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open('http://www.nybooks.com/account/signin/')
-        br.select_form(nr = 1)
+        br.select_form(nr=2)
         br['username'] = self.username
         br['password'] = self.password
         br.submit()
         return br

-    def print_version(self, url):
-        return url+'?pagination=false'
+    def preprocess_html(self, soup):
+        header = soup.find('header')
+        body = soup.find('body')
+        body.insert(0, header)
+        header.find('div', attrs={'class':'details'}).extract()
+        for i in soup.findAll('input'):
+            i.extract()
+        return soup

     def parse_index(self):
         soup = self.index_to_soup('http://www.nybooks.com/current-issue')

         # Find cover
-        sidebar = soup.find(id='sidebar')
+        sidebar = soup.find('div', attrs={'class':'issue_cover'})
         if sidebar is not None:
-            a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
-            if a is not None:
-                psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
-                cover = psoup.find('img', src=True)
-                self.cover_url = cover['src']
-                self.log('Found cover at:', self.cover_url)
+            img = sidebar.find('img', src=True)
+            self.cover_url = 'http://www.nybooks.com' + img['src']
+            self.log('Found cover at:', self.cover_url)

         # Find date
-        div = soup.find(id='page-title')
+        div = soup.find('time', pubdate='pubdate')
         if div is not None:
-            h5 = div.find('h5')
-            if h5 is not None:
-                text = self.tag_to_string(h5)
-                date = text.partition(u'\u2022')[0].strip()
-                self.timefmt = u' [%s]'%date
-                self.log('Issue date:', date)
+            text = self.tag_to_string(div)
+            date = text.partition(u'\u2022')[0].strip()
+            self.timefmt = u' [%s]'%date
+            self.log('Issue date:', date)

         # Find TOC
-        tocs = soup.findAll('ul', attrs={'class':'issue-article-list'})
+        toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'})
         articles = []
-        for toc in tocs:
-            for li in toc.findAll('li'):
-                h3 = li.find('h3')
-                title = self.tag_to_string(h3)
-                author = self.tag_to_string(li.find('h4'))
-                title = title + u' (%s)'%author
-                url = 'http://www.nybooks.com'+h3.find('a', href=True)['href']
-                desc = ''
-                for p in li.findAll('p'):
-                    desc += self.tag_to_string(p)
-                self.log('Found article:', title)
-                self.log('\t', url)
-                self.log('\t', desc)
-                articles.append({'title':title, 'url':url, 'date':'',
-                    'description':desc})
+        for div in toc.findAll('div', attrs={'class':'row'}):
+            h2 = div.find('h2')
+            title = self.tag_to_string(h2).strip()
+            author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip()
+            title = title + u' (%s)'%author
+            url = 'http://www.nybooks.com' + h2.find('a', href=True)['href']
+            desc = ''
+            for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}):
+                desc += self.tag_to_string(p)
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'',
+                'description':desc})

         return [('Current Issue', articles)]
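keep_only_tags normally takes name/attribute dicts, but as find_header above shows, a callable can serve as the matcher too. A small self-contained sketch of the same idea using the standalone bs4 package (an assumption for illustration; calibre ships its own BeautifulSoup, where tag['class'] is a plain string rather than bs4's list of classes):

    from bs4 import BeautifulSoup

    html = '<div class="article"><header><h1>Heading</h1></header></div>'
    soup = BeautifulSoup(html, 'html.parser')

    def find_header(tag):
        # bs4 exposes class as a list of class names, hence the membership test
        return tag.name == 'header' and 'article' in (tag.parent.get('class') or [])

    print soup.find(find_header)  # <header><h1>Heading</h1></header>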
@@ -10,6 +10,9 @@ import re
 from calibre.web.feeds.news import BasicNewsRecipe

+def find_header(tag):
+    return tag.name == 'header' and tag.parent['class'] == 'article'
+
 class NewYorkReviewOfBooks(BasicNewsRecipe):

     title = u'New York Review of Books (no subscription)'
@@ -21,9 +24,11 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
     no_stylesheets = True
     no_javascript = True

-    keep_only_tags = [dict(id=['article-body', 'page-title'])]
-    remove_tags = [dict(attrs={'class':['article-tools', 'article-links',
-        'center advertisement']})]
+    keep_only_tags = [
+        dict(name='section', attrs={'class':'article_body'}),
+        dict(name=find_header),
+        dict(name='div', attrs={'class':'for-subscribers-only'}),
+    ]

     preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
         m:'<head></head>')]
@@ -31,40 +36,44 @@ class NewYorkReviewOfBooks(BasicNewsRecipe):
     def print_version(self, url):
         return url+'?pagination=false'

+    def preprocess_html(self, soup):
+        header = soup.find('header')
+        body = soup.find('body')
+        body.insert(0, header)
+        header.find('div', attrs={'class':'details'}).extract()
+        for i in soup.findAll('input'):
+            i.extract()
+        return soup
+
     def parse_index(self):
         soup = self.index_to_soup('http://www.nybooks.com/current-issue')

         # Find cover
-        sidebar = soup.find(id='sidebar')
+        sidebar = soup.find('div', attrs={'class':'issue_cover'})
         if sidebar is not None:
-            a = sidebar.find('a', href=lambda x: x and 'view-photo' in x)
-            if a is not None:
-                psoup = self.index_to_soup('http://www.nybooks.com'+a['href'])
-                cover = psoup.find('img', src=True)
-                self.cover_url = cover['src']
-                self.log('Found cover at:', self.cover_url)
+            img = sidebar.find('img', src=True)
+            self.cover_url = 'http://www.nybooks.com' + img['src']
+            self.log('Found cover at:', self.cover_url)

         # Find date
-        div = soup.find(id='page-title')
+        div = soup.find('time', pubdate='pubdate')
         if div is not None:
-            h5 = div.find('h5')
-            if h5 is not None:
-                text = self.tag_to_string(h5)
-                date = text.partition(u'\u2022')[0].strip()
-                self.timefmt = u' [%s]'%date
-                self.log('Issue date:', date)
+            text = self.tag_to_string(div)
+            date = text.partition(u'\u2022')[0].strip()
+            self.timefmt = u' [%s]'%date
+            self.log('Issue date:', date)

         # Find TOC
-        toc = soup.find('ul', attrs={'class':'issue-article-list'})
+        toc = soup.find('div', attrs={'class':'current_issue'}).find('div', attrs={'class':'articles_list'})
         articles = []
-        for li in toc.findAll('li'):
-            h3 = li.find('h3')
-            title = self.tag_to_string(h3)
-            author = self.tag_to_string(li.find('h4'))
-            title = title + u' (%s)'%author
-            url = 'http://www.nybooks.com'+h3.find('a', href=True)['href']
-            desc = ''
-            for p in li.findAll('p'):
-                desc += self.tag_to_string(p)
-            self.log('Found article:', title)
-            self.log('\t', url)
+        for div in toc.findAll('div', attrs={'class':'row'}):
+            h2 = div.find('h2')
+            title = self.tag_to_string(h2).strip()
+            author = self.tag_to_string(div.find('div', attrs={'class':'author'})).strip()
+            title = title + u' (%s)'%author
+            url = 'http://www.nybooks.com' + h2.find('a', href=True)['href']
+            desc = ''
+            for p in div.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}):
+                desc += self.tag_to_string(p)
+            self.log('Found article:', title)
+            self.log('\t', url)
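The issue date above is recovered by splitting the <time> element's text at the bullet character. A quick standalone check of that one-liner (the masthead string here is hypothetical):

    # Hypothetical masthead text of the form "<date> • <volume info>"
    text = u'November 7, 2013 \u2022 Volume 60, Number 17'
    date = text.partition(u'\u2022')[0].strip()
    print date  # November 7, 2013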
recipes/news24.recipe (new file, 53 lines)
@@ -0,0 +1,53 @@
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1375900744(BasicNewsRecipe):
    title = u'News24'
    description = "News24."
    __author__ = 'Nicki de Wet'
    publisher = 'Media24'
    category = 'news, politics, South Africa'
    oldest_article = 3
    max_articles_per_feed = 20
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    auto_cleanup = False
    language = 'en_ZA'
    remove_empty_feeds = True
    publication_type = 'newsportal'
    masthead_url = 'http://www.24.com/images/widgethead_news.png'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif }
        img{display: block}
    """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    remove_tags = [
        dict(name=['object', 'embed', 'iframe', 'table', 'meta', 'link']),
        dict(attrs={'class': ['TwitterfacebookLink', 'superSportArticleBlock',
                              'videoHighlights', 'facebookComments', 'share',
                              'item_block', 'kalahari_product left', 'block red',
                              'credit']}),
        dict(attrs={'id': ['comments_wrap', 'article_toolbox_bot',
                           'inside_news', 'sponsored-links', 'lnkGalleries',
                           'relatedlinks_box', 'lnkUserGalleries',
                           'lnkNewsGalleries', 'relatedlinks',
                           'divRelatedLinks']})]

    keep_only_tags = [
        dict(attrs={'class': ['left col633', 'article col626',
                              'columnWrapperLeft', 'articlecolumn',
                              'article_img', 'picture_caption', 'DiveTable']})]

    feeds = [
        (u'Top Stories', u'http://feeds.news24.com/articles/news24/TopStories/rss'),
        (u'South Africa', u'http://feeds.news24.com/articles/news24/SouthAfrica/rss'),
        (u'World', u'http://feeds.news24.com/articles/news24/World/rss'),
        (u'Sport', u'http://feeds.24.com/articles/sport/featured/topstories/rss')]
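A feed-based recipe like this can be smoke-tested without calibre by confirming the declared feeds parse as RSS. A minimal standalone sketch (assumes network access and that the feed is still live; Python 2, matching the era of these recipes):

    import urllib2
    import xml.etree.ElementTree as ET

    url = 'http://feeds.news24.com/articles/news24/TopStories/rss'
    root = ET.fromstring(urllib2.urlopen(url).read())
    for item in list(root.iter('item'))[:3]:
        print item.findtext('title')

Within calibre itself, running ebook-convert on the saved recipe with the --test flag fetches a couple of articles per feed and prints verbose logs.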
@@ -2,173 +2,263 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = '2010, matek09, matek09@gmail.com; 2012, admroz, a.rozewicki@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com; 2012-2013, admroz, a.rozewicki@gmail.com'

+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
 from string import capwords
 import datetime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup


 class Newsweek(BasicNewsRecipe):

     # how many issues to go back, 0 means get the most current one
-    BACK_ISSUES = 2
+    BACK_ISSUES = 1

     EDITION = '0'
     DATE = None
     YEAR = datetime.datetime.now().year

     title = u'Newsweek Polska'
     __author__ = 'matek09, admroz'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     language = 'pl'
     remove_javascript = True

     temp_files = []
     articles_are_obfuscated = True


     #
-    # Parses each article
+    # Parses article contents from one page
     #
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        br.open(url)
-        source = br.response().read()
-        page = self.index_to_soup(source)
-
-        main_section = page.find(id='mainSection')
-
-        title = main_section.find('h1')
-        info = main_section.find('ul', attrs={'class' : 'articleInfo'})
-        authors = info.find('li').find('h4')
-        article = main_section.find('div', attrs={'id' : 'article'})
-
-        # remove related articles box
-        related = article.find('div', attrs={'class' : 'relatedBox'})
-        if related is not None:
-            related.extract()
-
-        # remove div with social networking links and links to
-        # other articles in web version
-        for div in article.findAll('div'):
-            if div.find('span', attrs={'class' : 'google-plus'}):
-                div.extract()
-
-            for p in div.findAll('p'):
-                if p.find('span', attrs={'style' : 'color: rgb(255, 0, 0);'}):
-                    p.extract()
-                    continue
-                for a in p.findAll('a'):
-                    if a.find('span', attrs={'style' : 'font-size: larger;'}):
-                        a.extract()
-
-        html = unicode(title) + unicode(authors) + unicode(article)
-        next = main_section.find('li', attrs={'class' : 'next'})
-
-        while next:
-            url = next.find('a')['href']
-            br.open(url)
-            source = br.response().read()
-            page = self.index_to_soup(source)
-            main_section = page.find(id='mainSection')
-            article = main_section.find('div', attrs={'id' : 'article'})
-            aside = article.find(id='articleAside')
-            if aside is not None:
-                aside.extract()
-            html = html + unicode(article)
-            next = main_section.find('li', attrs={'class' : 'next'})
-
-        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name
+    def get_article_divs(self, css, main_section):
+        strs = []
+
+        # get all divs with given css class
+        article_divs = main_section.findAll('div', attrs={'class' : css})
+        for article_div in article_divs:
+
+            # remove sections like 'read more...' etc.
+            for p in article_div.findAll('p'):
+
+                if p.find('span', attrs={'style' : 'color: #800000; font-size: medium;'}):
+                    p.extract()
+                    continue
+
+                if p.find('span', attrs={'style' : 'font-size: medium; color: #800000;'}):
+                    p.extract()
+                    continue
+
+                if p.find('span', attrs={'style' : 'font-size: medium;'}):
+                    p.extract()
+                    continue
+
+                if p.find('span', attrs={'style' : 'color: #800000;'}):
+                    p.extract()
+                    continue
+
+                obj = p.find('object')
+                if obj:
+                    obj.extract()
+                    continue
+
+                strong = p.find('strong')
+                if strong:
+                    newest = re.compile("Tekst pochodzi z najnowszego numeru Tygodnika Newsweek")
+                    if newest.search(str(strong)):
+                        strong.extract()
+                        continue
+
+                itunes = p.find('a')
+                if itunes:
+                    reurl = re.compile("itunes.apple.com")
+                    if reurl.search(str(itunes['href'])):
+                        p.extract()
+                        continue
+
+                imagedesc = p.find('div', attrs={'class' : 'image-desc'})
+                if imagedesc:
+                    redesc = re.compile("Okładka numeru")
+                    if (redesc.search(str(imagedesc))):
+                        p.extract()
+                        continue
+
+            # get actual contents
+            for content in article_div.contents:
+                strs.append("".join(str(content)))
+
+        # return contents as a string
+        return unicode("".join(strs))
+
+    #
+    # Articles can be divided into several pages; this method parses them recursively
+    #
+    def get_article_page(self, br, url, page):
+        br.open(url)
+        source = br.response().read()
+
+        html = ''
+
+        matches = re.search(r'<article>(.*)</article>', source, re.DOTALL)
+        if matches is None:
+            print "no article tag found, returning..."
+            return
+
+        main_section = BeautifulSoup(matches.group(0))
+
+        if page == 0:
+            title = main_section.find('h1')
+            html = html + unicode(title)
+
+            authors = ''
+            authorBox = main_section.find('div', attrs={'class' : 'AuthorBox'})
+            if authorBox is not None:
+                authorH4 = authorBox.find('h4')
+                if authorH4 is not None:
+                    authors = self.tag_to_string(authorH4)
+            html = html + unicode(authors)
+
+            info = main_section.find('p', attrs={'class' : 'lead'})
+            html = html + unicode(info)
+
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac articleStart', main_section)
+        html = html + self.get_article_divs('3917dc34e07c9c7180df2ea9ef103361845c8af42b71f51b960059226090a1ac', main_section)
+
+        nextPage = main_section.find('a', attrs={'class' : 'next'})
+        if nextPage:
+            html = html + self.get_article_page(br, nextPage['href'], page+1)
+
+        return html
+
+    #
+    # Parses each article
+    #
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        html = self.get_article_page(br, url, 0)
+        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
+        self.temp_files[-1].write(html)
+        self.temp_files[-1].close()
+        return self.temp_files[-1].name
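The recursion in get_article_page reduces to: fetch a page, keep its body, and recurse while a "next" link exists. A dependency-free sketch of just that control flow, with a made-up page table standing in for the browser and HTML parsing:

    # Hypothetical three-page article; each entry maps a URL to (body, next URL).
    PAGES = {
        'page1': ('part one ', 'page2'),
        'page2': ('part two ', 'page3'),
        'page3': ('part three', None),
    }

    def get_article_page(url):
        body, next_url = PAGES[url]   # stands in for br.open() + <article> extraction
        html = body
        if next_url is not None:      # stands in for the 'next' anchor lookup
            html = html + get_article_page(next_url)
        return html

    print get_article_page('page1')  # part one part two part three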
     #
     # Goes back given number of issues. It also knows how to go back
     # to the previous year if there are not enough issues in the current one
     #
     def find_last_issue(self, archive_url):
-        archive_soup = self.index_to_soup(archive_url)
-        select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
-        options = select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value'))
-
-        # check if need to go back to previous year
-        if len(options) > self.BACK_ISSUES:
-            option = options[self.BACK_ISSUES]
-            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/','')
-            self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
-        else:
-            self.BACK_ISSUES = self.BACK_ISSUES - len(options)
-            self.YEAR = self.YEAR - 1
-            self.find_last_issue(archive_url + ',' + str(self.YEAR))
+        archive_soup = self.index_to_soup(archive_url, True)
+
+        # workaround because the html is so messed up that the find() method on the soup
+        # returns None, so we extract just the subhtml that we need
+        matches = re.search(r'<ul class="rightIssueList">(.*?)</ul>', archive_soup, re.DOTALL)
+        if matches is None:
+            return
+
+        subSoup = BeautifulSoup(matches.group(0))
+        issueLinks = subSoup.findAll('a')
+
+        # check if need to go back to previous year
+        if len(issueLinks) > self.BACK_ISSUES:
+            link = issueLinks[self.BACK_ISSUES]
+            self.EDITION = link['href'].replace('http://www.newsweek.pl/wydania/','')
+            self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
+        else:
+            self.BACK_ISSUES = self.BACK_ISSUES - len(issueLinks)
+            self.YEAR = self.YEAR - 1
+            self.find_last_issue(archive_url + '/' + str(self.YEAR))

     #
     # Looks for the last issue which we want to download. Then goes on each
     # section and article and stores them (assigning to sections)
     #
     def parse_index(self):
         archive_url = 'http://www.newsweek.pl/wydania/archiwum'
         self.find_last_issue(archive_url)
         soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
-        self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
-        main_section = soup.find(id='mainSection')
-        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
-        self.cover_url = img['src']
-        feeds = []
-        articles = {}
-        sections = []
-
-        news_list = main_section.find('ul', attrs={'class' : 'newsList'})
-        section = 'Inne'
-
-        for li in news_list.findAll('li'):
-            h3 = li.find('h3')
-            if h3 is not None:
-                section = capwords(self.tag_to_string(h3))
-                continue
-            else:
-                h2 = li.find('h2')
-                if h2 is not None:
-                    article = self.create_article(h2)
-                    if article is None:
-                        continue
-
-                    if articles.has_key(section):
-                        articles[section].append(article)
-                    else:
-                        articles[section] = [article]
-                        sections.append(section)
-
-        for section in sections:
-            feeds.append((section, articles[section]))
-        return feeds
+
+        matches = re.search(r'<div class="Issue-Entry">(.*)ARTICLE_BOTTOM', soup.prettify(), re.DOTALL)
+        if matches is None:
+            return
+
+        main_section = BeautifulSoup(matches.group(0))
+
+        # date
+        matches = re.search(r'(\d{2}-\d{2}-\d{4})', self.tag_to_string(main_section.find('h2')))
+        if matches:
+            self.DATE = matches.group(0)
+
+        # cover
+        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
+        self.cover_url = img['src']
+        feeds = []
+        articles = {}
+        sections = []
+
+        # sections
+        for sectionUl in main_section.findAll('ul', attrs={'class' : 'whatsin'}):
+
+            # section header
+            header = sectionUl.find('li', attrs={'class' : 'header'})
+            if header is None:
+                continue
+
+            section = capwords(self.tag_to_string(header))
+
+            # articles in section
+            articleUl = sectionUl.find('ul')
+            if articleUl is None:
+                continue
+
+            for articleLi in articleUl.findAll('li'):
+                # skip articles that are closed (locked)
+                closed = articleLi.find('span', attrs={'class' : 'closeart'})
+                if closed is not None:
+                    continue
+
+                article = self.create_article(articleLi)
+                if article is None:
+                    continue
+
+                if articles.has_key(section):
+                    articles[section].append(article)
+                else:
+                    articles[section] = [article]
+                    sections.append(section)
+
+        for section in sections:
+            # print("%s -> %d" % (section, len(articles[section])))
+            #
+            # for article in articles[section]:
+            #     print(" - %s" % article)
+            feeds.append((section, articles[section]))
+
+        return feeds

     #
     # Creates each article metadata (skips locked ones). The content will
     # be extracted later by another method (get_obfuscated_article).
     #
-    def create_article(self, h2):
-        article = {}
-        a = h2.find('a')
-        if a is None:
-            return None
-
-        article['title'] = self.tag_to_string(a)
-        article['url'] = a['href']
-        article['date'] = self.DATE
-        desc = h2.findNext('p')
-
-        if desc is not None:
-            article['description'] = self.tag_to_string(desc)
-        else:
-            article['description'] = ''
-        return article
+    def create_article(self, articleLi):
+        article = {}
+
+        a = articleLi.find('a')
+        if a is None:
+            return None
+
+        article['title'] = self.tag_to_string(a)
+        article['url'] = a['href']
+        article['date'] = self.DATE
+        article['description'] = ''
+
+        return article
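The section bookkeeping in parse_index is worth seeing in isolation: articles are grouped in a dict keyed by section name, while a separate list preserves the order in which sections were first seen (has_key is the Python 2 spelling of the `in` test). A minimal sketch with made-up data:

    articles = {}
    sections = []
    for section, title in [('Polska', 'A'), ('Swiat', 'B'), ('Polska', 'C')]:
        article = {'title': title}
        if articles.has_key(section):
            articles[section].append(article)
        else:
            articles[section] = [article]
            sections.append(section)

    feeds = [(section, articles[section]) for section in sections]
    print feeds  # [('Polska', [{'title': 'A'}, {'title': 'C'}]), ('Swiat', [{'title': 'B'}])]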
recipes/no_names_no_jackets.recipe (new file, 57 lines)
@@ -0,0 +1,57 @@
#
# Written:      July 2013
# Last Edited:  2013-07-25
# Version:      1.0
# Last update:  2013-07-25
#

__license__ = 'GPL v3'
__copyright__ = '2013, Armin Geller'

'''
Fetch nonamesnojackets.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1303841067(BasicNewsRecipe):

    title = u'No Names, No Jackets'
    __author__ = 'Armin Geller'  # AGe 2013-07-25
    description = u'One chapter. Just the writing. Discover something new.'
    publisher = 'nonamesnojackets.com/'
    publication_type = 'ebook news'
    tags = 'Books, Literature, E-Books, US'
    timefmt = ' [%a, %d %b %Y]'
    publication_type = 'Feed'
    language = 'en'
    encoding = 'utf-8'

    oldest_article = 14
    max_articles_per_feed = 100

    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    conversion_options = {'title': title,
                          'comments': description,
                          'tags': tags,
                          'language': language,
                          'publisher': publisher,
                          'authors': publisher,
                          }

    # cover_url = ''
    # masthead_url = ''

    extra_css = '''
        h1,h2 {font-weight:bold;font-size:large;}
        .entry-meta {font-size: 1em;text-align: left; font-style: italic}
    '''

    keep_only_tags = [
        dict(name='article')
    ]

    feeds = [(u'No Names, No Jackets', u'http://www.nonamesnojackets.com/feed/')]
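The timefmt attribute above is an strftime format string that calibre appends to the periodical's title. A quick standalone check of what it renders (the output depends on the current date):

    import time
    print time.strftime(' [%a, %d %b %Y]')  # e.g. ' [Fri, 25 Oct 2013]'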
recipes/nuus24.recipe (new file, 57 lines)
@@ -0,0 +1,57 @@
import re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


class Nuus24(BasicNewsRecipe):

    title = 'Nuus24'
    __author__ = 'Nicki de Wet'
    encoding = 'utf-8'
    description = 'Daaglikse Afrikaanse Nuus via Nuus24'
    language = 'af'
    publisher = 'Media24'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://afrikaans.news24.com/images/nuus.jpg'
    max_articles_per_feed = 25
    remove_tags_before = dict(id='TheFeed')
    remove_tags_after = dict(id='TheFeed')
    remove_tags = [dict(
        attrs={
            'class': [
                'personal-bar row-fluid', 'navbar main-menu-fixed',
                'breaking-news-wrapper', 'row-fluid comments-bg',
                'unstyled actions', 'modal-body', 'modal-header', 'desktop']}),
        dict(id=['weather-forecast', 'topics', 'side-widgets',
                 'footer-container', 'sb-container', 'myModal']),
        dict(name=['script', 'noscript', 'style'])]

    keep_only_tags = [dict(attrs={'class': ['span8 border-right']}),
                      dict(name=['article', 'section']),
                      dict(id=['img-wrapper'])]
    extra_css = """ div.carousel-inner{ overflow:hidden;display: block;height:300px;} img{display: block} """
    no_stylesheets = True

    def parse_index(self):
        soup = self.index_to_soup('http://afrikaans.news24.com/Index.aspx')

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        key = 'Nuus in Afrikaans'
        articles[key] = []
        ans = []

        for anchor in soup.findAll(True, attrs={'id': ['lnkLink']}):
            url = re.sub(r'\?.*', '', anchor['href'])
            title = self.tag_to_string(anchor, use_alt=True).strip()
            print title
            description = ''
            pubdate = strftime('%a, %d %b')
            articles[key].append(
                dict(title=title, url=url, date=pubdate,
                     description=description, content=''))
        ans = [(key, articles[key])]
        return ans
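The anchor URLs above are normalised by dropping everything from the first '?' onward. Standalone, with a made-up sample URL:

    import re

    url = 'http://afrikaans.news24.com/storie.aspx?id=123&ref=rss'
    print re.sub(r'\?.*', '', url)  # http://afrikaans.news24.com/storie.aspx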
recipes/padreydecano.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2013, Carlos Alves <carlosalves90@gmail.com>'
'''
padreydecano.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    title = 'Padre y Decano'
    __author__ = 'Carlos Alves'
    description = 'El sitio del pueblo'
    tags = 'soccer, futbol, Peñarol'
    language = 'es_UY'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = None
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(name='h1', attrs={'class':'entry-title'}),
        dict(name='div', attrs={'class':'entry-content clearfix'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['br', 'hr', 'titlebar', 'navigation']}),
        dict(name='dl', attrs={'class':'gallery-item'}),
        dict(name=['object', 'link'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Padre y Decano | Club Atlético Peñarol', u'http://www.padreydecano.com/cms/feed/')
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
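preprocess_html above strips inline style attributes so the extra_css rules win. The same transformation shown standalone with the separately installable bs4 package (an assumption for illustration; calibre bundles its own BeautifulSoup):

    from bs4 import BeautifulSoup  # stand-in for calibre's bundled parser

    soup = BeautifulSoup('<p style="color:red">Hola</p>', 'html.parser')
    for item in soup.find_all(style=True):
        del item['style']
    print soup  # <p>Hola</p>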
@@ -1,16 +0,0 @@
-from calibre.web.feeds.news import CalibrePeriodical
-
-class PCQ(CalibrePeriodical):
-
-    title = 'PCQuest'
-    calibre_periodicals_slug = 'pc-quest-india'
-
-    description = '''
-    Buying a tech product? Seeking a tech solution? Consult PCQuest, India's
-    market-leading selection and implementation guide for the latest
-    technologies: servers, business apps, security, open source, gadgets and
-    more. To subscribe visit, <a
-    href="http://news.calibre-ebook.com/periodical/pc-quest-india">calibre
-    Periodicals</a>.
-    '''
-    language = 'en_IN'