Merge
533
Changelog.yaml
@ -19,6 +19,539 @@
|
||||
# new recipes:
|
||||
# - title:
|
||||
|
||||
- version: 0.8.45
|
||||
date: 2012-03-30
|
||||
|
||||
new features:
|
||||
- title: "E-book viewer: Allow the up and down keys to scroll past section boundaries"
|
||||
|
||||
- title: "calibredb: Allow specification of basic metadata on the command line when adding books."
|
||||
tickets: [951063]
|
||||
|
||||
- title: "Driver for Samsung Galaxy Plus GT-I9001"
|
||||
|
||||
- title: "KF8 Input: Support KF8 format Amazon book samples."
|
||||
tickets: [963418]
|
||||
|
||||
- title: "When a new plugin is added to calibre for the first time, have its icon (if any) show up even when a device is connected (this can be changed by the user at the time of plugin installation)"
|
||||
|
||||
- title: "Add keyboard shortcuts for Bold, Italic and Underline to the comments editor in the edit metadata dialog"
|
||||
tickets: [963559]
|
||||
|
||||
bug fixes:
|
||||
- title: "E-book viewer: Fix last read position (and bookmarks in general) being inaccurate for some books."
|
||||
description: "The technique for marking locations in books used by the viewer has changed. The new technique should be much more accurate than the last one, especially when the font size at which the book is being viewed is changed. Note that this change means that bookmarks created with this release of calibre will not be read by previous calibre versions. On a technical note, the viewer now uses the CFI specification from the EPUB 3 standard for bookmarks."
|
||||
type: major
|
||||
|
||||
- title: "Workarounds for a few regressions in the user interface in 0.8.44 caused by the update to Qt 4.8.0"
|
||||
|
||||
- title: "Books list: Preserve the horizontal scroll position when sorting by a column"
|
||||
|
||||
- title: "Fix saving to disk and then adding the book back not restoring tags-like custom columns"
|
||||
|
||||
- title: "Linux installer: Fix completion for ebook-convert not working."
|
||||
tickets: [967834]
|
||||
|
||||
- title: "MOBI Output: Recognize type=text in addition to type=start guide elements"
|
||||
|
||||
- title: "Get Books: Updates to Nexto, Ebookpoint and Woblink stores"
|
||||
|
||||
- title: "Fix unable to clear username/password in Fetch news dialog"
|
||||
|
||||
- title: "PDF Output: Fix margin specifications not being applied"
|
||||
|
||||
- title: "Linux installer: Manually preserve the defaults.list mimetype association file to workaround buggy xdg-desktop-menu implementations in some distros."
|
||||
tickets: [926559]
|
||||
|
||||
- title: "E-book viewer: Fix regression that caused the ebook viewer to stop functioning if it is launched from the main calibre program and then the main calibre program is closed."
|
||||
tickets: [963960]
|
||||
|
||||
|
||||
improved recipes:
|
||||
- Our Daily Bread
|
||||
|
||||
new recipes:
|
||||
- title: NRC Handelsblad (free)
|
||||
author: veezh
|
||||
|
||||
- version: 0.8.44
|
||||
date: 2012-03-23
|
||||
|
||||
new features:
|
||||
- title: "E-book viewer: A whole new full screen mode."
|
||||
description: "The new mode has no toolbars to distract from the text and the ability to set the width of the column of text via Preferences in the ebook viewer. Click the Fullscreen button on the toolbar in the viewer to enter fullscreen mode (or press the F11 or Ctrl+Shift+F keys)"
|
||||
type: major
|
||||
tickets: [959830]
|
||||
|
||||
- title: "Copy to Library: If books were auto merged by the copy to library process, popup a message telling the user about it, as otherwise some people forget they have turned on auto merge and accuse calibre of losing their books."
|
||||
|
||||
- title: "Unix driver for Ectaco JetBook color"
|
||||
tickets: [958442]
|
||||
|
||||
- title: "Add a link to the 'Adding Books Preferences' in the drop down menu of the Add Books button for easier access and more prominence"
|
||||
tickets: [958145]
|
||||
|
||||
- title: "Smarten punctuation: Add a few more cases for detecting opening and closing quotes"
|
||||
|
||||
bug fixes:
|
||||
- title: "Get Books: Updates to various store plugins to deal with website changes: Amazon Europe, Waterstones, Foyles, B&N, Kobo, Woblink and Empik"
|
||||
|
||||
- title: "Catalog generation: Do not error out when generating csv/xml catalogs if the catalog title contains filename invalid characters."
|
||||
tickets: [960154]
|
||||
|
||||
- title: "RTF Output: Ignore corrupted images in the input document, instead of erroring out."
|
||||
tickets: [959600]
|
||||
|
||||
- title: "E-book viewer: Try to preserve page position when the window is resized"
|
||||
|
||||
- title: "Fix bug that caused wrong series to be shown when clicking on the first letter of a series group in the Tag Browser"
|
||||
|
||||
- title: "Fix calibre not supporting different http and https proxies."
|
||||
tickets: [960173]
|
||||
|
||||
- title: "MOBI Input: Fix regression caused by KF8 support that broke reading of ancient non-Amazon PRC files"
|
||||
|
||||
- title: "Fix EPUB to EPUB conversion of an EPUB with obfuscated fonts resulting in the fonts not being readable in Adobe Digital Editions"
|
||||
tickets: [957527]
|
||||
|
||||
- title: "RTF Output: Fix bug that broke conversion to RTF when the input document contains <img> tags with no src attribute."
|
||||
|
||||
- title: "Fix regression in 0.8.43 that broke use of general mode templates that ended in a semi-colon."
|
||||
tickets: [957295]
|
||||
|
||||
improved recipes:
|
||||
- b92
|
||||
- Various Polish news sources
|
||||
- Le Monde
|
||||
- FHM UK
|
||||
|
||||
new recipes:
|
||||
- title: Ivana Milakovic and Klub knjige
|
||||
author: Darko Miletic
|
||||
|
||||
|
||||
- version: 0.8.43
|
||||
date: 2012-03-16
|
||||
|
||||
new features:
|
||||
- title: "Template language: Speedup evaluation of general program mode templates by pre-compiling them to python. If you experience errors with this optimization, you can turn it off via Preferences->Tweaks. Also other miscellaneous optimizations in evaluating templates with composite columns."
|
||||
|
||||
- title: "MOBI Output: Add an option to not convert all images to JPEG when creating MOBI files. For maximum compatibility of the produced MOBI files, do not use this option."
|
||||
tickets: [954025]
|
||||
|
||||
- title: "Add iPad3 Output Profile"
|
||||
|
||||
bug fixes:
|
||||
- title: "KF8 Input: Add support for KF8 files with obfuscated embedded fonts"
|
||||
tickets: [953260]
|
||||
|
||||
- title: "Make the stars in the book list a little larger on windows >= vista"
|
||||
|
||||
- title: "Revised periodical Section layout, for touchscreen devices resolving iBooks problem with tables spanning multiple pages"
|
||||
|
||||
- title: "Read dc:contributor metadata from MOBI files"
|
||||
|
||||
- title: "MOBI Output: Fix a regression that caused the generated thumbnail embedded in calibre produced MOBI files to be a large, low quality image instead of a small, high quality image. You would have been affected by this bug only if you directly used the output from calibre, without exporting it via send to device or save to disk."
|
||||
tickets: [954254]
|
||||
|
||||
- title: "KF8 Input: Recognize OpenType embedded fonts as well."
|
||||
tickets: [954728]
|
||||
|
||||
- title: "Fix regression in 0.8.41 that caused file:/// URLs to stop working in the news download system on windows."
|
||||
tickets: [955581]
|
||||
|
||||
- title: "When setting metadata in MOBI files fix cover not being updated if the mobi file has its first image record as the cover"
|
||||
|
||||
- title: "Fix column coloring rules based on the size column not working"
|
||||
tickets: [953737]
|
||||
|
||||
improved recipes:
|
||||
- Microwaves and RF
|
||||
- idg.se
|
||||
|
||||
new recipes:
|
||||
- title: SatMagazine
|
||||
author: kiavash
|
||||
|
||||
- version: 0.8.42
|
||||
date: 2012-03-12
|
||||
|
||||
new features:
|
||||
- title: "Support for reading Amazon's new KF8 format"
|
||||
type: major
|
||||
description: "calibre can now both view and convert MOBI files that contain Amazon's new KF8 (Kindle Fire) format"
|
||||
|
||||
- title: "Add a tweak to Preferences->Tweaks to control the font size used in the book details panel"
|
||||
tickets: [948357]
|
||||
|
||||
- title: "Allow specifying a list of file types to exclude when automatically adding files from a folder"
|
||||
tickets: [943025]
|
||||
|
||||
- title: "Show ratings in the book details panel as stars. Also allow the user to change the alignment of the ratings column in the main books list. No longer display the stars in blue, instead their color can be customized via the column coloring rules, like any other column"
|
||||
|
||||
- title: "When setting metadata in EPUB ensure that the <meta name=cover> tag has its name attribute first. Needed for the Nook."
|
||||
|
||||
- title: "Drivers for Novo 7, LG G2x and Zenithink T-280"
|
||||
tickets: [941671, 940625, 940527]
|
||||
|
||||
- title: "Update linux binaries to Qt 4.8.0"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix some rar files causing crashes on OS X (updated libunrar.dylib in the OS X build)"
|
||||
tickets: [951185]
|
||||
|
||||
- title: "MOBI Output: Ignore the Table of Contents pointed to by the guide, if it contains no links"
|
||||
|
||||
- title: "ODT Input: Ignore margin declaration in ODT styles if more specific margin-* declarations are present"
|
||||
tickets: [941134]
|
||||
|
||||
- title: "Conversion pipeline: Fix @import rules in CSS stylesheets that have comments on their first few lines being ignored."
|
||||
|
||||
- title: "EPUB Input: When extracting the contents of epub files on windows, do not error out if one or more of the components in the epub file have filepaths containing characters that are invalid for the windows filesystem, instead, just replace those characters, since those entries are likely to be errors in the zip container anyway."
|
||||
tickets: [950081]
|
||||
|
||||
- title: "Textile output: Fix issue with blockquotes and sentences getting removed."
|
||||
|
||||
- title: "MOBI Output: When using the prefer author sort conversion option, handle multiple authors better."
|
||||
tickets: [947146]
|
||||
|
||||
- title: "Fix regression in 0.8.41 that broke direct connection to iDevices in windows"
|
||||
tickets: [944534]
|
||||
|
||||
- title: "Fix the download bulk metadata completed popup causing a crash if the Esc key is pressed."
|
||||
tickets: [943056]
|
||||
|
||||
- title: "Fix rating values doubled in CSV/XML catalogs"
|
||||
tickets: [942790]
|
||||
|
||||
- title: "EPUB Input: Remove non markup documents from the spine automatically, instead of erroring out"
|
||||
|
||||
- title: "When formatting ratings in templates, etc., do not have an unnecessary .0"
|
||||
|
||||
- title: "Calibre portable: Do not allow calibre portable to run if it is placed in a location whose path is too long. Also hide the library location setup in the welcome wizard when running the portable build."
|
||||
|
||||
- title: "Fix regression in 0.8.41 that broke calibre if the TMP or TEMP environment variable is set to the root of a drive."
|
||||
tickets: [952284]
|
||||
|
||||
- title: "Fix display of ratings type custom fields in the content server"
|
||||
tickets: [940600]
|
||||
|
||||
|
||||
improved recipes:
|
||||
- La Jornada
|
||||
- Chicago Tribune
|
||||
- Mediapart
|
||||
- rue89
|
||||
|
||||
new recipes:
|
||||
- title: Racjonalista
|
||||
author: Racjonlista
|
||||
|
||||
- title: JAPAA
|
||||
author: adoucette
|
||||
|
||||
|
||||
- version: 0.8.41
|
||||
date: 2012-02-24
|
||||
|
||||
new features:
|
||||
- title: "Driver for Sony Xperia Play 4G"
|
||||
tickets: [938831]
|
||||
|
||||
- title: "News download system: Allow use of __future__ in recipes, and do not change line numbers of code in the recipe when compiling it"
|
||||
|
||||
- title: "Use the My Documents folder as the default location for the Calibre Library folder on first start in windows"
|
||||
tickets: [934840]
|
||||
|
||||
- title: "Add a tweak to Preferences->Tweaks to control the order in which categories appear in the Tag Browser"
|
||||
|
||||
- title: "Tag Browser: Add an entry to the right click menu to quickly delete tags"
|
||||
tickets: [934509]
|
||||
|
||||
- title: "Amazon metadata download: Try to scrape series information from the amazon details page. Note that currently very few books have series info available. Often the page for hardcover will have series, but the Kindle edition will not. In such cases calibre may or may not find the series, depending on which page it ends up using."
|
||||
|
||||
- title: "Content server: Add favicon to OPDS feeds."
|
||||
tickets: [934731]
|
||||
|
||||
bug fixes:
|
||||
- title: "RTF Input: Fix some WMF images embedded in RTF files being distorted on conversion."
|
||||
tickets: [934167]
|
||||
|
||||
- title: "Fix long standing bug preventing calibre from working on east asian windows installs when the user name in windows has non-ascii characters"
|
||||
tickets: [937389]
|
||||
|
||||
- title: "Get Books: Fix Baen Webscription and O'Reilly stores. Fix price detection for Google Books"
|
||||
|
||||
- title: "MOBI Output: When the same anchor is present more than once in the input document, use the first occurrence rather than the last one."
|
||||
tickets: [934031]
|
||||
|
||||
- title: "Use the 'default cover font' tweak when generating default masthead images as well"
|
||||
tickets: [939256]
|
||||
|
||||
- title: "Fix content server does not correctly display custom field of type 'rating'"
|
||||
tickets: [938303]
|
||||
|
||||
- title: "Fix welcome wizard does not save send-from email info unless send-to field is filled"
|
||||
tickets: [937087]
|
||||
|
||||
- title: "When reading metadata from odt files, use initial-creator in preference to creator for setting the author field"
|
||||
tickets: [934564]
|
||||
|
||||
- title: "Fix conversion erroring out when the input document has very long and thin images"
|
||||
tickets: [935234]
|
||||
|
||||
improved recipes:
|
||||
- The Sun
|
||||
- Various Polish news sources
|
||||
- Mediapart
|
||||
|
||||
new recipes:
|
||||
- title: La pausa caffe
|
||||
author: faber1971
|
||||
|
||||
- title: Various Polish news sources
|
||||
author: fenuks
|
||||
|
||||
|
||||
- version: 0.8.40
|
||||
date: 2012-02-17
|
||||
|
||||
new features:
|
||||
- title: "Amazon metadata download: Support the new 'Book Description' section that Amazon publishes for some books. Also workaround the amazon US servers occasionally returning broken markup leading to calibre not finding any matches for books on Amazon."
|
||||
|
||||
- title: "Kindle driver: Add an option to allow using page counts stored in a custom column. Go to Preferences->Plugins and customize the Kindle driver, to tell it to use a custom column to get page count data. See http://www.mobileread.com/forums/showpost.php?p=1963075&postcount=215 for details."
|
||||
|
||||
- title: "Template language: Add a current_library_name() function that can be used to return the name of the currently opened library in calibre"
|
||||
|
||||
- title: "Driver for Xperia Neo and PocketBook A10"
|
||||
tickets: [930788]
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh format information in standalone calibre-server processes"
|
||||
|
||||
- title: "Fix regression in 0.8.39 that broke getting covers from some epub files on OS X."
|
||||
tickets: [932507]
|
||||
|
||||
- title: "Reading metadata from HTML files: Do not take a very long time for very large HTML files. Also fix reading metadata from meta tags with multiple spaces before the content attribute."
|
||||
tickets: [932262]
|
||||
|
||||
- title: "EPUB Output: Fix splitting breaking internal links in the epub, if the links pointed to files with URL unsafe characters in their file names."
|
||||
tickets: [929966]
|
||||
|
||||
- title: "Fix auto adding not leaving languages field blank when book has no defined language"
|
||||
tickets: [930648]
|
||||
|
||||
improved recipes:
|
||||
- Samanyolu Haber
|
||||
- Kurier
|
||||
- Le devoir
|
||||
- Daily Mirror
|
||||
- Common Dreams
|
||||
- Pescanik
|
||||
|
||||
new recipes:
|
||||
- title: Asian Review of Books
|
||||
author: Darko Miletic
|
||||
|
||||
- title: Albert Mohler, Desiring God, Living Stones and Resurgence
|
||||
author: Peter Grungi
|
||||
|
||||
- title: Novinite BG
|
||||
author: M3 Web
|
||||
|
||||
- title: Catholic Daily Readings
|
||||
author: adoucette
|
||||
|
||||
- title: Consortium News and Microwave and RF magazine
|
||||
author: kiavash
|
||||
|
||||
- version: 0.8.39
|
||||
date: 2012-02-10
|
||||
|
||||
new features:
|
||||
- title: "Auto-adding: Add an option to check for duplicates when auto adding."
|
||||
tickets: [926962]
|
||||
|
||||
- title: "Content server: Export a second record via mDNS that points to the full OPDS feed in addition to the one pointing to the Stanza feed. The new record is of type _calibre._tcp."
|
||||
tickets: [929304]
|
||||
|
||||
- title: "Allow specifying a set of categories that are not partitioned even if they contain a large number of items in the Tag Browser. Preference is available under Look & Feel->Tag Browser"
|
||||
|
||||
- title: "Allow setting a URL prefix for the content server that runs embedded in the calibre GUI as well."
|
||||
tickets: [928905]
|
||||
|
||||
- title: "Allow output of identifiers data in CSV/XML/BiBTeX catalogs"
|
||||
tickets: [927737]
|
||||
|
||||
- title: "Driver for Motorola Droid XT910, Nokia E71 and HTC EVO 3D."
|
||||
tickets: [928202, 927818, 929400]
|
||||
|
||||
- title: "Cut down the time taken to launch worker processes by 40%"
|
||||
|
||||
- title: "You can now configure the calibre settings for the currently connected device by right clicking on the device icon in the toolbar, instead of having to go through Preferences->Plugins"
|
||||
|
||||
bug fixes:
|
||||
- title: "Auto-adding: Do not add incomplete files when files are downloaded directly into the auto add folder."
|
||||
tickets: [926578]
|
||||
|
||||
- title: "When running multiple delete from device jobs, fix the device view sometimes marking the wrong books as being deleted, after the first delete job completes."
|
||||
tickets: [927972]
|
||||
|
||||
- title: "MOBI Input: Handle files that have spurious closing </body> and/or </html> tags in their markup."
|
||||
tickets: [925833]
|
||||
|
||||
- title: "RTF Input: Strip out false color specifications, as they cause artifacts when converted to MOBI"
|
||||
|
||||
improved recipes:
|
||||
- Updated Postmedia publications
|
||||
- Foreign Affairs
|
||||
- Read It Later
|
||||
- Microwave Journal
|
||||
- tagesschau.de
|
||||
|
||||
new recipes:
|
||||
- title: Vancouver Province and Windsor Star
|
||||
author: Nick Redding
|
||||
|
||||
- title: Onda Rock
|
||||
author: faber1971
|
||||
|
||||
- title: Il Manifesto
|
||||
author: Giacomo Lacava
|
||||
|
||||
- version: 0.8.38
|
||||
date: 2012-02-03
|
||||
|
||||
new features:
|
||||
- title: "Implement the ability to automatically add books to calibre from a specified folder."
|
||||
type: major
|
||||
description: "calibre can now watch a folder on your computer and instantly add any files you put there to the calibre library as new books. You can tell calibre which folder to watch via Preferences->Adding Books->Automatic Adding."
|
||||
tickets: [920249]
|
||||
|
||||
- title: "Conversion: When automatically inserting page breaks, do not put a page break before a <h1> or <h2> tag if it is immediately preceded by another <h1> or <h2> tag."
|
||||
|
||||
- title: "Driver for EZReader T730 and Point-of-View PlayTab Pro"
|
||||
tickets: [923283, 922969]
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix device entry not visible in menubar even when it has been added via Preferences->Toolbars."
|
||||
tickets: [923175]
|
||||
|
||||
- title: "Fix metadata plugboards not applied when auto sending news by email"
|
||||
|
||||
- title: "Fix regression in 0.8.34 that broke recipes that used skip_ad_pages() but not get_browser(). "
|
||||
tickets: [923724]
|
||||
|
||||
- title: "Restore device support on FreeBSD, by using HAL"
|
||||
tickets: [924503]
|
||||
|
||||
- title: "Get books: Show no more than 10 results from the Gandalf store"
|
||||
|
||||
- title: "Content server: Fix metadata not being updated when sending for some MOBI files."
|
||||
tickets: [923130]
|
||||
|
||||
- title: "Heuristic processing: Fix the italicize common patterns algorithm breaking on some HTML markup."
|
||||
tickets: [922317]
|
||||
|
||||
- title: "When trying to find an ebook inside a zip file, do not fail if the zip file itself contains other zip files."
|
||||
tickets: [925670]
|
||||
|
||||
- title: "EPUB Input: Handle EPUBs with duplicate entries in the manifest."
|
||||
tickets: [925831]
|
||||
|
||||
- title: "MOBI Input: Handle files that have extra </html> tags sprinkled through out their markup."
|
||||
tickets: [925833]
|
||||
|
||||
improved recipes:
|
||||
- Metro Nieuws NL
|
||||
- FHM UK
|
||||
|
||||
new recipes:
|
||||
- title: Strange Horizons
|
||||
author: Jim DeVona
|
||||
|
||||
- title: Telegraph India and Live Mint
|
||||
author: Krittika Goyal
|
||||
|
||||
- title: High Country News
|
||||
author: Armin Geller
|
||||
|
||||
- title: Countryfile
|
||||
author: Dave Asbury
|
||||
|
||||
- title: Liberation (subscription version)
|
||||
author: Remi Vanicat
|
||||
|
||||
- title: Various Italian news sources
|
||||
author: faber1971
|
||||
|
||||
|
||||
- version: 0.8.37
|
||||
date: 2012-01-27
|
||||
|
||||
new features:
|
||||
- title: "Allow calibre to be run simultaneously in two different user accounts on windows."
|
||||
tickets: [919856]
|
||||
|
||||
- title: "Driver for Motorola Photon and Point of View PlayTab"
|
||||
tickets: [920582, 919080]
|
||||
|
||||
- title: "Add a checkbox to preferences->plugins to show only user installed plugins"
|
||||
|
||||
- title: "Add a restart calibre button to the warning dialog that pops up after changing some preference that requires a restart"
|
||||
|
||||
bug fixes:
|
||||
- title: "Fix regression in 0.8.36 that caused the remove format from book function to only delete the entry from the database and not delete the actual file from the disk"
|
||||
tickets: [921721]
|
||||
|
||||
- title: "Fix regression in 0.8.36 that caused the calibredb command to not properly refresh the format information in the GUI"
|
||||
tickets: [919494]
|
||||
|
||||
- title: "E-book viewer: Preserve the current position more accurately when changing font size/other preferences."
|
||||
tickets: [912406]
|
||||
|
||||
- title: "Conversion pipeline: Fix items in the <guide> that refer to files with URL unsafe filenames being ignored."
|
||||
tickets: [920804]
|
||||
|
||||
- title: "Fix calibre not running on linux systems that set LANG to an empty string"
|
||||
|
||||
- title: "On first run of calibre, ensure the columns are sized appropriately"
|
||||
|
||||
- title: "MOBI Output: Do not collapse whitespace when setting the comments metadata in newly created MOBI files"
|
||||
|
||||
- title: "HTML Input: Fix handling of files with ä characters in their filenames."
|
||||
tickets: [919931]
|
||||
|
||||
- title: "Fix the sort on startup tweak ignoring more than three levels"
|
||||
tickets: [919584]
|
||||
|
||||
- title: "Edit metadata dialog: Fix a bug that broke adding of a file to the book that calibre did not previously know about in the books directory while simultaneously changing the author or title of the book."
|
||||
tickets: [922003]
|
||||
|
||||
improved recipes:
|
||||
- People's Daily
|
||||
- Plus Info
|
||||
- grantland.com
|
||||
- Eret es irodalom
|
||||
- Sueddeutsche.de
|
||||
|
||||
new recipes:
|
||||
- title: Mumbai Mirror
|
||||
author: Krittika Goyal
|
||||
|
||||
- title: Real Clear
|
||||
author: TMcN
|
||||
|
||||
- title: Gazeta Wyborcza
|
||||
author: ravcio
|
||||
|
||||
- title: The Daily News Egypt and al masry al youm
|
||||
author: Omm Mishmishah
|
||||
|
||||
- title: Klip.me
|
||||
author: Ken Sun
|
||||
|
||||
|
||||
- version: 0.8.36
|
||||
date: 2012-01-20
|
||||
|
||||
|
152
imgsrc/calibreSymbols.spd
Normal file
@ -0,0 +1,152 @@
|
||||
SplineFontDB: 3.0
|
||||
FontName: calibreSymbols
|
||||
FullName: calibre Symbols
|
||||
FamilyName: calibre Symbols
|
||||
Weight: Medium
|
||||
Copyright: Created by Kovid Goyal with FontForge 2.0 (http://fontforge.sf.net)
|
||||
UComments: "2012-2-27: Created."
|
||||
Version: 001.000
|
||||
ItalicAngle: 0
|
||||
UnderlinePosition: -100
|
||||
UnderlineWidth: 50
|
||||
Ascent: 800
|
||||
Descent: 200
|
||||
LayerCount: 2
|
||||
Layer: 0 0 "Back" 1
|
||||
Layer: 1 0 "Fore" 0
|
||||
NeedsXUIDChange: 1
|
||||
XUID: [1021 913 325894820 11538708]
|
||||
FSType: 0
|
||||
OS2Version: 0
|
||||
OS2_WeightWidthSlopeOnly: 0
|
||||
OS2_UseTypoMetrics: 1
|
||||
CreationTime: 1330331997
|
||||
ModificationTime: 1330487767
|
||||
OS2TypoAscent: 0
|
||||
OS2TypoAOffset: 1
|
||||
OS2TypoDescent: 0
|
||||
OS2TypoDOffset: 1
|
||||
OS2TypoLinegap: 90
|
||||
OS2WinAscent: 0
|
||||
OS2WinAOffset: 1
|
||||
OS2WinDescent: 0
|
||||
OS2WinDOffset: 1
|
||||
HheadAscent: 0
|
||||
HheadAOffset: 1
|
||||
HheadDescent: 0
|
||||
HheadDOffset: 1
|
||||
MarkAttachClasses: 1
|
||||
DEI: 91125
|
||||
Encoding: UnicodeFull
|
||||
UnicodeInterp: none
|
||||
NameList: Adobe Glyph List
|
||||
DisplaySize: -24
|
||||
AntiAlias: 1
|
||||
FitToEm: 1
|
||||
WidthSeparation: 150
|
||||
WinInfo: 9600 75 22
|
||||
BeginPrivate: 0
|
||||
EndPrivate
|
||||
BeginChars: 1114112 3
|
||||
|
||||
StartChar: uni2605
|
||||
Encoding: 9733 9733 0
|
||||
Width: 979
|
||||
VWidth: -26
|
||||
Flags: W
|
||||
LayerCount: 2
|
||||
Fore
|
||||
SplineSet
|
||||
551.923 352.862 m 1
|
||||
749.497 369.592 l 2
|
||||
804.954 374.123 833.379 376.389 834.765 376.389 c 0
|
||||
852.095 376.389 860.761 368.896 860.761 353.907 c 0
|
||||
860.761 347.981 859.028 343.363 855.562 340.052 c 0
|
||||
852.095 336.74 825.578 319.225 776.012 287.506 c 2
|
||||
609.635 180.323 l 1
|
||||
716.22 -88.417 l 2
|
||||
717.606 -91.2051 718.301 -95.3877 718.301 -100.965 c 0
|
||||
718.301 -106.193 716.394 -110.725 712.58 -114.558 c 0
|
||||
708.769 -118.393 704.608 -120.31 700.104 -120.31 c 0
|
||||
695.943 -120.31 691.61 -118.828 687.103 -115.866 c 0
|
||||
682.598 -112.902 658.162 -92.251 613.795 -53.9082 c 2
|
||||
466.134 74.71 l 1
|
||||
320.554 -51.8184 l 2
|
||||
274.802 -91.5547 249.758 -112.902 245.426 -115.866 c 0
|
||||
241.092 -118.828 236.846 -120.31 232.688 -120.31 c 0
|
||||
227.835 -120.31 223.415 -118.306 219.429 -114.297 c 0
|
||||
215.442 -110.289 213.449 -105.844 213.449 -100.965 c 0
|
||||
213.449 -97.8281 223.329 -71.3379 243.087 -21.4932 c 2
|
||||
322.115 180.323 l 1
|
||||
152.618 289.598 l 2
|
||||
104.783 320.271 79.2217 337.176 75.9297 340.313 c 0
|
||||
72.6357 343.45 70.9893 347.981 70.9893 353.907 c 0
|
||||
70.9893 369.243 79.8291 376.912 97.5059 376.912 c 0
|
||||
98.8926 376.912 123.155 374.82 170.296 370.638 c 2
|
||||
379.825 352.862 l 1
|
||||
427.14 555.201 l 2
|
||||
439.271 607.834 446.811 636.764 449.757 641.992 c 0
|
||||
452.702 647.221 458.162 649.834 466.134 649.834 c 0
|
||||
474.454 649.834 480 646.96 482.772 641.208 c 0
|
||||
485.545 635.457 493.518 604.173 506.689 547.357 c 2
|
||||
551.923 352.862 l 1
|
||||
EndSplineSet
|
||||
Validated: 524289
|
||||
EndChar
|
||||
|
||||
StartChar: zero
|
||||
Encoding: 48 48 1
|
||||
Width: 1303
|
||||
VWidth: 2048
|
||||
Flags: W
|
||||
HStem: -43.3789 76.7998<582.097 721.09> 623.341 76.7998<582.097 721.091>
|
||||
VStem: 403.82 97.4395<148.044 508.66> 802.221 96.959<148.044 508.659>
|
||||
LayerCount: 2
|
||||
Fore
|
||||
SplineSet
|
||||
651.5 623.341 m 0
|
||||
601.58 623.341 564.061 598.78 538.939 549.66 c 0
|
||||
513.82 500.541 501.26 426.7 501.26 328.141 c 0
|
||||
501.26 229.9 513.82 156.221 538.939 107.101 c 0
|
||||
564.061 57.9805 601.58 33.4209 651.5 33.4209 c 0
|
||||
701.74 33.4209 739.42 57.9805 764.54 107.101 c 0
|
||||
789.66 156.221 802.221 229.9 802.221 328.141 c 0
|
||||
802.221 426.7 789.66 500.541 764.54 549.66 c 0
|
||||
739.42 598.78 701.74 623.341 651.5 623.341 c 0
|
||||
651.5 700.141 m 0
|
||||
731.82 700.141 793.18 668.38 835.58 604.859 c 0
|
||||
877.979 541.341 899.18 449.101 899.18 328.141 c 0
|
||||
899.18 207.5 877.979 115.421 835.58 51.9004 c 0
|
||||
793.18 -11.6201 731.819 -43.3789 651.5 -43.3789 c 0
|
||||
571.18 -43.3789 509.82 -11.6201 467.42 51.9004 c 0
|
||||
425.021 115.421 403.82 207.5 403.82 328.141 c 0
|
||||
403.82 449.101 425.021 541.341 467.42 604.859 c 0
|
||||
509.82 668.38 571.18 700.141 651.5 700.141 c 0
|
||||
EndSplineSet
|
||||
Validated: 1
|
||||
EndChar
|
||||
|
||||
StartChar: period
|
||||
Encoding: 46 46 2
|
||||
Width: 516
|
||||
VWidth: 2048
|
||||
Flags: W
|
||||
HStem: 53.4004 166.199<203.263 309.297>
|
||||
VStem: 174.6 163.801<82.9501 190.955>
|
||||
LayerCount: 2
|
||||
Fore
|
||||
SplineSet
|
||||
338.4 142.8 m 0
|
||||
338.4 119.2 330.5 98.4004 314.7 80.4004 c 0
|
||||
298.9 62.4004 277 53.4004 249 53.4004 c 0
|
||||
225.4 53.4004 207.1 61.2002 194.1 76.7998 c 0
|
||||
181.1 92.4004 174.6 111 174.6 132.6 c 0
|
||||
174.6 155.8 182.6 176.1 198.6 193.5 c 0
|
||||
214.6 210.9 236.8 219.6 265.2 219.6 c 0
|
||||
288.8 219.6 306.9 212.2 319.5 197.4 c 0
|
||||
332.1 182.6 338.4 164.4 338.4 142.8 c 0
|
||||
EndSplineSet
|
||||
Validated: 1
|
||||
EndChar
|
||||
EndChars
|
||||
EndSplineFont
|
18
recipes/albert_mohler.recipe
Normal file
@ -0,0 +1,18 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AlbertMohlersBlog(BasicNewsRecipe):
    """Recipe that builds its content from the Albert Mohler's Blog feed.

    The class only declares configuration attributes; all download and
    conversion behaviour comes from ``BasicNewsRecipe``.
    """

    # Identification / metadata.
    title = u'Albert Mohler\'s Blog'
    __author__ = 'Peter Grungi'
    publisher = 'Albert Mohler'
    author = 'Albert Mohler'
    # Declared once; the original assigned the same value twice.
    language = 'en'

    # Article selection limits (see the calibre recipe API for exact units).
    oldest_article = 90
    max_articles_per_feed = 10

    # Let calibre strip page clutter automatically instead of listing tags.
    auto_cleanup = True

    cover_url = 'http://www.albertmohler.com/wp-content/themes/albert-mohler-v5/img/logo-am-lg.gif'

    # (feed title, feed URL) pairs.
    feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]
|
@ -6,6 +6,7 @@ class Android_com_pl(BasicNewsRecipe):
|
||||
description = 'Android.com.pl - biggest polish Android site'
|
||||
category = 'Android, mobile'
|
||||
language = 'pl'
|
||||
use_embedded_content=True
|
||||
cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
|
@ -7,6 +7,7 @@ class Archeowiesci(BasicNewsRecipe):
|
||||
language = 'pl'
|
||||
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
|
||||
oldest_article = 7
|
||||
needs_subscription='optional'
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})]
|
||||
@ -16,6 +17,16 @@ class Archeowiesci(BasicNewsRecipe):
|
||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||
for feed in feeds:
|
||||
for article in feed.articles[:]:
|
||||
if 'subskrypcja' in article.title:
|
||||
if self.username is None and 'subskrypcja' in article.title:
|
||||
feed.articles.remove(article)
|
||||
return feeds
|
||||
|
||||
def get_browser(self):
    """Return the recipe's browser, logged in when credentials are set.

    The browser is obtained from ``BasicNewsRecipe.get_browser``. When both
    ``self.username`` and ``self.password`` are set, the login page is
    opened and the form named ``loginform`` is filled in and submitted.
    Otherwise the browser is returned without logging in.
    """
    # The base implementation is looked up on the class, so the instance
    # must be passed explicitly; calling it with no argument raises
    # TypeError instead of returning a browser.
    br = BasicNewsRecipe.get_browser(self)
    if self.username is not None and self.password is not None:
        br.open('http://archeowiesci.pl/wp-login.php')
        br.select_form(name='loginform')
        # 'log' and 'pwd' are the login form's user name and password fields.
        br['log'] = self.username
        br['pwd'] = self.password
        br.submit()
    return br
|
51
recipes/asianreviewofbooks.recipe
Normal file
@ -0,0 +1,51 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.asianreviewofbooks.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AsianReviewOfBooks(BasicNewsRecipe):
    """Recipe for The Asian Review of Books, driven by the site's RSS feed."""

    # --- identification -------------------------------------------------
    title = 'The Asian Review of Books'
    __author__ = 'Darko Miletic'
    description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.'
    publisher = 'The Asian Review of Books'
    category = 'literature, books, reviews, Asia'
    language = 'en_HK'
    publication_type = 'magazine'
    masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'

    # --- fetching ---------------------------------------------------------
    oldest_article = 30
    max_articles_per_feed = 100
    use_embedded_content = False
    encoding = 'cp1252'
    feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')]

    # --- clean-up and styling ---------------------------------------------
    no_stylesheets = True
    remove_tags = [dict(name=['object','script','iframe','embed'])]
    remove_attributes = ['style', 'onclick']
    extra_css = """
                    body{font-family: serif}
                    .big {font-size: xx-large}
                    .bold {font-weight: bold}
                    .italic {font-style: italic}
                    .small {font-size: small}
                    img {display: block}
                """

    # Metadata handed to the conversion step; reuses the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    def print_version(self, url):
        """Rewrite a feed URL to the site's ``getarticle.php`` endpoint.

        Everything after the last ``?ID=`` in ``url`` is used as the article
        id; everything before it is kept as the prefix of the new URL.
        """
        prefix, _sep, article_id = url.rpartition('?ID=')
        return prefix + 'getarticle.php?articleID=' + article_id + '&stats=web'

    def preprocess_raw_html(self, raw, url):
        """Wrap the downloaded markup in a minimal html/head/body skeleton."""
        opening = '<html><head><title>title</title></head><body>'
        closing = '</body></html>'
        return opening + raw + closing
|
||||
|
@ -1,15 +1,18 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
import re
|
||||
class Astronomia_pl(BasicNewsRecipe):
|
||||
title = u'Astronomia.pl'
|
||||
__author__ = 'fenuks'
|
||||
description = 'Astronomia - polish astronomy site'
|
||||
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
|
||||
cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
|
||||
category = 'astronomy, science'
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
#no_stylesheets=True
|
||||
extra_css='#h2 {font-size: 18px;}'
|
||||
no_stylesheets=True
|
||||
preprocess_regexps = [(re.compile(ur'<b>Przeczytaj także:.*?</BODY>', re.DOTALL), lambda match: '</BODY>') ]
|
||||
remove_tags_before=dict(name='div', attrs={'id':'a1'})
|
||||
keep_only_tags=[dict(name='div', attrs={'id':['a1', 'h2']})]
|
||||
feeds = [(u'Wiadomości z astronomii i astronautyki', u'http://www.astronomia.pl/rss/')]
|
||||
|
@ -1,6 +1,6 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
b92.net
|
||||
'''
|
||||
@ -20,13 +20,13 @@ class B92(BasicNewsRecipe):
|
||||
encoding = 'cp1250'
|
||||
language = 'sr'
|
||||
publication_type = 'newsportal'
|
||||
masthead_url = 'http://www.b92.net/images/fp/logo.gif'
|
||||
masthead_url = 'http://b92s.net/v4/img/new-logo.png'
|
||||
extra_css = """
|
||||
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
|
||||
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
|
||||
body{font-family: Arial,Helvetica,sans1,sans-serif}
|
||||
.articledescription{font-family: serif1, serif}
|
||||
.article-info2,.article-info1{text-transform: uppercase; font-size: small}
|
||||
img{display: block}
|
||||
.sms{font-weight: bold}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
@ -37,11 +37,17 @@ class B92(BasicNewsRecipe):
|
||||
, 'linearize_tables' : True
|
||||
}
|
||||
|
||||
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(u'\u0110'), lambda match: u'\u00D0'),
|
||||
(re.compile(r'<html.*?<body>', re.DOTALL|re.IGNORECASE), lambda match: '<html><head><title>something</title></head><body>')
|
||||
]
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':['article-info1','article-text']})]
|
||||
remove_attributes = ['width','height','align','hspace','vspace','border']
|
||||
remove_tags = [dict(name=['embed','link','base','meta'])]
|
||||
remove_attributes = ['width','height','align','hspace','vspace','border','lang','xmlns:fb']
|
||||
remove_tags = [
|
||||
dict(name=['embed','link','base','meta','iframe'])
|
||||
,dict(attrs={'id':'social'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Vesti' , u'http://www.b92.net/info/rss/vesti.xml' )
|
||||
|
@ -4,16 +4,17 @@ class Benchmark_pl(BasicNewsRecipe):
|
||||
title = u'Benchmark.pl'
|
||||
__author__ = 'fenuks'
|
||||
description = u'benchmark.pl -IT site'
|
||||
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
|
||||
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets=True
|
||||
preprocess_regexps = [(re.compile(ur'\bWięcej o .*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
|
||||
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;"> Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
|
||||
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']})]
|
||||
remove_tags_after=dict(name='div', attrs={'class':'body'})
|
||||
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']})]
|
||||
remove_tags=[dict(name='div', attrs={'class':['kategoria', 'socialize', 'thumb', 'panelOcenaObserwowane', 'categoryNextToSocializeGallery']}), dict(name='table', attrs={'background':'http://www.benchmark.pl/uploads/backend_img/a/fotki_newsy/opinie_dnia/bg.png'}), dict(name='table', attrs={'width':'210', 'cellspacing':'1', 'cellpadding':'4', 'border':'0', 'align':'right'})]
|
||||
INDEX= 'http://www.benchmark.pl'
|
||||
feeds = [(u'Aktualności', u'http://www.benchmark.pl/rss/aktualnosci-pliki.xml'),
|
||||
(u'Testy i recenzje', u'http://www.benchmark.pl/rss/testy-recenzje-minirecenzje.xml')]
|
||||
|
16
recipes/beppe_grillo.recipe
Normal file
@ -0,0 +1,16 @@
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1327747616(BasicNewsRecipe):
    """Feed-only recipe for Beppe Grillo's blog."""

    # Identification
    title = u'Beppe Grillo'
    __author__ = 'faber1971'
    description = 'Blog of the famous comedian and politician Beppe Grillo - v1.00 (28, January 2012)'
    language = 'it'

    # Fetch limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # (feed title, feed URL) pairs
    feeds = [
        (u'Beppe Grillo', u'http://feeds.feedburner.com/beppegrillo/atom'),
    ]
|
||||
|
@ -10,10 +10,11 @@ class Biolog_pl(BasicNewsRecipe):
|
||||
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
|
||||
category = 'biology'
|
||||
language = 'pl'
|
||||
masthead_url= 'http://www.biolog.pl/naukowy,portal,biolog.png'
|
||||
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
|
||||
no_stylesheets = True
|
||||
#keeps_only_tags=[dict(id='main')]
|
||||
remove_tags_before=dict(id='main')
|
||||
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
|
||||
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
|
||||
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'}), dict(name='span', attrs={'class':'menu_odsylacze'})]
|
||||
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
|
||||
|
@ -1,95 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
|
||||
'''
|
||||
borba.rs
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Borba(BasicNewsRecipe):
|
||||
title = 'Borba Online'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Dnevne novine Borba Online'
|
||||
publisher = 'IP Novine Borba'
|
||||
category = 'news, politics, Serbia'
|
||||
language = 'sr'
|
||||
|
||||
lang = _('sr-Latn-RS')
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
use_embedded_content = False
|
||||
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
|
||||
INDEX = u'http://www.borba.rs/'
|
||||
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : lang
|
||||
, 'pretty_print' : True
|
||||
}
|
||||
|
||||
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
|
||||
|
||||
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['object','link','iframe','base','img'])
|
||||
,dict(name='div',attrs={'id':'written_comments_title'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
|
||||
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
|
||||
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
|
||||
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
|
||||
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
|
||||
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
|
||||
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
|
||||
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
|
||||
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
attribs = [ 'style','font','valign'
|
||||
,'colspan','width','height'
|
||||
,'rowspan','summary','align'
|
||||
,'cellspacing','cellpadding'
|
||||
,'frames','rules','border'
|
||||
]
|
||||
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
|
||||
item.name = 'div'
|
||||
for attrib in attribs:
|
||||
if item.has_key(attrib):
|
||||
del item[attrib]
|
||||
return soup
|
||||
|
||||
def parse_index(self):
|
||||
totalfeeds = []
|
||||
lfeeds = self.get_feeds()
|
||||
for feedobj in lfeeds:
|
||||
feedtitle, feedurl = feedobj
|
||||
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
|
||||
articles = []
|
||||
soup = self.index_to_soup(feedurl)
|
||||
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
|
||||
url = item['href']
|
||||
title = self.tag_to_string(item)
|
||||
articles.append({
|
||||
'title' :title
|
||||
,'date' :''
|
||||
,'url' :url
|
||||
,'description':''
|
||||
})
|
||||
totalfeeds.append((feedtitle, articles))
|
||||
return totalfeeds
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
@ -6,45 +7,76 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following three lines for the Calgary Herald
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
## title = u'Victoria Times Colonist'
|
||||
## url_prefix = 'http://www.timescolonist.com'
|
||||
## description = u'News from Victoria, BC'
|
||||
## fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
## title = u'Edmonton Journal'
|
||||
## url_prefix = 'http://www.edmontonjournal.com'
|
||||
## description = u'News from Edmonton, AB'
|
||||
## fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
title = u'Calgary Herald'
|
||||
url_prefix = 'http://www.calgaryherald.com'
|
||||
description = u'News from Calgary, AB'
|
||||
fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following three lines for the Regina Leader-Post
|
||||
#title = u'Regina Leader-Post'
|
||||
#url_prefix = 'http://www.leaderpost.com'
|
||||
#description = u'News from Regina, SK'
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following three lines for the Saskatoon Star-Phoenix
|
||||
#title = u'Saskatoon Star-Phoenix'
|
||||
#url_prefix = 'http://www.thestarphoenix.com'
|
||||
#description = u'News from Saskatoon, SK'
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following three lines for the Windsor Star
|
||||
#title = u'Windsor Star'
|
||||
#url_prefix = 'http://www.windsorstar.com'
|
||||
#description = u'News from Windsor, ON'
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following three lines for the Ottawa Citizen
|
||||
#title = u'Ottawa Citizen'
|
||||
#url_prefix = 'http://www.ottawacitizen.com'
|
||||
#description = u'News from Ottawa, ON'
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
## title = u'Ottawa Citizen'
|
||||
## url_prefix = 'http://www.ottawacitizen.com'
|
||||
## description = u'News from Ottawa, ON'
|
||||
## fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following three lines for the Montreal Gazette
|
||||
#title = u'Montreal Gazette'
|
||||
#url_prefix = 'http://www.montrealgazette.com'
|
||||
#description = u'News from Montreal, QC'
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
## title = u'Montreal Gazette'
|
||||
## url_prefix = 'http://www.montrealgazette.com'
|
||||
## description = u'News from Montreal, QC'
|
||||
## fp_tag = 'CAN_MG'
|
||||
|
||||
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nick Redding'
|
||||
encoding = 'latin1'
|
||||
no_stylesheets = True
|
||||
timefmt = ' [%b %d]'
|
||||
extra_css = '''
|
||||
@ -64,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
#delete iempty id attributes--they screw up the TOC for unknow reasons
|
||||
divtags = soup.findAll('div',attrs={'id':''})
|
||||
if divtags:
|
||||
for div in divtags:
|
||||
del(div['id'])
|
||||
def get_cover_url(self):
    """Return the URL of a recent front-page image for ``self.fp_tag``.

    The image path on webmedia.newseum.org is keyed by day of month and
    ``self.fp_tag``. Today's image is tried first, then each of the six
    previous days; the first URL that opens without error is returned.

    Returns None when ``self.fp_tag`` is empty, or (after logging
    "Cover unavailable") when none of the seven candidates can be opened.
    """
    from datetime import timedelta, date
    if self.fp_tag == '':
        return None
    # Read the date once so all seven candidates are relative to the same day.
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Any fetch failure means "try the previous day". Unlike a bare
            # except, KeyboardInterrupt/SystemExit still propagate.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
|
||||
|
||||
def fixChars(self,string):
    """Return ``string`` with the \\x91-\\x97 punctuation characters replaced.

    Each (pattern, replacement) pair below is applied in order with
    ``re.sub``; the result of the last substitution is returned.
    """
    substitutions = (
        ("\x91", "‘"),  # lsquo
        ("\x92", "’"),  # rsquo
        ("\x93", "“"),  # ldquo
        ("\x94", "”"),  # rdquo
        ("\x96", "–"),  # ndash
        ("\x97", "—"),  # mdash
        # NOTE(review): pattern and replacement are identical here, so this
        # pair changes nothing as written -- possibly a character reference
        # was lost from the pattern; confirm against the upstream recipe.
        ("’", "’"),
    )
    fixed = string
    for pattern, replacement in substitutions:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
|
||||
|
||||
def massageNCXText(self, description):
    """Prepare a description for the TOC; falsy input is returned unchanged.

    A non-empty description is passed through BeautifulStoneSoup with HTML
    entity conversion, converted to unicode, and then cleaned by
    ``self.fixChars``.
    """
    # Kindle TOC descriptions won't render certain characters
    if not description:
        return description
    converted = BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    massaged = unicode(converted)
    # Replace '&' with '&'
    # NOTE(review): pattern and replacement are identical as written, so
    # this substitution leaves the text unchanged -- confirm intent.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and, if missing, a summary to ``article``.

    When ``first`` is true, the first <img> under <body> (if any) is passed
    to ``self.add_toc_thumbnail`` after a leading ``links\\linkN\\`` path
    fragment is removed from its ``src``.

    When the article's ``text_summary`` is blank, the ``content`` of the
    ``og:description`` <meta> tag (if present) is stored in both
    ``article.summary`` and ``article.text_summary``.
    """
    if first:
        first_image = soup.find('body').find('img')
        if first_image is not None:
            thumb_src = re.sub(r'links\\link\d+\\','',first_image['src'])
            self.add_toc_thumbnail(article, thumb_src)
    if article.text_summary.strip():
        # A non-blank summary already exists; leave it alone.
        return
    og_description = soup.find('meta',attrs={'property':'og:description'})
    if og_description is None:
        return
    article.summary = article.text_summary = og_description['content']
|
||||
|
||||
def strip_anchors(self,soup):
    """Replace every <a> that holds no <img> with its rendered contents.

    Anchors wrapping an image are left in place. The rendered contents are
    decoded as cp1252 (undecodable bytes replaced) before substitution.
    Returns the same ``soup`` object.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                continue
            inner_text = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(inner_text)
    return soup
|
||||
|
||||
def preprocess_html(self, soup):
    """Return ``soup`` after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
|
||||
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
@ -98,9 +196,7 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
atag = h1tag.find('a',href=True)
|
||||
if not atag:
|
||||
continue
|
||||
url = atag['href']
|
||||
if not url.startswith('http:'):
|
||||
url = self.url_prefix+'/news/todays-paper/'+atag['href']
|
||||
url = self.url_prefix+'/news/todays-paper/'+atag['href']
|
||||
#self.log("Section %s" % key)
|
||||
#self.log("url %s" % url)
|
||||
title = self.tag_to_string(atag,False)
|
||||
|
11
recipes/catholic_daily_readings.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1328971305(BasicNewsRecipe):
    """Feed-only recipe collecting several Catholic daily-reading feeds."""

    # Identification
    title = u'Catholic Daily Readings'
    __author__ = 'adoucette'
    language = 'en'

    # Fetch limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # (feed title, feed URL) pairs, in display order
    feeds = [
        (u'Daily Readings - USCCB', u'http://www.usccb.org/bible/readings/rss/'),
        (u'Daily Reflection - One Bread One Body', u'http://www.presentationministries.com/general/rss.asp'),
        (u'Mass Readings - Universalis', u'http://www.universalis.com/atommass3.xml'),
        (u'Saint Of The Day - CNA', u'http://feeds.feedburner.com/catholicnewsagency/saintoftheday'),
    ]
|
@ -1,16 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class CD_Action(BasicNewsRecipe):
|
||||
title = u'CD-Action'
|
||||
__author__ = 'fenuks'
|
||||
description = 'cdaction.pl - polish magazine about games site'
|
||||
description = 'cdaction.pl - polish games magazine site'
|
||||
category = 'games'
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG'
|
||||
keep_only_tags= dict(id='news_content')
|
||||
remove_tags_after= dict(name='div', attrs={'class':'tresc'})
|
||||
feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
|
||||
|
||||
|
||||
def get_cover_url(self):
    """Return the cover URL scraped from the cdaction.pl magazine page.

    The page ``http://www.cdaction.pl/magazyn/`` is fetched and the ``href``
    of the first link inside the first <div> of the element with id
    ``wspolnik`` is appended to the site root and stored in
    ``self.cover_url``.

    If any step of that lookup finds nothing (element, div, link or href
    missing), ``self.cover_url`` is left as it was and returned, instead of
    raising on a ``None`` lookup.
    """
    soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
    container = soup.find(id='wspolnik')
    # Use explicit None checks at each level rather than truthiness.
    div = container.div if container is not None else None
    link = div.a if div is not None else None
    href = link.get('href') if link is not None else None
    if href:
        self.cover_url = 'http://www.cdaction.pl' + href
    return self.cover_url
|
@ -1,10 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class CGM(BasicNewsRecipe):
|
||||
title = u'CGM'
|
||||
oldest_article = 7
|
||||
__author__ = 'fenuks'
|
||||
description = u'Codzienna Gazeta Muzyczna'
|
||||
masthead_url='http://www.cgm.pl/img/header/logo.gif'
|
||||
cover_url = 'http://www.krafcy.com/foto/tinymce/Image/cgm%281%29.jpg'
|
||||
category = 'music'
|
||||
language = 'pl'
|
||||
@ -16,28 +18,28 @@ class CGM(BasicNewsRecipe):
|
||||
remove_tags_before=dict(id='mainContent')
|
||||
remove_tags_after=dict(name='div', attrs={'class':'fbContainer'})
|
||||
remove_tags=[dict(name='div', attrs={'class':'fbContainer'}),
|
||||
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
|
||||
dict(id=['movieShare', 'container'])]
|
||||
dict(name='p', attrs={'class':['tagCloud', 'galleryAuthor']}),
|
||||
dict(id=['movieShare', 'container'])]
|
||||
feeds = [(u'Informacje', u'http://www.cgm.pl/rss.xml'), (u'Polecamy', u'http://www.cgm.pl/rss,4,news.xml'),
|
||||
(u'Recenzje', u'http://www.cgm.pl/rss,1,news.xml')]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery:
|
||||
img=gallery.div
|
||||
gallery.img.extract()
|
||||
if img:
|
||||
img=img['style']
|
||||
img='http://www.cgm.pl'+img[img.find('url(')+4:img.find(')')]
|
||||
gallery.contents[1].name='img'
|
||||
gallery.contents[1]['src']=img
|
||||
pos = len(gallery.contents)
|
||||
gallery.insert(pos, BeautifulSoup('<br />'))
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
ad=soup.findAll('a')
|
||||
for r in ad:
|
||||
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
|
||||
if 'www.hustla.pl' in r['href'] or 'www.ebilet.pl' in r['href']:
|
||||
r.extract()
|
||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery:
|
||||
img=gallery.find('embed')
|
||||
if img:
|
||||
img=img['src'][35:]
|
||||
img='http://www.cgm.pl/_vault/_gallery/_photo/'+img
|
||||
param=gallery.findAll(name='param')
|
||||
for i in param:
|
||||
i.extract()
|
||||
gallery.contents[1].name='img'
|
||||
gallery.contents[1]['src']=img
|
||||
return soup
|
@ -3,6 +3,7 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import urllib, re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ChicagoTribune(BasicNewsRecipe):
|
||||
@ -77,8 +78,25 @@ class ChicagoTribune(BasicNewsRecipe):
|
||||
|
||||
|
||||
def get_article_url(self, article):
    """Return the URL to download for a feed ``article``, or None.

    The link embedded in the summary's ``bookmark.cfm ... link=`` href is
    preferred (URL-unquoted). If the summary is missing or has no such
    href, the first available of the entry's ``feedburner_origlink``,
    ``guid`` and ``link`` values is used. A ``?track=rss`` marker is removed
    from whichever URL was chosen.
    """
    ans = None
    try:
        s = article.summary
        ans = urllib.unquote(
            re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
    except Exception:
        # Best effort: no summary or no bookmark link just means we fall
        # back to the feed-provided URLs below.
        pass
    if ans is None:
        ans = article.get('feedburner_origlink', article.get('guid', article.get('link')))
    if ans is not None:
        return ans.replace('?track=rss', '')
    return None
|
||||
|
||||
def skip_ad_pages(self, soup):
    """Follow a "click here to continue to article" link if one is present.

    Looks for that exact text in ``soup``; when found and its parent has an
    ``href``, the raw content of that URL is fetched with
    ``self.index_to_soup(..., raw=True)`` and returned. In every other case
    the method returns None.
    """
    marker = soup.find(text='click here to continue to article')
    if not marker:
        return
    target = marker.parent.get('href')
    if not target:
        return
    return self.index_to_soup(target, raw=True)
|
||||
|
||||
def postprocess_html(self, soup, first_fetch):
|
||||
# Remove the navigation bar. It was kept until now to be able to follow
|
||||
|
@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe):
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
requires_version = (0, 8, 39)
|
||||
|
||||
def preprocess_raw_html(self, raw, url):
    """Re-parse ``raw`` with html5lib and return cleaned, serialized markup.

    script, style, noscript, meta, link and object elements and all comment
    nodes are removed from the parsed tree. The tree is serialized to
    unicode and everything before the first ``<html`` is dropped.

    Any exception is printed with ``traceback.print_exc()`` and re-raised.
    """
    try:
        from html5lib import parse
        document = parse(raw, namespaceHTMLElements=False,
                         treebuilder='lxml')
        root = document.getroot()
        from lxml import etree
        unwanted = '//script|//style|//noscript|//meta|//link|//object'
        for node in root.xpath(unwanted):
            node.getparent().remove(node)
        # Materialize the iterator first: nodes are removed while walking.
        comments = list(root.iterdescendants(tag=etree.Comment))
        for comment in comments:
            comment.getparent().remove(comment)
        serialized = etree.tostring(root, encoding=unicode)
        return re.sub('.*<html', '<html', serialized, flags=re.DOTALL)
    except:
        import traceback
        traceback.print_exc()
        raise
|
||||
|
||||
def index_to_soup(self, url):
    """Fetch ``url``, clean it with ``preprocess_raw_html``, return the soup.

    The page is downloaded raw through the base-class ``index_to_soup``,
    decoded as UTF-8, cleaned, and the cleaned markup is handed back to the
    base-class ``index_to_soup`` to build the result.
    """
    fetched = BasicNewsRecipe.index_to_soup(self, url, raw=True)
    cleaned = self.preprocess_raw_html(fetched.decode('utf-8'), url)
    return BasicNewsRecipe.index_to_soup(self, cleaned)
|
||||
|
||||
def append_page(self, soup, appendtag, position):
|
||||
nav = soup.find('div',attrs={'class':'navigation'})
|
||||
@ -78,14 +104,6 @@ class ChristianScienceMonitor(BasicNewsRecipe):
|
||||
print_soup = soup
|
||||
return print_soup
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
(r'<!--.*?-->', lambda match : ''),
|
||||
(r'<body.*?<div id="story"', lambda match : '<body><div id="story"'),
|
||||
(r'<div class="pubdate">.*?</div>', lambda m: ''),
|
||||
(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
|
||||
lambda match : '</body>'),
|
||||
]]
|
||||
extra_css = '''
|
||||
h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
|
||||
.sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}
|
||||
|
48
recipes/ciekawostki_historyczne.recipe
Normal file
@ -0,0 +1,48 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
class Ciekawostki_Historyczne(BasicNewsRecipe):
|
||||
title = u'Ciekawostki Historyczne'
|
||||
oldest_article = 7
|
||||
__author__ = 'fenuks'
|
||||
description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
|
||||
category = 'history'
|
||||
language = 'pl'
|
||||
masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
|
||||
cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
|
||||
max_articles_per_feed = 100
|
||||
preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''), (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
|
||||
no_stylesheets=True
|
||||
remove_empty_feeds=True
|
||||
keep_only_tags=[dict(name='div', attrs={'class':'post'})]
|
||||
remove_tags=[dict(id='singlepostinfo')]
|
||||
feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag=soup.find(name='h7')
|
||||
if tag:
|
||||
if tag.br:
|
||||
pass
|
||||
elif tag.nextSibling.name=='p':
|
||||
tag=tag.nextSibling
|
||||
nexturl = tag.findAll('a')
|
||||
for nextpage in nexturl:
|
||||
tag.extract()
|
||||
nextpage= nextpage['href']
|
||||
soup2 = self.index_to_soup(nextpage)
|
||||
pagetext = soup2.find(name='div', attrs={'class':'post'})
|
||||
for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
|
||||
r.extract()
|
||||
for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
|
||||
r.extract()
|
||||
for r in pagetext.findAll('h1'):
|
||||
r.extract()
|
||||
pagetext.find('h6').nextSibling.extract()
|
||||
pagetext.find('h7').nextSibling.extract()
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
|
||||
|
@ -1,38 +1,89 @@
|
||||
#!/usr/bin/env python
|
||||
##
|
||||
## Title: Common Dreams
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
|
||||
# Feb 2012: Cleaned up the output to have only the main article
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
'''
|
||||
commondreams.org
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class CommonDreams(BasicNewsRecipe):
|
||||
# Identify the recipe
|
||||
|
||||
title = u'Common Dreams'
|
||||
description = u'Progressive news and views'
|
||||
description = u'Breaking News & Views for the Progressive Community.'
|
||||
cover_url = 'https://s3.amazonaws.com/s3.commondreams.org/images/common-dreams.png'
|
||||
__author__ = u'XanthanGum'
|
||||
language = 'en'
|
||||
|
||||
# Format the text
|
||||
|
||||
extra_css = '''
|
||||
body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
|
||||
h1{font-size: xx-large;}
|
||||
h2{font-size: large;}
|
||||
'''
|
||||
|
||||
# Pick no article older than seven days and limit the number of articles per feed to 100
|
||||
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
|
||||
# Remove everything before the article
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
|
||||
remove_tags_before = dict(name = 'div', attrs = {'id':'node-header'})
|
||||
# Flattens all the tables to make it compatible with Nook
|
||||
conversion_options = {'linearize_tables' : True}
|
||||
|
||||
# Remove everything after the article
|
||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||
|
||||
# Specify extra CSS - overrides ALL other CSS (i.e. it is added last).
|
||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||
.introduction, .first { font-weight: bold; } \
|
||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||
.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
|
||||
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||
font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||
.story-date, .published { font-size: 80%; } \
|
||||
table { width: 100%; } \
|
||||
td img { display: block; margin: 5px auto; } \
|
||||
ul { padding-top: 10px; } \
|
||||
ol { padding-top: 10px; } \
|
||||
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||
h1 { font-size: 175%; font-weight: bold; } \
|
||||
h2 { font-size: 150%; font-weight: bold; } \
|
||||
h3 { font-size: 125%; font-weight: bold; } \
|
||||
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||
|
||||
# Remove the line breaks and float left/right and picture width/height.
|
||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'float:.*?'), lambda m: ''),
|
||||
(re.compile(r'width:.*?px'), lambda m: ''),
|
||||
(re.compile(r'height:.*?px'), lambda m: ''),
|
||||
(re.compile(r'<a.*?>'), lambda m: ''),
|
||||
(re.compile(r'</a>'), lambda m: ''),
|
||||
]
|
||||
|
||||
|
||||
# Main article is inside this tag
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':lambda x: x and 'node-' in x}),
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':'node-links clear-block'}), # remove Share options
|
||||
]
|
||||
|
||||
remove_tags_after = dict(name = 'div', attrs = {'class':'copyright-info'})
|
||||
|
||||
# Identify the news feeds
|
||||
|
||||
feeds = [(u'Headlines', u'http://www.commondreams.org/feed/headlines_rss'),
|
||||
(u'Further News Articles', u'http://www.commondreams.org/feed/further_rss'),
|
||||
(u'Views', u'http://www.commondreams.org/feed/views_rss'),
|
||||
(u'Progressive Newswire', u'http://www.commondreams.org/feed/newswire_rss')]
|
||||
feeds = [(u'Headlines', u'https://www.commondreams.org/feed/headlines_rss'),
|
||||
(u'Further News Articles', u'https://www.commondreams.org/feed/further_rss'),
|
||||
(u'Views', u'https://www.commondreams.org/feed/views_rss'),
|
||||
(u'Progressive Newswire', u'https://www.commondreams.org/feed/newswire_rss')]
|
||||
|
||||
|
||||
def print_version(self, url):
    """Return the printer-friendly address of *url* (``?print`` appended)."""
    return url + '?print'
|
@ -7,10 +7,11 @@ class Computerworld_pl(BasicNewsRecipe):
|
||||
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
|
||||
no_stylesheets=True
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
|
||||
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
|
||||
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
|
||||
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
|
||||
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
|
||||
|
71
recipes/consortium_news.recipe
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python
|
||||
##
|
||||
## Title: Consortium News
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
|
||||
# Feb 2012: Initial release
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
'''
|
||||
consortiumnews.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ConsortiumNews(BasicNewsRecipe):
    """Recipe for consortiumnews.com, fed from the site's FeedBurner feed.

    Only the ``div`` whose id contains ``post-`` is kept; links, line breaks
    and float/size styling are stripped from the raw HTML before parsing.
    """

    title = u'Consortium News'
    publisher = 'Copyright © 2012 Consortiumnews. All Rights Reserved.'
    language = 'en'
    __author__ = 'kiavash'

    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True

    # Flattens all the tables to make it compatible with Nook
    conversion_options = {'linearize_tables': True}

    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']

    # Specify extra CSS - overrides ALL other CSS (i.e. it is added last).
    extra_css = (
        'body { font-family: verdana, helvetica, sans-serif; } '
        '.introduction, .first { font-weight: bold; } '
        '.cross-head { font-weight: bold; font-size: 125%; } '
        '.cap, .caption { display: block; font-size: 80%; font-style: italic; } '
        '.cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } '
        '.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, '
        '.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; '
        'font-size: 80%; font-style: italic; margin: 1px auto; } '
        '.story-date, .published { font-size: 80%; } '
        'table { width: 100%; } '
        'td img { display: block; margin: 5px auto; } '
        'ul { padding-top: 10px; } '
        'ol { padding-top: 10px; } '
        'li { padding-top: 5px; padding-bottom: 5px; } '
        'h1 { font-size: 175%; font-weight: bold; } '
        'h2 { font-size: 150%; font-weight: bold; } '
        'h3 { font-size: 125%; font-weight: bold; } '
        'h4, h5, h6 { font-size: 100%; font-weight: bold; }'
    )

    # Remove the line breaks and float left/right and picture width/height.
    preprocess_regexps = [
        (re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
        # [^>]* keeps the match inside one tag; a greedy ".*" could run on to
        # the last "/>" of the line and delete real content.
        (re.compile(r'<br[ ]*clear[^>]*/>', re.IGNORECASE), lambda m: ''),
        # A lazy ".*?" at the end of a pattern matches nothing, so only the
        # word "float:" was removed and its value was left behind.
        (re.compile(r'float:\s*(?:left|right|none)\s*;?'), lambda m: ''),
        (re.compile(r'width:.*?px'), lambda m: ''),
        (re.compile(r'height:.*?px'), lambda m: ''),
        # \b restricts this to anchors; "<a.*?>" also removed the opening
        # tag of <abbr>, <address>, <article>, ... but never their closers.
        (re.compile(r'<a\b[^>]*>'), lambda m: ''),
        (re.compile(r'</a>'), lambda m: ''),
    ]

    # Main article is inside this tag
    keep_only_tags = [dict(name='div', attrs={'id': lambda x: x and 'post-' in x})]

    remove_tags = [
        dict(name='div', attrs={'class': 'sociable'}),  # remove 'Share this Article'
        dict(name='p', attrs={'class': 'tags'}),  # remove 'Tags: ... '
    ]

    feeds = [(u'Consortium News', u'http://feeds.feedburner.com/Consortiumnewscom')]
|
25
recipes/countryfile.recipe
Normal file
@ -0,0 +1,25 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    # Countryfile Magazine website; article cleanup is left to auto_cleanup.
    # last updated 29/1/12
    title = u'Countryfile.com'
    description = 'The official website of Countryfile Magazine'
    __author__ = 'Dave Asbury'
    language = 'en_GB'
    cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'

    oldest_article = 30
    max_articles_per_feed = 25
    remove_empty_feeds = True

    no_stylesheets = True
    auto_cleanup = True
    #articles_are_obfuscated = True

    # No explicit tag removal at present (the 'player' class rule is disabled).
    remove_tags = []

    feeds = [
        (u'Homepage', u'http://www.countryfile.com/rss/home'),
        (u'Country News', u'http://www.countryfile.com/rss/news'),
        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
    ]
|
@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
description = 'News as provide by The Daily Mirror -UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 26/12/11
|
||||
# last updated 11/2/12
|
||||
language = 'en_GB'
|
||||
|
||||
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
@ -14,35 +14,58 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 20
|
||||
max_articles_per_feed = 5
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
#conversion_options = { 'linearize_tables' : True }
|
||||
|
||||
|
||||
#keep_only_tags = [
|
||||
# dict(name='h1'),
|
||||
# dict(name='div',attrs={'id' : 'body-content'}),
|
||||
#dict(name='div',atts={'class' : 'article-body'}),
|
||||
#dict(attrs={'class' : ['article-attr','byline append-1','published']}),
|
||||
#dict(name='p'),
|
||||
# ]
|
||||
|
||||
#remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='title'),
|
||||
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
|
||||
# dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
|
||||
#dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
|
||||
#dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
|
||||
]
|
||||
|
||||
# preprocess_regexps = [
|
||||
#(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
#preprocess_regexps = [
|
||||
#(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
feeds = [
|
||||
|
||||
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
|
||||
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
|
||||
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
|
||||
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
|
||||
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
|
||||
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
|
||||
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
|
||||
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
|
||||
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
|
||||
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
(u'UK News', u'http://feed43.com/0287771688643868.xml')
|
||||
,(u'Tech News', u'http://feed43.com/2455520588350501.xml')
|
||||
,(u'Weird World','http://feed43.com/0863800333634654.xml')
|
||||
,(u'Sport','http://feed43.com/7713243036546130.xml')
|
||||
,(u'Sport : Boxing ','http://feed43.com/0414732220804255.xml')
|
||||
,(u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml')
|
||||
,(u'Sport : Other','http://feed43.com/4501416886323415.xml')
|
||||
,(u'TV and Film','http://feed43.com/5238302853765104.xml')
|
||||
,(u'Celebs','http://feed43.com/8770061048844683.xml')
|
||||
,(u'Life Style : Family','http://feed43.com/4356170742410338.xml')
|
||||
,(u'Travel','http://feed43.com/1436576006476607.xml')
|
||||
|
||||
|
||||
|
||||
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
]
|
||||
|
21
recipes/desiring_god.recipe
Normal file
@ -0,0 +1,21 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
class DesiringGodEnglish(BasicNewsRecipe):
    # Desiring God blog, taken from its FeedBurner feed; article cleanup is
    # left to auto_cleanup.
    title = u'Desiring God'
    __author__ = 'Peter Grungi'
    language = 'en'  # previously assigned twice with the same value

    cover_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
    masthead_url = 'http://cdn0.desiringgod.org/images/layout/breadcrumbs_dg_mark.png'
    oldest_article = 7
    max_articles_per_feed = 50
    auto_cleanup = True
    publisher = 'Desiring God Ministries'
    author = 'Desiring God Ministries'

    feeds = [(u'Desiring God Blog', u'http://feeds.feedburner.com/DGBlog?format=xml')]
|
@ -7,6 +7,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
||||
__licence__ ='GPL v3'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
|
||||
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
|
||||
description = u'Aktualności i blogi z dobreprogramy.pl'
|
||||
encoding = 'utf-8'
|
||||
@ -16,7 +17,8 @@ class Dobreprogramy_pl(BasicNewsRecipe):
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
|
||||
remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['mainBar', 'newsContent', 'postTitle title', 'postInfo', 'contentText', 'content']})]
|
||||
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
|
||||
remove_tags = [dict(name='div', attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']})]
|
||||
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
|
||||
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
|
||||
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
|
||||
|
@ -8,15 +8,17 @@ class Dziennik_pl(BasicNewsRecipe):
|
||||
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
|
||||
category = 'newspaper'
|
||||
language = 'pl'
|
||||
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
|
||||
masthead_url= 'http://5.s.dziennik.pl/images/logos.png'
|
||||
cover_url= 'http://5.s.dziennik.pl/images/logos.png'
|
||||
no_stylesheets = True
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript=True
|
||||
remove_empty_feeds=True
|
||||
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
|
||||
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
|
||||
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: ''), (re.compile('<p><strong><a href=".*?">>>> CZYTAJ TAKŻE: ".*?"</a></strong></p>'), lambda m: '')]
|
||||
keep_only_tags=[dict(id='article')]
|
||||
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
|
||||
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget', 'belka-spol', 'belka-spol belka-spol-bottom', 'art_data_tags', 'cl_right', 'boxRounded gal_inside']}), dict(name='a', attrs={'class':['komentarz', 'article_icon_addcommnent']})]
|
||||
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
|
||||
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
|
||||
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
|
||||
@ -30,6 +32,12 @@ class Dziennik_pl(BasicNewsRecipe):
|
||||
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
|
||||
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
|
||||
|
||||
def skip_ad_pages(self, soup):
    """Follow the 'CZYTAJ DALEJ' link when the page carries one.

    Returns the raw content fetched from the link target, or ``None`` when
    the page has no such link.
    """
    link = soup.find(name='a', attrs={'title': 'CZYTAJ DALEJ'})
    if not link:
        return None
    return self.index_to_soup(link['href'], raw=True)
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
tag=soup.find('a', attrs={'class':'page_next'})
|
||||
if tag:
|
||||
@ -56,3 +64,4 @@ class Dziennik_pl(BasicNewsRecipe):
|
||||
def preprocess_html(self, soup):
    """Append the article's remaining pages to its body, then return *soup*."""
    target = soup.body
    self.append_page(soup, target)
    return soup
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
@ -6,45 +7,72 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following three lines for the Edmonton Journal
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
## title = u'Victoria Times Colonist'
|
||||
## url_prefix = 'http://www.timescolonist.com'
|
||||
## description = u'News from Victoria, BC'
|
||||
## fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
title = u'Edmonton Journal'
|
||||
url_prefix = 'http://www.edmontonjournal.com'
|
||||
description = u'News from Edmonton, AB'
|
||||
fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following three lines for the Calgary Herald
|
||||
#title = u'Calgary Herald'
|
||||
#url_prefix = 'http://www.calgaryherald.com'
|
||||
#description = u'News from Calgary, AB'
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
## title = u'Calgary Herald'
|
||||
## url_prefix = 'http://www.calgaryherald.com'
|
||||
## description = u'News from Calgary, AB'
|
||||
## fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following three lines for the Regina Leader-Post
|
||||
#title = u'Regina Leader-Post'
|
||||
#url_prefix = 'http://www.leaderpost.com'
|
||||
#description = u'News from Regina, SK'
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following three lines for the Saskatoon Star-Phoenix
|
||||
#title = u'Saskatoon Star-Phoenix'
|
||||
#url_prefix = 'http://www.thestarphoenix.com'
|
||||
#description = u'News from Saskatoon, SK'
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following three lines for the Windsor Star
|
||||
#title = u'Windsor Star'
|
||||
#url_prefix = 'http://www.windsorstar.com'
|
||||
#description = u'News from Windsor, ON'
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following three lines for the Ottawa Citizen
|
||||
#title = u'Ottawa Citizen'
|
||||
#url_prefix = 'http://www.ottawacitizen.com'
|
||||
#description = u'News from Ottawa, ON'
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
## title = u'Ottawa Citizen'
|
||||
## url_prefix = 'http://www.ottawacitizen.com'
|
||||
## description = u'News from Ottawa, ON'
|
||||
## fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following three lines for the Montreal Gazette
|
||||
#title = u'Montreal Gazette'
|
||||
#url_prefix = 'http://www.montrealgazette.com'
|
||||
#description = u'News from Montreal, QC'
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
## title = u'Montreal Gazette'
|
||||
## url_prefix = 'http://www.montrealgazette.com'
|
||||
## description = u'News from Montreal, QC'
|
||||
## fp_tag = 'CAN_MG'
|
||||
|
||||
|
||||
language = 'en_CA'
|
||||
@ -68,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
|
||||
def preprocess_html(self, soup):
    """Remove empty ``id`` attributes from ``div`` elements and return *soup*.

    Empty ids are deleted because they break the generated TOC (the cause
    is unknown).
    """
    # Iterating an empty result is a no-op, so no separate emptiness check.
    for div in soup.findAll('div', attrs={'id': ''}):
        del div['id']
    # The preprocess hook has to hand the (modified) soup back to the caller.
    return soup
|
||||
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    The image is looked up on webmedia.newseum.org under the day-of-month
    folder for today and, failing that, for each of the six previous days.
    Returns ``None`` when ``self.fp_tag`` is empty or no image could be
    opened for any of those seven days.
    """
    from datetime import timedelta, date
    if self.fp_tag == '':
        return None

    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Not published for that day (or unreachable): try the day
            # before.  Unlike a bare ``except:`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
|
||||
|
||||
def fixChars(self, string):
    """Return *string* with C1 punctuation codes replaced by real punctuation.

    Substitutions are applied one after another, in the listed order.
    """
    substitutions = (
        ("\x91", "‘"),  # lsquo
        ("\x92", "’"),  # rsquo
        ("\x93", "“"),  # ldquo
        ("\x94", "”"),  # rdquo
        ("\x96", "–"),  # ndash
        ("\x97", "—"),  # mdash
        # NOTE(review): pattern and replacement are identical here, so this
        # pair changes nothing as written - confirm what it was meant to map.
        ("’", "’"),
    )
    fixed = string
    for pattern, replacement in substitutions:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
|
||||
|
||||
def massageNCXText(self, description):
    """Return *description* made safe for the Kindle table of contents.

    Kindle TOC descriptions won't render certain characters, so entities
    are first converted to characters, every ampersand is then written as
    a numeric character reference, and the result is passed through
    ``self.fixChars``.  A falsy *description* is returned unchanged.
    """
    if not description:
        return description

    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # Replace '&' with its numeric character reference.  The previous
    # substitution replaced "&" with "&" and therefore did nothing.
    # NOTE(review): the exact escaped form originally intended is not
    # recoverable from this file - confirm "&#38;" is the wanted spelling.
    massaged = re.sub("&", "&#38;", massaged)
    return self.fixChars(massaged)
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Fill in the TOC thumbnail and a fallback summary for *article*.

    When *first* is true, the first ``img`` inside the document body (if
    any) is registered through ``self.add_toc_thumbnail`` after a leading
    ``links\\linkN\\`` path fragment is removed from its ``src``.  When the
    article's text summary is blank, the page's ``og:description`` meta
    content (if present) is used as the summary.
    """
    if first:
        body = soup.find('body')
        # A page without <body> simply gets no thumbnail.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))

    if not article.text_summary.strip():
        desc = soup.find('meta', attrs={'property': 'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
|
||||
|
||||
def strip_anchors(self, soup):
    """Replace every link that contains no image with its own contents.

    Anchors wrapping an ``img`` are left alone.  Returns *soup*, which is
    modified in place.
    """
    # One pass over the anchors is enough; scanning every tag for its
    # descendant anchors visited each link once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 -
            # confirm that matches the encoding renderContents() produces.
            a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
    return soup
|
||||
|
||||
def preprocess_html(self, soup):
    """Return *soup* after its image-less links have been flattened."""
    cleaned = self.strip_anchors(soup)
    return cleaned
|
||||
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
|
@ -1,4 +1,5 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Elektroda(BasicNewsRecipe):
|
||||
title = u'Elektroda'
|
||||
@ -13,3 +14,18 @@ class Elektroda(BasicNewsRecipe):
|
||||
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
|
||||
remove_tags=[dict(name='a', attrs={'href':'#top'})]
|
||||
feeds = [(u'Elektroda', u'http://www.elektroda.pl/rtvforum/rss.php')]
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
    """Append a line break to the first ``span.postbody`` and return *soup*."""
    post = soup.find('span', attrs={'class': 'postbody'})
    if not post:
        return soup
    post.insert(len(post.contents), BeautifulSoup('<br />'))
    return soup
|
||||
|
||||
def parse_feeds(self):
    """Parse the feeds and drop the prefix that precedes "::" in each title.

    Everything up to and including the "::" separator and the single
    character after it is removed.  Titles without the separator are left
    untouched.  Returns the list of parsed feeds.
    """
    feeds = BasicNewsRecipe.parse_feeds(self)
    for feed in feeds:
        for article in feed.articles:
            cut = article.title.find("::")
            # find() returns -1 when the separator is absent; slicing from
            # -1 + 3 would then chop the first two characters of the title.
            if cut != -1:
                article.title = article.title[cut + 3:]
    return feeds
|
||||
|
@ -3,10 +3,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
title = u'FHM UK'
|
||||
description = 'Good News for Men'
|
||||
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
|
||||
cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
|
||||
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
|
||||
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 27/12/11
|
||||
# last updated 17/3/12
|
||||
language = 'en_GB'
|
||||
oldest_article = 28
|
||||
max_articles_per_feed = 12
|
||||
@ -22,9 +23,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
|
||||
|
||||
]
|
||||
|
||||
#remove_tags = [
|
||||
#dict(attrs={'class' : ['player']}),
|
||||
|
||||
#]
|
||||
feeds = [
|
||||
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
|
||||
(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
|
||||
(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
|
||||
]
|
||||
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
|
||||
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
|
||||
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
|
||||
#(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
|
||||
(u'Gaming',u'http://feed43.com/6537162612465672.xml'),
|
||||
]
|
||||
|
@ -10,9 +10,10 @@ class Filmweb_pl(BasicNewsRecipe):
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
extra_css = '.hdrBig {font-size:22px;}'
|
||||
remove_empty_feeds=True
|
||||
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
|
||||
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'})]
|
||||
keep_only_tags= [dict(name='h1', attrs={'class':'hdrBig'}), dict(name='div', attrs={'class':['newsInfo', 'reviewContent fontSizeCont description']})]
|
||||
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
|
||||
feeds = [(u'Wszystkie newsy', u'http://www.filmweb.pl/feed/news/latest'),
|
||||
(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
|
||||
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
|
||||
|
@ -3,10 +3,17 @@ import re
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||
''' there are three modifications:
|
||||
1) fetch issue cover
|
||||
2) toggle ignore premium articles
|
||||
3) extract proper section names, ie. "Comments", "Essay"
|
||||
|
||||
by Chen Wei weichen302@gmx.com, 2012-02-05'''
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'kwetal'
|
||||
language = 'en'
|
||||
version = 1
|
||||
version = 1.01
|
||||
|
||||
title = u'Foreign Affairs (Subcription or (free) Registration)'
|
||||
publisher = u'Council on Foreign Relations'
|
||||
@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
|
||||
INDEX = 'http://www.foreignaffairs.com'
|
||||
FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
|
||||
INCLUDE_PREMIUM = False
|
||||
|
||||
|
||||
remove_tags = []
|
||||
remove_tags.append(dict(name = 'base'))
|
||||
@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||
temp_files = []
|
||||
articles_are_obfuscated = True
|
||||
|
||||
def get_cover_url(self):
    """Return the absolute URL of the current issue's cover image.

    The cover is the ``img`` inside ``div.inthemag-issuebuy-cover`` on
    ``self.FRONTPAGE``; its ``src`` is appended to ``self.INDEX``.  Returns
    ``None`` when that element is not on the page, so a layout change costs
    only the cover rather than the whole download.
    """
    soup = self.index_to_soup(self.FRONTPAGE)
    div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
    img = div.find('img') if div is not None else None
    if img is None:
        return None
    return self.INDEX + img['src']
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
br = self.get_browser()
|
||||
br.open(url)
|
||||
@ -50,57 +66,46 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||
|
||||
return self.temp_files[-1].name
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
|
||||
articles = []
|
||||
answer = []
|
||||
content = soup.find('div', attrs = {'class': 'center-wrapper'})
|
||||
if content:
|
||||
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
|
||||
tag = div.find('div', attrs = {'class': 'views-field-title'})
|
||||
if tag:
|
||||
a = tag.find('a')
|
||||
if a:
|
||||
title = self.tag_to_string(a)
|
||||
url = self.INDEX + a['href']
|
||||
|
||||
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
|
||||
tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
|
||||
# If they ever fix their markup, this will break :-(
|
||||
summary = self.tag_to_string(tag.findNextSibling('p'))
|
||||
description = author + '<br/>' + summary
|
||||
|
||||
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
answer.append(('Magazine', articles))
|
||||
|
||||
ul = content.find('ul')
|
||||
if ul:
|
||||
soup = self.index_to_soup(self.FRONTPAGE)
|
||||
sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
|
||||
for sec in sec_start:
|
||||
content = sec.nextSibling
|
||||
if content:
|
||||
section = self.tag_to_string(content.find('h2'))
|
||||
articles = []
|
||||
for li in ul.findAll('li'):
|
||||
tag = li.find('div', attrs = {'class': 'views-field-title'})
|
||||
if tag:
|
||||
a = tag.find('a')
|
||||
if a:
|
||||
title = self.tag_to_string(a)
|
||||
url = self.INDEX + a['href']
|
||||
description = ''
|
||||
tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
|
||||
if tag:
|
||||
description = self.tag_to_string(tag)
|
||||
|
||||
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
|
||||
else:
|
||||
continue
|
||||
tags = []
|
||||
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
|
||||
tags.append(div)
|
||||
for li in content.findAll('li'):
|
||||
tags.append(li)
|
||||
|
||||
for div in tags:
|
||||
title = url = description = author = None
|
||||
|
||||
if self.INCLUDE_PREMIUM:
|
||||
found_premium = False
|
||||
else:
|
||||
continue
|
||||
|
||||
answer.append(('Letters to the Editor', articles))
|
||||
found_premium = div.findAll('span', attrs={'class':
|
||||
'premium-icon'})
|
||||
if not found_premium:
|
||||
tag = div.find('div', attrs={'class': 'views-field-title'})
|
||||
|
||||
if tag:
|
||||
a = tag.find('a')
|
||||
if a:
|
||||
title = self.tag_to_string(a)
|
||||
url = self.INDEX + a['href']
|
||||
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
|
||||
tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
|
||||
description = self.tag_to_string(tag_summary)
|
||||
articles.append({'title':title, 'date':None, 'url':url,
|
||||
'description':description, 'author':author})
|
||||
if articles:
|
||||
answer.append((section, articles))
|
||||
return answer
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
21
recipes/gameplay_pl.recipe
Normal file
@ -0,0 +1,21 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Gameplay_pl(BasicNewsRecipe):
    """Recipe for gameplay.pl, built from the site's RSS feed."""

    # Recipe metadata.
    title = u'Gameplay.pl'
    __author__ = 'fenuks'
    description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
    category = 'games, movies, books, music'
    language = 'pl'

    # The same logo image is used for the masthead and for the cover.
    masthead_url = 'http://gameplay.pl/img/gpy_top_logo.png'
    cover_url = 'http://gameplay.pl/img/gpy_top_logo.png'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100

    # Page clean-up rules.
    no_stylesheets = True
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': ['news_endpage_tit', 'news']}},
    ]
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['galeria', 'noedit center im']}},
    ]

    feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]

    def image_url_processor(self, baseurl, url):
        """Return an absolute image URL.

        A ``url`` containing the substring 'http' is returned unchanged.
        Any other value has its first two characters dropped and
        'http://gameplay.pl' prepended.  ``baseurl`` is not used.
        """
        if 'http' in url:
            return url
        return 'http://gameplay.pl' + url[2:]
|
@ -4,10 +4,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||
title = u'Gazeta Wyborcza'
|
||||
__author__ = 'fenuks'
|
||||
cover_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
|
||||
language = 'pl'
|
||||
description ='news from gazeta.pl'
|
||||
category='newspaper'
|
||||
publication_type = 'newspaper'
|
||||
masthead_url='http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'
|
||||
INDEX='http://wyborcza.pl'
|
||||
remove_empty_feeds= True
|
||||
oldest_article = 3
|
||||
@ -81,3 +82,10 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
|
||||
return url
|
||||
else:
|
||||
return url.replace('http://wyborcza.biz/biznes/1', 'http://wyborcza.biz/biznes/2029020')
|
||||
|
||||
def get_cover_url(self):
    """Return the URL of the current front-page image.

    The cover is looked up in two steps: the index page holds an element
    with id 'GWmini2' whose fourth child links to a second page, and the
    first <img> of that second page is the cover.  When any step of that
    lookup is missing, ``self.cover_url`` is left untouched and whatever
    value it already has (None if it has none) is returned instead of
    raising.
    """
    soup = self.index_to_soup('http://wyborcza.pl/0,76762,3751429.html')
    cover = soup.find(id='GWmini2')

    # Walk cover -> contents[3] -> <a href> without assuming any of the
    # intermediate nodes exist.
    anchor = None
    if cover is not None and len(cover.contents) > 3:
        anchor = getattr(cover.contents[3], 'a', None)
    href = anchor.get('href', None) if anchor is not None else None

    if href:
        soup = self.index_to_soup('http://wyborcza.pl/' + href)
        img = soup.img
        src = img.get('src', None) if img is not None else None
        if src:
            self.cover_url = 'http://wyborcza.pl' + src

    return getattr(self, 'cover_url', None)
|
||||
|
@ -9,12 +9,12 @@ class Gram_pl(BasicNewsRecipe):
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
extra_css = 'h2 {font-style: italic; font-size:20px;}'
|
||||
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
|
||||
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
|
||||
remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
|
||||
keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
|
||||
feeds = [(u'gram.pl - informacje', u'http://www.gram.pl/feed_news.asp'),
|
||||
(u'gram.pl - publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
|
||||
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
|
||||
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
|
||||
|
||||
def parse_feeds (self):
|
||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||
@ -23,3 +23,33 @@ class Gram_pl(BasicNewsRecipe):
|
||||
if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
|
||||
feed.articles.remove(article)
|
||||
return feeds
|
||||
|
||||
def append_page(self, soup, appendtag):
    """Append the follow-up pages of a multi-page article to ``appendtag``.

    While ``appendtag`` contains an <a class="cpn"> link, the linked page
    is fetched, its element with class 'main' is stripped of its first
    <h1>, first <h2> and all <script> tags, and appended to ``appendtag``.
    The element with id 'pgbox' is removed from ``appendtag`` before each
    append and once more at the end.  ``soup`` is not used.

    The loop stops early when a fetched page has no 'main' element or
    when the same link is encountered a second time.
    """
    visited = set()
    nexturl = appendtag.find('a', attrs={'class':'cpn'})
    while nexturl:
        href = nexturl['href']
        if href in visited:
            # Same link found again: stop instead of looping forever.
            break
        visited.add(href)

        soup2 = self.index_to_soup('http://www.gram.pl' + href)

        r = appendtag.find(id='pgbox')
        if r:
            r.extract()

        pagetext = soup2.find(attrs={'class':'main'})
        if pagetext is None:
            # Nothing to append from this page.
            break

        for heading in ('h1', 'h2'):
            r = pagetext.find(heading)
            if r:
                r.extract()
        for r in pagetext.findAll('script'):
            r.extract()

        appendtag.insert(len(appendtag.contents), pagetext)
        nexturl = appendtag.find('a', attrs={'class':'cpn'})

    r = appendtag.find(id='pgbox')
    if r:
        r.extract()
|
||||
|
||||
def preprocess_html(self, soup):
    """Merge follow-up pages into ``soup`` and float every 'picbox' div left.

    Returns the same ``soup`` object after modifying it in place.
    """
    self.append_page(soup, soup.body)
    picboxes = soup.findAll(name='div', attrs={'class':'picbox'})
    for box in picboxes:
        box['style'] = 'float: left;'
    return soup
|
@ -8,29 +8,31 @@ class Gry_online_pl(BasicNewsRecipe):
|
||||
language = 'pl'
|
||||
oldest_article = 13
|
||||
INDEX= 'http://www.gry-online.pl/'
|
||||
cover_url='http://www.gry-online.pl/img/1st_10/1st-gol-logo.png'
|
||||
masthead_url='http://www.gry-online.pl/im/gry-online-logo.png'
|
||||
cover_url='http://www.gry-online.pl/im/gry-online-logo.png'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets= True
|
||||
extra_css = 'p.wn1{font-size:22px;}'
|
||||
remove_tags_after= [dict(name='div', attrs={'class':['tresc-newsa']})]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['txthead']}), dict(name='p', attrs={'class':['wtx1', 'wn1', 'wob']}), dict(name='a', attrs={'class':['num_str_nex']})]
|
||||
#remove_tags= [dict(name='div', attrs={'class':['news_plat']})]
|
||||
keep_only_tags=[dict(name='div', attrs={'class':'gc660'})]
|
||||
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
|
||||
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
|
||||
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
nexturl = soup.find('a', attrs={'class':'num_str_nex'})
|
||||
if appendtag.find('a', attrs={'class':'num_str_nex'}) is not None:
|
||||
appendtag.find('a', attrs={'class':'num_str_nex'}).replaceWith('\n')
|
||||
if nexturl is not None:
|
||||
if 'strona' in nexturl.div.string:
|
||||
nexturl= self.INDEX + nexturl['href']
|
||||
soup2 = self.index_to_soup(nexturl)
|
||||
pagetext = soup2.findAll(name='p', attrs={'class':['wtx1', 'wn1', 'wob']})
|
||||
for tag in pagetext:
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, tag)
|
||||
self.append_page(soup2, appendtag)
|
||||
tag = appendtag.find('div', attrs={'class':'n5p'})
|
||||
if tag:
|
||||
nexturls=tag.findAll('a')
|
||||
for nexturl in nexturls[1:]:
|
||||
try:
|
||||
soup2 = self.index_to_soup('http://www.gry-online.pl/S020.asp'+ nexturl['href'])
|
||||
except:
|
||||
soup2 = self.index_to_soup('http://www.gry-online.pl/S022.asp'+ nexturl['href'])
|
||||
pagetext = soup2.find(attrs={'class':'gc660'})
|
||||
for r in pagetext.findAll(name='header'):
|
||||
r.extract()
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button']}):
|
||||
r.extract()
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
43
recipes/high_country_news.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
|
||||
|
||||
'''
|
||||
Fetch High Country News
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class HighCountryNews(BasicNewsRecipe):
    """Recipe for High Country News, built from its FeedBurner feeds."""

    title                 = u'High Country News'
    description           = u'News from the American West'
    __author__            = 'Armin Geller'  # 2012-01-31
    publisher             = 'High Country News'
    timefmt               = ' [%a, %d %b %Y]'
    language              = 'en'
    encoding              = 'UTF-8'
    publication_type      = 'newspaper'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    auto_cleanup          = True
    remove_javascript     = True
    use_embedded_content  = False
    masthead_url          = 'http://www.hcn.org/logo.jpg'  # 2012-01-31 AGe add
    cover_source          = 'http://www.hcn.org'  # 2012-01-31 AGe add

    feeds = [
        (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent'),
        (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue'),

        (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
        (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
    ]

    def get_cover_url(self):  # 2012-01-31 AGe add
        """Return the 'src' of the preview image on the ``cover_source`` page.

        The image is the first <img> inside the first <div> of the element
        whose class attribute matches the home-page class string below.
        Returns None when any of those nodes, or the 'src' attribute, is
        missing, instead of raising.
        """
        cover_source_soup = self.index_to_soup(self.cover_source)
        preview_image_div = cover_source_soup.find(attrs={'class':' portaltype-Plone Site content--hcn template-homepage_view'})
        # getattr with a default keeps the walk safe when a node is None.
        inner_div = getattr(preview_image_div, 'div', None)
        img = getattr(inner_div, 'img', None)
        if img is None:
            return None
        return img.get('src', None)

    def print_version(self, url):
        """Return the print-view URL: ``url`` with '/print_view' appended."""
        return url + '/print_view'
|
||||
|
BIN
recipes/icons/asianreviewofbooks.png
Normal file
After Width: | Height: | Size: 906 B |
Before Width: | Height: | Size: 413 B After Width: | Height: | Size: 1.5 KiB |
BIN
recipes/icons/ciekawostki_historyczne.png
Normal file
After Width: | Height: | Size: 994 B |
BIN
recipes/icons/gameplay_pl.png
Normal file
After Width: | Height: | Size: 991 B |
BIN
recipes/icons/in4_pl.png
Normal file
After Width: | Height: | Size: 357 B |
BIN
recipes/icons/informacje_usa.png
Normal file
After Width: | Height: | Size: 808 B |
BIN
recipes/icons/kresy_pl.png
Normal file
After Width: | Height: | Size: 4.0 KiB |
BIN
recipes/icons/mediapart.png
Normal file
After Width: | Height: | Size: 382 B |
Before Width: | Height: | Size: 712 B After Width: | Height: | Size: 712 B |
BIN
recipes/icons/oclab_pl.png
Normal file
After Width: | Height: | Size: 881 B |
BIN
recipes/icons/overclock_pl.png
Normal file
After Width: | Height: | Size: 817 B |
BIN
recipes/icons/palmtop_pl.png
Normal file
After Width: | Height: | Size: 366 B |
BIN
recipes/icons/pc_arena.png
Normal file
After Width: | Height: | Size: 1.1 KiB |
BIN
recipes/icons/pc_centre_pl.png
Normal file
After Width: | Height: | Size: 2.8 KiB |
BIN
recipes/icons/pc_foster.png
Normal file
After Width: | Height: | Size: 694 B |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 289 B |
BIN
recipes/icons/polska_times.png
Normal file
After Width: | Height: | Size: 322 B |
BIN
recipes/icons/pure_pc.png
Normal file
After Width: | Height: | Size: 386 B |
BIN
recipes/icons/racjonalista_pl.png
Normal file
After Width: | Height: | Size: 850 B |
BIN
recipes/icons/rue89.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
BIN
recipes/icons/samanyolu_haber.png
Normal file
After Width: | Height: | Size: 968 B |
BIN
recipes/icons/tanuki.png
Normal file
After Width: | Height: | Size: 1017 B |
BIN
recipes/icons/tvn24.png
Normal file
After Width: | Height: | Size: 5.1 KiB |
BIN
recipes/icons/webhosting_pl.png
Normal file
After Width: | Height: | Size: 1.4 KiB |
@ -4,7 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IDGse(BasicNewsRecipe):
|
||||
title = 'IDG'
|
||||
__author__ = 'zapt0'
|
||||
__author__ = 'Stanislav Khromov'
|
||||
language = 'sv'
|
||||
description = 'IDG.se'
|
||||
oldest_article = 1
|
||||
@ -15,6 +15,9 @@ class IDGse(BasicNewsRecipe):
|
||||
|
||||
feeds = [(u'Dagens IDG-nyheter',u'http://feeds.idg.se/idg/ETkj?format=xml')]
|
||||
|
||||
def get_article_url(self, article):
    """Return the feed entry's 'guid' value, or None when it has none."""
    guid = article.get('guid')
    return guid
|
||||
|
||||
def print_version(self, url):
    """Return ``url`` with the print-mode query string appended."""
    print_query = '?articleRenderMode=print&m=print'
    return url + print_query
|
||||
|
||||
|
110
recipes/ilmanifesto.recipe
Normal file
@ -0,0 +1,110 @@
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
MANIFESTO_BASEURL = 'http://www.ilmanifesto.it/'
|
||||
|
||||
class IlManifesto(BasicNewsRecipe):
    """Recipe for Il Manifesto: downloads the latest HTML edition.

    The edition index is discovered once per run (see
    ``_set_manifesto_index``) and cached on the instance in
    ``manifesto_index`` / ``manifesto_datestr``.
    """

    title = 'Il Manifesto'
    __author__ = 'Giacomo Lacava'
    description = 'quotidiano comunista - ultima edizione html disponibile'
    publication_type = 'newspaper'
    publisher = 'il manifesto coop. editrice a r.l.'
    language = 'it'

    oldest_article = 2
    max_articles_per_feed = 100
    delay = 1
    no_stylesheets = True
    simultaneous_downloads = 5
    timeout = 30
    auto_cleanup = True
    remove_tags = [dict(name='div', attrs={'class':'column_1 float_left'})]
    remove_tags_before = dict(name='div',attrs={'class':'column_2 float_right'})
    remove_tags_after = dict(id='myPrintArea')

    # Filled in lazily by _set_manifesto_index().
    manifesto_index = None
    manifesto_datestr = None

    def _set_manifesto_index(self):
        """Locate the latest edition and cache its URL and date string.

        Does nothing when ``manifesto_index`` is already set.  The date
        string is the last path component of the edition link, or the one
        before it when the link ends with '/'.
        """
        if self.manifesto_index is not None:
            return
        startUrl = MANIFESTO_BASEURL + 'area-abbonati/in-edicola/'
        startSoup = self.index_to_soup(startUrl)
        lastEdition = startSoup.findAll('div',id='accordion_inedicola')[1].find('a')['href']
        del startSoup
        self.manifesto_index = MANIFESTO_BASEURL + lastEdition
        urlsplit = lastEdition.split('/')
        self.manifesto_datestr = urlsplit[-1]
        if urlsplit[-1] == '':
            self.manifesto_datestr = urlsplit[-2]

    def get_cover_url(self):
        """Return the front-page image URL built from the edition date string."""
        self._set_manifesto_index()
        url = MANIFESTO_BASEURL + 'fileadmin/archivi/in_edicola/%sprimapagina.gif' % self.manifesto_datestr
        return url

    def parse_index(self):
        """Return ``[(feed name, [article dict, ...]), ...]`` for the edition.

        Every link inside the 'accordion_inedicola' div of the edition
        index is treated as a section; each section page is fetched and
        its 'strumenti1_inedicola' divs become articles.
        """
        self._set_manifesto_index()
        soup = self.index_to_soup(self.manifesto_index)
        feedLinks = soup.find('div',id='accordion_inedicola').findAll('a')
        result = []
        for feed in feedLinks:
            articles = []
            feedName = feed.find('h2').string
            feedUrl = MANIFESTO_BASEURL + feed['href']
            feedSoup = self.index_to_soup(feedUrl)
            indexRoot = feedSoup.find('div',attrs={'class':'column1'})
            # A section page without the expected container yields an
            # empty article list rather than an AttributeError.
            divs = []
            if indexRoot is not None:
                divs = indexRoot.findAll('div',attrs={'class':'strumenti1_inedicola'})
            for div in divs:
                artLink = div.find('a')
                if artLink is None:
                    continue  # empty div
                title = artLink.string
                url = MANIFESTO_BASEURL + artLink['href']

                description = ''
                descNode = div.find('div',attrs={'class':'text_12'})
                if descNode is not None:
                    description = descNode.string

                author = ''
                authNode = div.find('div',attrs={'class':'firma'})
                if authNode is not None:
                    author = authNode.string

                articleText = ''
                article = {
                    'title':title,
                    'url':url,
                    'date': strftime('%d %B %Y'),
                    'description': description,
                    'content': articleText,
                    'author': author
                }
                articles.append(article)
            result.append((feedName,articles))
        return result

    def extract_readable_article(self, html, url):
        """Build a minimal HTML document from the article page ``html``.

        Title, subtitle, author, summary and body are pulled from the
        page and substituted into a fixed template.  Parts that are not
        found are rendered as empty strings.  ``url`` is not used.
        """
        bs = BeautifulSoup(html)
        col1 = bs.find('div',attrs={'class':'column1'})

        content = col1.find('div',attrs={'class':'bodytext'})
        title = bs.find(id='titolo_articolo').string
        author = col1.find('span',attrs={'class':'firma'})
        subtitle = ''
        subNode = col1.findPrevious('div',attrs={'class':'occhiello_rosso'})
        if subNode is not None:
            subtitle = subNode
        summary = ''
        sommNode = bs.find('div',attrs={'class':'sommario'})
        if sommNode is not None:
            summary = sommNode

        template = "<html><head><title>%(title)s</title></head><body><h1>%(title)s</h1><h2>%(subtitle)s</h2><h3>%(author)s</h3><div style='font-size: x-large;'>%(summary)s</div><div>%(content)s</div></body></html>"
        del bs

        values = dict(title=title,subtitle=subtitle,author=author,summary=summary,content=content)
        # '%s' would format a missing node as the text "None".
        for key, value in list(values.items()):
            if value is None:
                values[key] = ''
        return template % values
|
||||
|
||||
|
44
recipes/in4_pl.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
class in4(BasicNewsRecipe):
|
||||
title = u'IN4.pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'fenuks'
|
||||
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
|
||||
category = 'IT'
|
||||
language = 'pl'
|
||||
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
preprocess_regexps = [(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '') ]
|
||||
keep_only_tags=[dict(name='div', attrs={'class':'left_alone'})]
|
||||
remove_tags_after=dict(name='img', attrs={'title':'komentarze'})
|
||||
remove_tags=[dict(name='img', attrs={'title':'komentarze'})]
|
||||
feeds = [(u'Wiadomo\u015bci', u'http://www.in4.pl/rss.php'), (u'Recenzje', u'http://www.in4.pl/rss_recenzje.php'), (u'Mini recenzje', u'http://www.in4.pl/rss_mini.php')]
|
||||
|
||||
def append_page(self, soup, appendtag):
|
||||
a=soup.findAll('a')
|
||||
nexturl=None
|
||||
for i in a:
|
||||
if i.string and 'następna str' in i.string:
|
||||
nexturl='http://www.in4.pl/' + i['href']
|
||||
i.extract()
|
||||
while nexturl:
|
||||
soup2 = self.index_to_soup(nexturl)
|
||||
pagetext = soup2.find(id='news')
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
nexturl=None
|
||||
tag=soup2.findAll('a')
|
||||
for z in tag:
|
||||
if z.string and u'następna str' in z.string:
|
||||
nexturl='http://www.in4.pl/' + z['href']
|
||||
break
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup, soup.body)
|
||||
return soup
|
||||
|
||||
|
18
recipes/informacje_usa.recipe
Normal file
@ -0,0 +1,18 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
class Informacje_USA(BasicNewsRecipe):
|
||||
title = u'Informacje USA'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'fenuks'
|
||||
description = u'portal wiadomości amerykańskich'
|
||||
category = 'news'
|
||||
language = 'pl'
|
||||
masthead_url= 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
|
||||
cover_url='http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
|
||||
no_stylesheets = True
|
||||
preprocess_regexps = [(re.compile(ur'<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><a href=".*?Zobacz także:.*?</a></p>', re.DOTALL), lambda match: ''), (re.compile(ur'<p><p>Zobacz też:.*?</a></p>', re.DOTALL), lambda match: '')]
|
||||
keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})]
|
||||
remove_tags_after= dict(attrs={'class':'tags'})
|
||||
remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})]
|
||||
feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')]
|
@ -1,8 +1,9 @@
|
||||
#v2 2011-07-25
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
|
||||
title = u'Instapaper'
|
||||
__author__ = 'Darko Miletic'
|
||||
__author__ = 'Darko Miletic, Stanislav Khromov'
|
||||
publisher = 'Instapaper.com'
|
||||
category = 'info, custom, Instapaper'
|
||||
oldest_article = 365
|
||||
@ -15,6 +16,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
|
||||
,dict(name='div', attrs={'id':'text_controls'})
|
||||
,dict(name='div', attrs={'id':'editing_controls'})
|
||||
,dict(name='div', attrs={'class':'bar bottom'})
|
||||
,dict(name='div', attrs={'id':'controlbar_container'})
|
||||
,dict(name='div', attrs={'id':'footer'})
|
||||
]
|
||||
use_embedded_content = False
|
||||
needs_subscription = True
|
||||
|
43
recipes/ivanamilakovic.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
ivanamilakovic.blogspot.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IvanaMilakovic(BasicNewsRecipe):
    """Recipe for the blog ivanamilakovic.blogspot.com (embedded feed content)."""

    # Recipe metadata.
    title = u'Ivana Milaković'
    __author__ = 'Darko Miletic'
    description = u'Hronika mačijeg škrabala - priče, inspiracija, knjige, pisanje, prevodi...'
    publication_type = 'blog'
    language = 'sr'
    encoding = 'utf-8'

    # Download settings.
    oldest_article = 80
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True

    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
    """

    conversion_options = {
        'comment': description,
        'tags': 'knjige, blog, srbija, sf',
        'publisher': 'Ivana Milakovic',
        'language': language,
    }

    # Replace U+0110 with U+00D0 in the downloaded text.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [(u'Posts', u'http://ivanamilakovic.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        """Drop every inline 'style' attribute, then return ``adeify_images(soup)``."""
        styled_nodes = soup.findAll(style=True)
        for node in styled_nodes:
            del node['style']
        return self.adeify_images(soup)
|
99
recipes/japaa.recipe
Normal file
@ -0,0 +1,99 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1330393641(BasicNewsRecipe):
    """Recipe for JAAPA (jaapa.com), built from its per-section RSS feeds."""

    title = u'JAAPA'
    __author__ = 'adoucette'
    language = 'en'
    oldest_article = 30
    max_articles_per_feed = 100
    auto_cleanup = True

    # One (section title, feed URL) pair per line.
    feeds = [
        (u'CME Articles', u'http://feeds.feedburner.com/jaapacmearticles'),
        (u'A Day in the Life', u'http://www.jaapa.com/pages/rss.aspx?sectionid=490'),
        (u'Ask A Librarian', u'http://www.jaapa.com/pages/rss.aspx?sectionid=847'),
        (u'Case of the Month', u'http://feeds.feedburner.com/jaapacaseofthemonth'),
        (u'Clinical Watch', u'http://feeds.feedburner.com/jaapaclinicalwatch'),
        (u'Commentary', u'http://feeds.feedburner.com/jaapacommentary'),
        (u'Critically Appraised Topic', u'http://www.jaapa.com/pages/rss.aspx?sectionid=699'),
        (u'Dermatology Digest', u'http://feeds.feedburner.com/jaapadermatologydigest'),
        (u'Diagnostic Imaging Review', u'http://feeds.feedburner.com/jaapadiagnosticimagingreview'),
        (u'Editorial', u'http://www.jaapa.com/pages/rss.aspx?sectionid=759'),
        (u'From the Academy', u'http://feeds.feedburner.com/jaapafromtheacademy'),
        (u'Genomics in PA Practice', u'http://www.jaapa.com/pages/rss.aspx?sectionid=760'),
        (u'Humane Medicine', u'http://www.jaapa.com/pages/rss.aspx?sectionid=758'),
        (u'Inside the AAPA Policy Manual', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1546'),
        (u'Interpreting ECGs', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1624'),
        (u'Letters', u'http://www.jaapa.com/pages/rss.aspx?sectionid=808'),
        (u'PA Quandaries', u'http://www.jaapa.com/pages/rss.aspx?sectionid=496'),
        (u'Pharmacology Consult', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1614'),
        (u'POEMs', u'http://feeds.feedburner.com/jaapapoems'),
        (u'Quick Recertification', u'http://feeds.feedburner.com/jaapaquickrecertificationseries'),
        (u'Sounding Board', u'http://www.jaapa.com/pages/rss.aspx?sectionid=698'),
        (u'The Surgical Patient', u'http://www.jaapa.com/pages/rss.aspx?sectionid=499'),
        (u'Topics in Infectious Diseases', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2495'),
        (u"What's New", u'http://feeds.feedburner.com/jaapawhatsnew'),
        (u'When the Patient Asks', u'http://www.jaapa.com/pages/rss.aspx?sectionid=501'),
        (u"Women's Health", u'http://www.jaapa.com/pages/rss.aspx?sectionid=2176'),
        (u'AAPA Special Article', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1453'),
        (u'Case Reports', u'http://feeds.feedburner.com/jaapacasereports'),
        (u'Review Articles', u'http://feeds.feedburner.com/jaapareviewarticles'),
        (u'Surgical Reviews', u'http://www.jaapa.com/pages/rss.aspx?sectionid=505'),
        (u'Brief Report', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2353'),
        (u'Research Corner', u'http://www.jaapa.com/pages/rss.aspx?sectionid=498'),
        (u'Research Reports', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1024'),
        (u'The Art of Medicine', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1289'),
        (u'Clinical Practice Guidelines', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2102'),
        (u'Complementary and Alternative Medicine', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2123'),
        (u'Drug Information', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2089'),
        (u'Evidence-Based Medicine', u'http://www.jaapa.com/pages/rss.aspx?sectionid=1288'),
        (u'Patient Information', u'http://www.jaapa.com/pages/rss.aspx?sectionid=2122'),
    ]

    def get_cover_url(self):
        """Return the 'src' of the first home-page <img> matching the cover pattern.

        Returns None when no such image is found.
        """
        soup = self.index_to_soup('http://www.jaapa.com')
        cover_pattern = re.compile(r'\w*?cover\w{1,22}\.jpg')
        cover_item = soup.find('img', src=cover_pattern)
        if not cover_item:
            return None
        return cover_item['src']

    def print_version(self, url):
        """Return ``url`` with '/article/' replaced by '/printarticle/'."""
        printable = url.replace('/article/', '/printarticle/')
        return printable
|
42
recipes/klubknjige.recipe
Normal file
@ -0,0 +1,42 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
klub-knjige.blogspot.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class KlubKnjige(BasicNewsRecipe):
    """Recipe for the blog klub-knjige.blogspot.com (embedded feed content)."""

    # Recipe metadata.
    title = 'Klub knjige'
    __author__ = 'Darko Miletic'
    description = 'literarni blog'
    publication_type = 'blog'
    language = 'sr'
    encoding = 'utf-8'

    # Download settings.
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True

    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,Tahoma,Helvetica,FreeSans,sans1,sans-serif}
        img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px }
    """

    conversion_options = {
        'comment': description,
        'tags': 'knjige, blog, srbija, sf',
        'publisher': 'Klub Knjige',
        'language': language,
    }

    # Replace U+0110 with U+00D0 in the downloaded text.
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    feeds = [(u'Posts', u'http://klub-knjige.blogspot.com/feeds/posts/default')]

    def preprocess_html(self, soup):
        """Drop every inline 'style' attribute, then return ``adeify_images(soup)``."""
        styled_nodes = soup.findAll(style=True)
        for node in styled_nodes:
            del node['style']
        return self.adeify_images(soup)
|
14
recipes/kresy_pl.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Kresy(BasicNewsRecipe):
    """Recipe for kresy.pl, built from the site's single RSS feed."""

    # Recipe metadata.
    title = u'Kresy'
    __author__ = 'fenuks'
    description = u'portal społeczności kresowej'
    language = 'pl'

    # The same logo image is used for the masthead and for the cover.
    masthead_url = 'http://www.kresy.pl/public/img/logo.png'
    cover_url = 'http://www.kresy.pl/public/img/logo.png'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    # Page clean-up rules.
    keep_only_tags = [{'id': 'artykul'}]
    remove_tags = [
        {'attrs': {'class': ['twitter-share-button', 'likefbborder', 'tagi']}},
    ]

    feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')]
|
@ -13,9 +13,10 @@ class Kurier(BasicNewsRecipe):
|
||||
publisher = 'KURIER'
|
||||
category = 'news, politics, Austria'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 200
|
||||
max_articles_per_feed = 100
|
||||
timeout = 30
|
||||
encoding = None
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'de_AT'
|
||||
remove_empty_feeds = True
|
||||
@ -29,9 +30,11 @@ class Kurier(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
remove_tags = [dict(attrs={'class':['functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})]
|
||||
remove_tags = [ dict(attrs={'id':['artikel_expand_symbol2','imgzoom_close2']}),
|
||||
dict(attrs={'class':['linkextern','functionsleiste','functions','social_positionierung','contenttabs','drucken','versenden','leserbrief','kommentieren','addthis_button']})
|
||||
]
|
||||
keep_only_tags = [dict(attrs={'id':'content'})]
|
||||
remove_tags_after = dict(attrs={'id':'author'})
|
||||
remove_tags_after = [dict(attrs={'id':'author'})]
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [
|
||||
@ -41,7 +44,7 @@ class Kurier(BasicNewsRecipe):
|
||||
,(u'Kultur' , u'http://kurier.at/rss/kultur_kultur_rss.xml' )
|
||||
,(u'Freizeit' , u'http://kurier.at/rss/freizeit_freizeit_rss.xml' )
|
||||
,(u'Wetter' , u'http://kurier.at/rss/oewetter_rss.xml' )
|
||||
,(u'Verkehr' , u'http://kurier.at/rss/verkehr_rss.xml' )
|
||||
,(u'Sport' , u'http://kurier.at/newsfeed/detail/sport_rss.xml' )
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>, Rogelio Domínguez <rogelio.dominguez@gmail.com>'
|
||||
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>, Rogelio Domínguez <rogelio.dominguez@gmail.com>'
|
||||
'''
|
||||
www.jornada.unam.mx
|
||||
'''
|
||||
@ -86,6 +86,6 @@ class LaJornada_mx(BasicNewsRecipe):
|
||||
return soup
|
||||
|
||||
def get_article_url(self, article):
    """Return the entry's 'guid' with any trailing '&partner=...' removed.

    The text from the last '&partner=' onwards is cut off.  A guid that
    does not contain '&partner=' is returned unchanged, and a missing or
    empty guid is returned as-is (None or '').
    """
    rurl = article.get('guid', None)
    if not rurl:
        return rurl
    head, sep, _tail = rurl.rpartition('&partner=')
    # rpartition yields an empty head when the separator is absent, so
    # only use the head when the separator was actually found.
    return head if sep else rurl
|
||||
|
||||
|
17
recipes/la_pausa_caffe.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
__version__ = 'v1.0'
|
||||
__date__ = '13, February 2011'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1329125921(BasicNewsRecipe):
    """Recipe for the blog 'La pausa caffe', built from its FeedBurner feed."""

    # Recipe metadata.
    title = u'La pausa caff\xe8'
    __author__ = 'faber1971'
    description = 'An Italian satirical blog'
    language = 'it'

    # Download settings.
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    no_stylesheets = True

    feeds = [
        (u'La pausa caff\xe8', u'http://feeds.feedburner.com/LapausaCaffe'),
    ]
|
||||
|
15
recipes/la_voce.recipe
Normal file
@ -0,0 +1,15 @@
|
||||
__license__ = 'GPL v3'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1324114228(BasicNewsRecipe):
    """Recipe for lavoce.info, built from the site's RSS feed."""

    # Recipe metadata.
    title = u'La Voce'
    __author__ = 'faber1971'
    description = 'Italian website on Economy - v1.01 (17, December 2011)'
    language = 'it'
    masthead_url = 'http://www.lavoce.info/binary/la_voce/testata/lavoce.1184661635.gif'

    # Download settings.
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        (u'La Voce', u'http://www.lavoce.info/feed_rss.php?id_feed=1'),
    ]
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011'
|
||||
__copyright__ = '2012'
|
||||
'''
|
||||
lemonde.fr
|
||||
'''
|
||||
@ -25,7 +25,7 @@ class LeMonde(BasicNewsRecipe):
|
||||
.ariane{font-size:xx-small;}
|
||||
.source{font-size:xx-small;}
|
||||
#.href{font-size:xx-small;}
|
||||
.LM_caption{color:#666666; font-size:x-small;}
|
||||
#.figcaption style{color:#666666; font-size:x-small;}
|
||||
#.main-article-info{font-family:Arial,Helvetica,sans-serif;}
|
||||
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
|
||||
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
|
||||
@ -48,7 +48,7 @@ class LeMonde(BasicNewsRecipe):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
return self.adeify_images(soup)
|
||||
return soup
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
|
||||
@ -61,6 +61,11 @@ class LeMonde(BasicNewsRecipe):
|
||||
(re.compile(r'”'), lambda match: ' »)'),
|
||||
(re.compile(r'>\''), lambda match: '>‘'),
|
||||
(re.compile(r' \''), lambda match: ' ‘'),
|
||||
(re.compile(r' "'), lambda match: ' « '),
|
||||
(re.compile(r'>"'), lambda match: '>« '),
|
||||
(re.compile(r'"<'), lambda match: ' »<'),
|
||||
(re.compile(r'" '), lambda match: ' » '),
|
||||
(re.compile(r'",'), lambda match: ' »,'),
|
||||
(re.compile(r'\''), lambda match: '’'),
|
||||
(re.compile(r'"<em>'), lambda match: '<em>« '),
|
||||
(re.compile(r'"<em>"</em><em>'), lambda match: '<em>« '),
|
||||
@ -86,9 +91,10 @@ class LeMonde(BasicNewsRecipe):
|
||||
(re.compile(r'\s»'), lambda match: ' »'),
|
||||
(re.compile(r'«\s'), lambda match: '« '),
|
||||
(re.compile(r' %'), lambda match: ' %'),
|
||||
(re.compile(r'\.jpg » border='), lambda match: '.jpg'),
|
||||
(re.compile(r'\.png » border='), lambda match: '.png'),
|
||||
(re.compile(r'\.jpg » width='), lambda match: '.jpg'),
|
||||
(re.compile(r'\.png » width='), lambda match: '.png'),
|
||||
(re.compile(r' – '), lambda match: ' – '),
|
||||
(re.compile(r'figcaption style="display:none"'), lambda match: 'figcaption'),
|
||||
(re.compile(r' – '), lambda match: ' – '),
|
||||
(re.compile(r' - '), lambda match: ' – '),
|
||||
(re.compile(r' -,'), lambda match: ' –,'),
|
||||
@ -97,10 +103,15 @@ class LeMonde(BasicNewsRecipe):
|
||||
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['contenu']})
|
||||
dict(name='div', attrs={'class':['global']})
|
||||
]
|
||||
remove_tags = [dict(name='div', attrs={'class':['LM_atome']})]
|
||||
remove_tags_after = [dict(id='appel_temoignage')]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['bloc_base meme_sujet']}),
|
||||
dict(name='p', attrs={'class':['lire']})
|
||||
]
|
||||
|
||||
remove_tags_after = [dict(id='fb-like')]
|
||||
|
||||
def get_article_url(self, article):
|
||||
url = article.get('guid', None)
|
||||
@ -136,4 +147,3 @@ class LeMonde(BasicNewsRecipe):
|
||||
cover_url = link_item.img['src']
|
||||
|
||||
return cover_url
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Lorenzo Vigentini'
|
||||
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
|
||||
__author__ = 'Lorenzo Vigentini and Olivier Daigle'
|
||||
__copyright__ = '2012, Lorenzo Vigentini <l.vigentini at gmail.com>, Olivier Daigle <odaigle _at nuvucameras __dot__ com>'
|
||||
__version__ = 'v1.01'
|
||||
__date__ = '14, January 2010'
|
||||
__date__ = '12, February 2012'
|
||||
__description__ = 'Canadian Paper '
|
||||
|
||||
'''
|
||||
@ -18,7 +18,7 @@ class ledevoir(BasicNewsRecipe):
|
||||
description = 'Canadian Paper. A subscription is optional, with it you get more content'
|
||||
|
||||
cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
|
||||
title = u'Le Devoir'
|
||||
title = u'Le Devoir '
|
||||
publisher = 'leDevoir.com'
|
||||
category = 'News, finance, economy, politics'
|
||||
|
||||
@ -26,11 +26,15 @@ class ledevoir(BasicNewsRecipe):
|
||||
encoding = 'utf-8'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
|
||||
max_articles_per_feed = 50
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 200
|
||||
use_embedded_content = False
|
||||
recursion = 10
|
||||
needs_subscription = 'optional'
|
||||
|
||||
filterDuplicates = False
|
||||
url_list = []
|
||||
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
@ -38,7 +42,7 @@ class ledevoir(BasicNewsRecipe):
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':'article'}),
|
||||
dict(name='ul', attrs={'id':'ariane'})
|
||||
dict(name='div', attrs={'id':'colonne_principale'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
@ -51,7 +55,7 @@ class ledevoir(BasicNewsRecipe):
|
||||
|
||||
feeds = [
|
||||
(u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
|
||||
(u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
|
||||
(u'Édition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
|
||||
(u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
|
||||
(u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
|
||||
(u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
|
||||
@ -61,7 +65,7 @@ class ledevoir(BasicNewsRecipe):
|
||||
(u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
|
||||
(u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
|
||||
(u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
|
||||
(u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50')
|
||||
(u'Art de vivre', 'http://www.ledevoir.com/rss/section/art-de-vivre.xml?id=50')
|
||||
]
|
||||
|
||||
extra_css = '''
|
||||
@ -85,8 +89,16 @@ class ledevoir(BasicNewsRecipe):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://www.ledevoir.com')
|
||||
br.select_form(nr=1)
|
||||
br['login[courriel]'] = self.username
|
||||
br['login[password]'] = self.password
|
||||
br.select_form(nr=0)
|
||||
br['login_popup[courriel]'] = self.username
|
||||
br['login_popup[password]'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def print_version(self, url):
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
return
|
||||
self.url_list.append(url)
|
||||
return url
|
||||
|
||||
|
103
recipes/liberation_sub.recipe
Normal file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Rémi Vanicat <vanicat at debian.org>'
|
||||
'''
|
||||
liberation.fr
|
||||
'''
|
||||
# The cleanup rules are taken from the Liberation recipe, by Darko Miletic
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Liberation(BasicNewsRecipe):
    """Recipe for the subscriber edition of liberation.fr.

    The article list is scraped from the subscriber index page (see
    parse_index) rather than read from RSS feeds. When credentials are
    supplied, get_browser logs in through the site's login form first.
    """

    title = u'Libération: Édition abonnés'
    __author__ = 'Rémi Vanicat'
    description = u'Actualités'
    category = 'Actualités, France, Monde'
    language = 'fr'
    needs_subscription = True

    use_embedded_content = False
    no_stylesheets = True
    remove_empty_feeds = True

    # The selector list of the third rule used to end with a dangling comma
    # ("h2.rubrique, {"), which makes the whole rule invalid CSS.
    # NOTE(review): "entry-body" in the last rule has no leading dot, so it is
    # a type selector; it was presumably meant to be ".entry-body" - confirm
    # against the site markup before changing it.
    extra_css = '''
                h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
                p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
                h4, h5, h2.rubrique {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
                .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
                .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
                '''

    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
        ,dict(name='div', attrs={'class':'text-article m-bot-s1'})
        ,dict(name='div', attrs={'class':'entry'})
        ,dict(name='div', attrs={'class':'col_contenu'})
    ]

    remove_tags_after = [
        dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(id='_twitter_facebook')
    ]

    remove_tags = [
        dict(name='iframe')
        ,dict(name='a', attrs={'class':'lnk-comments'})
        ,dict(name='div', attrs={'class':'toolbox'})
        ,dict(name='ul', attrs={'class':'share-box'})
        ,dict(name='ul', attrs={'class':'tool-box'})
        ,dict(name='ul', attrs={'class':'rub'})
        ,dict(name='p',attrs={'class':['chapo']})
        ,dict(name='p',attrs={'class':['tag']})
        ,dict(name='div',attrs={'class':['blokLies']})
        ,dict(name='div',attrs={'class':['alire']})
        ,dict(id='_twitter_facebook')
    ]

    # Page listing the articles of the subscriber edition.
    index = 'http://www.liberation.fr/abonnes/'

    def get_browser(self):
        """Return a browser, logged in when a username and password are set."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.liberation.fr/jogger/login/')
            br.select_form(nr=0)
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of sections and articles from the index page.

        Walks the direct children of the 'block-content' div. A child whose
        class is 'headrest headrest-basic-rounded' starts a new section named
        after its <h5>; every other child is read as one article of the
        current section.

        Returns a list of (section title, list of article dicts) tuples.
        """
        soup = self.index_to_soup(self.index)

        content = soup.find('div', {'class': 'block-content'})

        articles = []
        # Articles met before the first section header are gathered in this
        # list, which is never attached to a section (unchanged behaviour).
        cat_articles = []

        for tag in content.findAll(recursive=False):
            # .get() so that a child without a class attribute is handled as
            # an article instead of raising KeyError.
            if tag.get('class') == 'headrest headrest-basic-rounded':
                cat_articles = []
                articles.append((tag.find('h5').contents[0], cat_articles))
            else:
                title = tag.find('h3').contents[0]
                url = tag.find('a')['href']
                description = tag.find('p', {'class': 'subtitle'}).contents[0]
                cat_articles.append({
                    'title': title,
                    'url': url,
                    # Was misspelled 'descripion', so the summary never
                    # reached the 'description' slot of the article dict.
                    'description': description,
                    'content': '',
                })
        return articles
|
||||
|
||||
|
||||
|
||||
# Local Variables:
|
||||
# mode: python
|
||||
# End:
|
@ -1,41 +1,26 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.livemint.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LiveMint(BasicNewsRecipe):
|
||||
title = u'Livemint'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'The Wall Street Journal'
|
||||
publisher = 'The Wall Street Journal'
|
||||
category = 'news, games, adventure, technology'
|
||||
language = 'en'
|
||||
title = u'Live Mint'
|
||||
language = 'en_IN'
|
||||
__author__ = 'Krittika Goyal'
|
||||
#encoding = 'cp1252'
|
||||
oldest_article = 1 #days
|
||||
max_articles_per_feed = 25
|
||||
use_embedded_content = True
|
||||
|
||||
oldest_article = 15
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
use_embedded_content = False
|
||||
extra_css = ' #dvArtheadline{font-size: x-large} #dvArtAbstract{font-size: large} '
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'innercontent'})]
|
||||
|
||||
remove_tags = [dict(name=['object','link','embed','form','iframe'])]
|
||||
feeds = [
|
||||
('Latest News',
|
||||
'http://www.livemint.com/StoryRss.aspx?LN=Latestnews'),
|
||||
('Gallery',
|
||||
'http://www.livemint.com/GalleryRssfeed.aspx'),
|
||||
('Top Stories',
|
||||
'http://www.livemint.com/StoryRss.aspx?ts=Topstories'),
|
||||
('Banking',
|
||||
'http://www.livemint.com/StoryRss.aspx?Id=104'),
|
||||
]
|
||||
|
||||
feeds = [(u'Articles', u'http://www.livemint.com/SectionRssfeed.aspx?Mid=1')]
|
||||
|
||||
def print_version(self, url):
|
||||
link = url
|
||||
msoup = self.index_to_soup(link)
|
||||
mlink = msoup.find(attrs={'id':'ctl00_bodyplaceholdercontent_cntlArtTool_printUrl'})
|
||||
if mlink:
|
||||
link = 'http://www.livemint.com/Articles/' + mlink['href'].rpartition('/Articles/')[2]
|
||||
return link
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
return self.adeify_images(soup)
|
||||
|
25
recipes/living_stones.recipe
Normal file
@ -0,0 +1,25 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LivingStonesPastorsBlog(BasicNewsRecipe):
    """Recipe for the pastors' blog of Living Stones Church of Reno, NV."""

    # Identification
    title = u'Living Stones Pastors Blog'
    __author__ = 'Peter Grungi'
    author = 'Living Stones Church of Reno, NV'
    publisher = 'Living Stones Church of Reno, NV'
    language = 'en'

    # Artwork
    cover_url = 'http://blogs.livingstonesreno.com/wp-content/uploads/2011/08/blogBGRD_norepeat.jpg'
    masthead_url = 'http://www.livingstonesreno.com/podcast/LSpodcastnew.jpg'

    # Feed selection
    feeds = [
        (u'LS Blog', u'http://blogs.livingstonesreno.com/feed?utm_source=calibre&utm_medium=rss'),
    ]
    max_articles_per_feed = 10
    oldest_article = 90

    auto_cleanup = True

    def full_version(self, url):
        """Return url with its query string (from the first '?' on) removed."""
        # NOTE(review): nothing visible here calls full_version; if this is
        # meant as the per-article URL hook, confirm the expected method name.
        import re
        return re.sub(r'\?.*', '', url)
|
19
recipes/marketing_magazine.recipe
Normal file
@ -0,0 +1,19 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'faber1971'
|
||||
description = 'Collection of Italian marketing websites - v1.04 (17, March 2012)'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
    """Recipe aggregating the feeds of several Italian marketing websites."""

    title = u'Marketing Magazine'
    masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'

    # Feed selection
    max_articles_per_feed = 100
    oldest_article = 7

    # Clean-up of the downloaded pages
    auto_cleanup = True
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [dict(name='ul', attrs={'id':'ads0'})]
    conversion_options = {'linearize_tables': True}

    # One (title, url) pair per source site.
    feeds = [
        (u'My Marketing', u'http://feed43.com/0537744466058428.xml'),
        (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'),
        (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'),
        (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'),
        (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'),
        (u'Brandforum news', u'http://www.brandforum.it/rss/news'),
        (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'),
        (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'),
        (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'),
        (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'),
        (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/'),
    ]
|
@ -1,69 +1,45 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, Louis Gesbert <meta at antislash dot info>'
|
||||
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
|
||||
'''
|
||||
Mediapart
|
||||
'''
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Mediapart(BasicNewsRecipe):
|
||||
title = 'Mediapart'
|
||||
__author__ = 'Mathieu Godlewski'
|
||||
description = 'Global news in french from online newspapers'
|
||||
__author__ = 'Mathieu Godlewski, Louis Gesbert'
|
||||
description = 'Global news in french from news site Mediapart'
|
||||
oldest_article = 7
|
||||
language = 'fr'
|
||||
needs_subscription = True
|
||||
|
||||
max_articles_per_feed = 50
|
||||
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
|
||||
cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'
|
||||
cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
|
||||
|
||||
feeds = [
|
||||
('Les articles', 'http://www.mediapart.fr/articles/feed'),
|
||||
]
|
||||
|
||||
# -- print-version has poor quality on this website, better do the conversion ourselves
|
||||
#
|
||||
# preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
|
||||
# [
|
||||
# (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
|
||||
# (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
|
||||
# lambda match : '<i>'+match.group(1)+'</i>'),
|
||||
# (r'\'', lambda match: '’'),
|
||||
# ]
|
||||
# ]
|
||||
#
|
||||
# remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
|
||||
# dict(name='div', attrs={'class':'print-links'}),
|
||||
# dict(name='img', attrs={'src':'entete_article.png'}),
|
||||
# dict(name='br') ]
|
||||
#
|
||||
# def print_version(self, url):
|
||||
# raw = self.browser.open(url).read()
|
||||
# soup = BeautifulSoup(raw.decode('utf8', 'replace'))
|
||||
# div = soup.find('div', {'id':re.compile('node-\d+')})
|
||||
# if div is None:
|
||||
# return None
|
||||
# article_id = string.replace(div['id'], 'node-', '')
|
||||
# if article_id is None:
|
||||
# return None
|
||||
# return 'http://www.mediapart.fr/print/'+article_id
|
||||
# -- print-version
|
||||
|
||||
# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
|
||||
conversion_options = { 'smarten_punctuation' : True }
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='h1', attrs={'class':'title'}),
|
||||
dict(name='div', attrs={'class':'page_papier_detail'}),
|
||||
]
|
||||
remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
for title in soup.findAll('div', {'class':'titre'}):
|
||||
tag = Tag(soup, 'h3')
|
||||
title.replaceWith(tag)
|
||||
tag.insert(0,title)
|
||||
return soup
|
||||
def print_version(self, url):
|
||||
raw = self.browser.open(url).read()
|
||||
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
|
||||
link = soup.find('a', {'title':'Imprimer'})
|
||||
if link is None:
|
||||
return None
|
||||
return link['href']
|
||||
|
||||
# -- Handle login
|
||||
|
||||
@ -77,3 +53,10 @@ class Mediapart(BasicNewsRecipe):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for title in soup.findAll('p', {'class':'titre_page'}):
|
||||
title.name = 'h3'
|
||||
for legend in soup.findAll('span', {'class':'legend'}):
|
||||
legend.insert(0, Tag(soup, 'br', []))
|
||||
legend.name = 'small'
|
||||
return soup
|
||||
|
@ -38,18 +38,23 @@ except:
|
||||
removed keep_only tags
|
||||
Version 1.8 26-11-2011
|
||||
added remove tag: article-slideshow
|
||||
Version 1.9 31-1-2012
|
||||
removed some left debug settings
|
||||
extended timeout from 2 to 10
|
||||
changed oldest article from 10 to 1.2
|
||||
changed max articles from 15 to 25
|
||||
'''
|
||||
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
|
||||
title = u'Metro Nieuws NL'
|
||||
oldest_article = 10
|
||||
max_articles_per_feed = 15
|
||||
oldest_article = 1.2
|
||||
max_articles_per_feed = 25
|
||||
__author__ = u'DrMerry'
|
||||
description = u'Metro Nederland'
|
||||
language = u'nl'
|
||||
simultaneous_downloads = 5
|
||||
simultaneous_downloads = 3
|
||||
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
|
||||
timeout = 2
|
||||
timeout = 10
|
||||
center_navbar = True
|
||||
timefmt = ' [%A, %d %b %Y]'
|
||||
no_stylesheets = True
|
||||
|
224
recipes/microwave_and_rf.recipe
Normal file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python
|
||||
##
|
||||
## Title: Microwave and RF
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
|
||||
# Feb 2012: Initial release
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
'''
|
||||
mwrf.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.utils.magick import Image
|
||||
|
||||
class Microwaves_and_RF(BasicNewsRecipe):
    """Recipe for the latest issue of Microwaves and RF magazine (mwrf.com).

    parse_index() scrapes the table of contents of the latest issue;
    print_version() rewrites each article URL to the site's Print.cfm view;
    preprocess_html() turns linked figures into inline images and strips the
    remaining links.
    """

    Convert_Grayscale = False  # Convert images to gray scale or not

    # Names of the sections to leave out of the magazine
    exclude_sections = []

    # When non-empty, only the sections named here are kept
    include_sections = []

    title = u'Microwaves and RF'
    __author__ = u'kiavash'
    description = u'Microwaves and RF Monthly Magazine'
    publisher = 'Penton Media, Inc.'
    publication_type = 'magazine'
    site = 'http://mwrf.com'

    language = 'en'
    asciiize = True
    timeout = 120
    simultaneous_downloads = 1  # very peaky site!

    # Main article is inside this tag
    keep_only_tags = [dict(name='table', attrs={'id':'prtContent'})]

    no_stylesheets = True
    remove_javascript = True

    # Flattens all the tables to make it compatible with Nook
    conversion_options = {'linearize_tables' : True}

    remove_tags = [
        dict(name='span', attrs={'class':'body12'}),
    ]

    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]

    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                 font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { font-size: 175%; font-weight: bold; } \
                 h2 { font-size: 150%; font-weight: bold; } \
                 h3 { font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { font-size: 100%; font-weight: bold; }'

    # Remove the line breaks and float left/right and picture width/height.
    # The float pattern used to be r'float:.*?': a trailing lazy quantifier
    # always matches the empty string, so only the text "float:" was removed
    # and the value ("left", "right") stayed behind in the style attribute.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'float:\s*\w+\s*;?'), lambda m: ''),
                          (re.compile(r'width:.*?px'), lambda m: ''),
                          (re.compile(r'height:.*?px'), lambda m: '')
                         ]

    def print_version(self, url):
        """Return the print-view URL for an article URL.

        Drops '.html' and replaces the '/ArticleID/<something>/' path
        segment with '/Print.cfm?ArticleID='.
        """
        # The dot is escaped so that only a literal '.html' is removed, not
        # any character followed by 'html'.
        url = re.sub(r'\.html', '', url)
        url = re.sub('/ArticleID/.*?/', '/Print.cfm?ArticleID=', url)
        return url

    # Need to change the user agent to avoid potential download errors
    def get_browser(self, *args, **kwargs):
        """Return a calibre browser that identifies itself as Firefox 10."""
        from calibre import browser
        kwargs['user_agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0) Gecko/20100101 Firefox/10.0'
        return browser(*args, **kwargs)

    def parse_index(self):
        """Scrape the latest issue and return its articles as a single feed.

        Side effects: sets self.timefmt to the issue date and, when a cover
        image is found, self.cover_url.

        Returns a list holding at most one (self.title, articles) tuple.
        """
        # Fetches the main page of Microwaves and RF
        soup = self.index_to_soup(self.site)

        # First page has the ad, Let's find the redirect address.
        url = soup.find('span', attrs={'class':'commonCopy'}).find('a').get('href')
        if url.startswith('/'):
            url = self.site + url

        soup = self.index_to_soup(url)

        # Searches the site for Issue ID link then returns the href address
        # pointing to the latest issue
        latest_issue = soup.find('a', attrs={'href':lambda x: x and 'IssueID' in x}).get('href')

        # Fetches the index page of the latest issue
        soup = self.index_to_soup(latest_issue)

        # Finds the main section of the page containing cover, issue date and
        # TOC
        ts = soup.find('div', attrs={'id':'columnContainer'})

        # Finds the issue date: the last two words of the section heading
        ds = ' '.join(self.tag_to_string(ts.find('span', attrs={'class':'CurrentIssueSectionHead'})).strip().split()[-2:]).capitalize()
        self.log('Found Current Issue:', ds)
        self.timefmt = ' [%s]'%ds

        # Finds the cover image
        cover = ts.find('img', src = lambda x: x and 'Cover' in x)
        if cover is not None:
            cover_src = cover['src']
            # Prefix the site only for site-relative paths, as is done for
            # every other URL in this class; an already absolute src used to
            # be corrupted by the unconditional prefix.
            if cover_src.startswith('/'):
                cover_src = self.site + cover_src
            self.cover_url = cover_src
            self.log('Found Cover image:', self.cover_url)

        feeds = []
        article_info = []

        # Finds all the articles (titles and links)
        articles = ts.findAll('a', attrs={'class':'commonArticleTitle'})

        # Finds all the descriptions
        descriptions = ts.findAll('span', attrs={'class':'commonCopy'})

        # Find all the sections
        sections = ts.findAll('span', attrs={'class':'kicker'})

        title_number = 0

        # Goes through all the articles one by one and sorts them out
        for section in sections:
            # NOTE(review): the counter is advanced before it is used, so
            # articles[0] / descriptions[0] are never read and the n-th
            # section is paired with entry n. Presumably the first entry on
            # the page has no section label - confirm against the site.
            title_number = title_number + 1

            # Removes the unwanted sections
            if self.tag_to_string(section) in self.exclude_sections:
                continue

            # Only includes the wanted sections
            if self.include_sections:
                if self.tag_to_string(section) not in self.include_sections:
                    continue

            title = self.tag_to_string(articles[title_number])
            url = articles[title_number].get('href')
            if url.startswith('/'):
                url = self.site + url

            self.log('\tFound article:', title, 'at', url)
            desc = self.tag_to_string(descriptions[title_number])
            self.log('\t\t', desc)

            article_info.append({'title':title, 'url':url, 'description':desc,
                                 'date':self.timefmt})

        if article_info:
            feeds.append((self.title, article_info))

        #self.log(feeds)
        return feeds

    def postprocess_html(self, soup, first):
        """Convert every downloaded image to gray scale when Convert_Grayscale is set."""
        if self.Convert_Grayscale:
            #process all the images
            for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                # NOTE(review): this compares the Image object itself with 0;
                # confirm that this check can ever be true.
                if img < 0:
                    raise RuntimeError('Out of memory')
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup

    def preprocess_html(self, soup):
        """Inline linked figures, promote headings and strip remaining links."""

        # Includes all the figures inside the final ebook
        # Finds all the jpg links
        for figure in soup.findAll('a', attrs = {'href' : lambda x: x and 'jpg' in x}):

            # makes sure that the link points to the absolute web address
            if figure['href'].startswith('/'):
                figure['href'] = self.site + figure['href']

            figure.name = 'img' # converts the links to img
            figure['src'] = figure['href'] # with the same address as href
            figure['style'] = 'display:block' # adds /n before and after the image
            del figure['href']
            del figure['target']

        # Makes the title standing out
        for title in soup.findAll('a', attrs = {'class': 'commonSectionTitle'}):
            title.name = 'h1'
            del title['href']
            del title['target']

        # Makes the section name more visible
        for section_name in soup.findAll('a', attrs = {'class': 'kicker2'}):
            section_name.name = 'h5'
            del section_name['href']
            del section_name['target']

        # Removes all unrelated links
        for link in soup.findAll('a', attrs = {'href': True}):
            link.name = 'font'
            del link['href']
            del link['target']

        return soup
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
@ -6,15 +7,72 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following three lines for the Montreal Gazette
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
## title = u'Victoria Times Colonist'
|
||||
## url_prefix = 'http://www.timescolonist.com'
|
||||
## description = u'News from Victoria, BC'
|
||||
## fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
## title = u'Edmonton Journal'
|
||||
## url_prefix = 'http://www.edmontonjournal.com'
|
||||
## description = u'News from Edmonton, AB'
|
||||
## fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
## title = u'Calgary Herald'
|
||||
## url_prefix = 'http://www.calgaryherald.com'
|
||||
## description = u'News from Calgary, AB'
|
||||
## fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
## title = u'Ottawa Citizen'
|
||||
## url_prefix = 'http://www.ottawacitizen.com'
|
||||
## description = u'News from Ottawa, ON'
|
||||
## fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
title = u'Montreal Gazette'
|
||||
url_prefix = 'http://www.montrealgazette.com'
|
||||
description = u'News from Montreal, QC'
|
||||
fp_tag = 'CAN_MG'
|
||||
|
||||
|
||||
language = 'en_CA'
|
||||
@ -38,14 +96,81 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
|
||||
def preprocess_html(self,soup):
|
||||
#delete iempty id attributes--they screw up the TOC for unknow reasons
|
||||
divtags = soup.findAll('div',attrs={'id':''})
|
||||
if divtags:
|
||||
for div in divtags:
|
||||
del(div['id'])
|
||||
|
||||
def get_cover_url(self):
|
||||
from datetime import timedelta, date
|
||||
if self.fp_tag=='':
|
||||
return None
|
||||
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
daysback=1
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
while daysback<7:
|
||||
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
try:
|
||||
br.open(cover)
|
||||
except:
|
||||
daysback = daysback+1
|
||||
continue
|
||||
break
|
||||
if daysback==7:
|
||||
self.log("\nCover unavailable")
|
||||
cover = None
|
||||
return cover
|
||||
|
||||
def fixChars(self, string):
    '''
    Replace the \\x91-\\x97 quote and dash characters in ``string`` with
    their typographic equivalents and return the result.
    '''
    substitutions = (
        ("\x91", "‘"),  # lsquo
        ("\x92", "’"),  # rsquo
        ("\x93", "“"),  # ldquo
        ("\x94", "”"),  # rdquo
        ("\x96", "–"),  # ndash
        ("\x97", "—"),  # mdash
        ("’", "’"),
    )
    fixed = string
    for pattern, replacement in substitutions:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
|
||||
|
||||
def massageNCXText(self, description):
    '''
    Prepare a description for the Kindle TOC, which won't render certain
    characters. A falsy description is returned unchanged.
    '''
    if not description:
        return description
    parsed = BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    massaged = unicode(parsed)
    # Replace '&' with '&'
    massaged = re.sub("&", "&", massaged)
    return self.fixChars(massaged)
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    '''
    Attach a TOC thumbnail and a fallback summary to ``article``.

    For the first page, the first ``<img>`` inside ``<body>`` (if any) is
    registered as the TOC thumbnail. When the article has no text summary,
    the page's ``og:description`` meta content is used for both ``summary``
    and ``text_summary``.
    '''
    if first:
        body = soup.find('body')
        # A document without <body> simply has no thumbnail.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
    xtitle = article.text_summary.strip()
    if not xtitle:
        desc = soup.find('meta', attrs={'property': 'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
|
||||
|
||||
def strip_anchors(self, soup):
    '''
    Replace each ``<a>`` that does not wrap an image with its rendered
    contents (decoded as cp1252) and return the soup.
    '''
    for container in soup.findAll(True):
        for anchor in container.findAll('a'):
            if anchor.img is not None:
                # Anchors around images are left in place.
                continue
            rendered = anchor.renderContents()
            anchor.replaceWith(rendered.decode('cp1252', 'replace'))
    return soup
|
||||
|
||||
def preprocess_html(self, soup):
    '''Return the result of running ``strip_anchors`` on the soup.'''
    stripped = self.strip_anchors(soup)
    return stripped
|
||||
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
|
@ -1,58 +1,53 @@
|
||||
#!/usr/bin/env python
|
||||
##
|
||||
## Title: Microwave Journal RSS recipe
|
||||
## Title: Microwave Journal
|
||||
## Contact: Kiavash (use Mobile Read)
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
## Copyright: Kiavash
|
||||
##
|
||||
## Written: Jan 2012
|
||||
## Last Edited: Jan 2012
|
||||
## Last Edited: Feb 2012
|
||||
##
|
||||
|
||||
# Feb 2012: New Recipe compatible with the MWJournal 2.0 website
|
||||
|
||||
# Module metadata. The author name is spelled as in the header and copyright.
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'Kiavash'
__author__ = 'Kiavash'
|
||||
|
||||
'''
|
||||
Microwave Journal Monthly Magazine
|
||||
You need to sign up (free) and get username/password.
|
||||
microwavejournal.com
|
||||
'''
|
||||
|
||||
import re # Import the regular expressions module.
|
||||
from calibre.ptempfile import TemporaryFile # we need this for saving to a temp file
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.utils.magick import Image
|
||||
|
||||
class MWJournal(BasicNewsRecipe):
|
||||
# Title to use for the ebook.
|
||||
title = u'Microwave Journal'
|
||||
__author__ = 'Kiavash'
|
||||
language = 'en'
|
||||
|
||||
#A brief description for the ebook.
|
||||
description = u'Microwave Journal web site ebook created using rss feeds.'
|
||||
|
||||
# Set publisher and publication type.
|
||||
publisher = 'Horizon House'
|
||||
title = u'Microwave Journal'
|
||||
description = u'Microwave Journal Monthly Magazine'
|
||||
publisher = 'Horizon House'
|
||||
publication_type = 'magazine'
|
||||
INDEX = 'http://www.microwavejournal.com/publications/'
|
||||
|
||||
oldest_article = 31 # monthly published magazine. Some months are 31 days!
|
||||
max_articles_per_feed = 100
|
||||
remove_empty_feeds = True
|
||||
auto_cleanup = True
|
||||
|
||||
# Disable stylesheets and javascript from site.
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
|
||||
asciiize = True # Converts all none ascii characters to their ascii equivalents
|
||||
|
||||
needs_subscription = True # oh yeah... we need to login btw.
|
||||
|
||||
# Timeout for fetching files from the server in seconds. The default of 120 seconds, seems somewhat excessive.
|
||||
language = 'en'
|
||||
timeout = 30
|
||||
|
||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||
Convert_Grayscale = False # Convert images to gray scale or not
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'record'})]
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
remove_tags = [
|
||||
dict(name='font', attrs={'class':'footer'}), # remove fonts
|
||||
]
|
||||
|
||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||
|
||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||
.introduction, .first { font-weight: bold; } \
|
||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||
@ -72,72 +67,75 @@ class MWJournal(BasicNewsRecipe):
|
||||
h3 { font-size: 125%; font-weight: bold; } \
|
||||
h4, h5, h6 { font-size: 100%; font-weight: bold; }'
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':'boxadzonearea350'}), # Removes banner ads
|
||||
dict(name='font', attrs={'class':'footer'}), # remove fonts if you do like your fonts more! Comment out to use website's fonts
|
||||
dict(name='div', attrs={'class':'newsarticlead'})
|
||||
]
|
||||
|
||||
# Remove various tag attributes to improve the look of the ebook pages.
|
||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||
|
||||
# Remove the line breaks as well as href links. Books don't have links generally speaking
|
||||
# Remove the line breaks, href links and float left/right and picture width/height.
|
||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'<a.*?>'), lambda h1: ''),
|
||||
(re.compile(r'</a>'), lambda h2: '')
|
||||
(re.compile(r'</a>'), lambda h2: ''),
|
||||
(re.compile(r'float:.*?'), lambda h3: ''),
|
||||
(re.compile(r'width:.*?px'), lambda h4: ''),
|
||||
(re.compile(r'height:.*?px'), lambda h5: '')
|
||||
]
|
||||
|
||||
# Select the feeds that you are interested.
|
||||
feeds = [
|
||||
(u'Current Issue', u'http://www.mwjournal.com/rss/Rss.asp?type=99'),
|
||||
(u'Industry News', u'http://www.mwjournal.com/rss/Rss.asp?type=1'),
|
||||
(u'Resources', u'http://www.mwjournal.com/rss/Rss.asp?type=3'),
|
||||
(u'Buyer\'s Guide', u'http://www.mwjournal.com/rss/Rss.asp?type=5'),
|
||||
(u'Events', u'http://www.mwjournal.com/rss/Rss.asp?type=2'),
|
||||
(u'All Updates', u'http://www.mwjournal.com/rss/Rss.asp?type=0'),
|
||||
]
|
||||
|
||||
# No magazine is complete without cover. Let's get it then!
|
||||
# The function is adapted from the Economist recipe
|
||||
def get_cover_url(self):
    '''
    Return the absolute URL of the current issue's cover image, or None
    when no matching image is found on the journal page.
    '''
    # Cover image is located on this page
    cover_page_location = 'http://www.mwjournal.com/Journal/'
    soup = self.index_to_soup(cover_page_location)

    def is_cover(src):
        # The cover's file name changes every month, so only the fixed part
        # of the path is matched. Per the original author there are three
        # cover files and the "3_" one is the highest resolution.
        return src and '/IssueImg/3_MWJ_CurrIss_CoverImg' in src

    cover_item = soup.find('img', attrs={'src': is_cover})
    if not cover_item:
        return None
    return 'http://www.mwjournal.com' + cover_item['src'].strip()
|
||||
|
||||
def print_version(self, url):
    '''
    Map an article URL to its printer-friendly URL.

    Old-style ``.asp`` links are resolved through the browser and rewritten
    to the matching print page; any other URL has ``/articles/`` replaced
    with ``/articles/print/``.
    '''
    rewrites = (
        ('/Journal/article.asp?HH_ID=', '/Journal/Print.asp?Id='),
        ('/News/article.asp?HH_ID=', '/Journal/Print.asp?Id='),
        ('/Resources/TechLib.asp?HH_ID=', '/Resources/PrintRessource.asp?Id='),
    )
    for marker, print_path in rewrites:
        if marker in url:
            resolved = self.browser.open_novisit(url).geturl()
            return resolved.replace(marker, print_path)
    return url.replace('/articles/', '/articles/print/')
|
||||
|
||||
def get_browser(self):
    '''
    Return a browser, logged in to Microwave Journal when a username and
    password are set.

    Submitting the omeda.com login form yields a second form that must be
    submitted as well. That intermediate page is saved to a temporary HTML
    file and reopened from disk so its form can be selected and submitted
    (technique from the ESPN recipe written by Kovid Goyal; many thanks to
    Barty, who helped with the second page login).

    Raises Exception when the final page does not contain 'Welcome '.
    '''
    br = BasicNewsRecipe.get_browser()
    if self.username is None or self.password is None:
        return br
    # main login page
    login_url = ('http://www.omeda.com/cgi-win/mwjreg.cgi?m=login')
    br.open(login_url)
    br.select_form('login')
    br['EMAIL_ADDRESS'] = self.username
    br['PASSWORD'] = self.password
    # The response to the first form is the second login form.
    second_form = br.submit().read()
    with TemporaryFile(suffix='.htm') as fname:
        with open(fname, 'wb') as f:
            f.write(second_form)
        br.open_local_file(fname)
    br.select_form(nr=0)
    landing_page = br.submit().read()
    # Did it login successfully? Is the username/password correct?
    if 'Welcome ' not in landing_page:
        raise Exception('Failed to login, are you sure your username and password are correct?')
    return br
|
||||
def parse_index(self):
    '''
    Build the feed list for the current issue from the publications page.

    Sets ``self.timefmt`` from the issue heading and, when a cover image is
    present, ``self.cover_url``. Returns a list of
    ``(section_title, articles)`` tuples, where each article is a dict with
    ``title``, ``url``, ``description`` and ``date`` keys. Sections that end
    up with no articles are omitted.
    '''
    site = 'http://www.microwavejournal.com'
    soup = self.index_to_soup(self.INDEX)
    ts = soup.find('div', attrs={'class':'box1 article publications-show'})
    ds = self.tag_to_string(ts.find('h2'))
    self.log('Found Current Issue:', ds)
    self.timefmt = ' [%s]'%ds

    cover = ts.find('img', src=True)
    if cover is not None:
        self.cover_url = site + cover['src']
        self.log('Found Cover image:', self.cover_url)

    feeds = []
    seen_titles = set()  # used to drop duplicate articles across sections
    sections = soup.find('div', attrs={'class':'box2 publication'})
    for section in sections.findAll('div', attrs={'class':'records'}):
        section_title = self.tag_to_string(section.find('h3'))
        self.log('Found section:', section_title)
        articles = []
        for post in section.findAll('div', attrs={'class':'record'}):
            title = self.tag_to_string(post.find('h2'))
            if 'The MWJ Puzzler' in title:  # Let's get rid of the useless Puzzler!
                continue
            if title in seen_titles:
                continue
            a = post.find('a', href=True)
            if a is None:
                # Without a link there is nothing to download; skip the
                # record instead of failing the whole index.
                continue
            seen_titles.add(title)
            url = a['href']
            if url.startswith('/'):
                url = site + url
            self.log('\tFound article:', title, 'at', url)
            # The abstract block is optional, and so is its paragraph.
            abstract = post.find('div', attrs={'class':'abstract'})
            p = abstract.find('p') if abstract is not None else None
            desc = None
            if p is not None:
                desc = self.tag_to_string(p)
                self.log('\t\t', desc)
            articles.append({'title':title, 'url':url, 'description':desc,
                             'date':self.timefmt})
        if articles:
            feeds.append((section_title, articles))
    return feeds
|
||||
|
||||
def postprocess_html(self, soup, first):
    '''
    Convert every image that has a ``src`` to gray scale when
    ``Convert_Grayscale`` is set; otherwise return the soup untouched.
    '''
    if not self.Convert_Grayscale:
        return soup

    def is_image(tag):
        return tag.name.lower()=='img' and tag.has_key('src')

    # process all the images
    for image_tag in soup.findAll(is_image):
        iurl = image_tag['src']
        img = Image()
        img.open(iurl)
        if img < 0:
            raise RuntimeError('Out of memory')
        img.type = "GrayscaleType"
        img.save(iurl)
    return soup
|
||||
|
@ -1,16 +1,18 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
import re
|
||||
class naczytniki(BasicNewsRecipe):
    # Recipe for naczytniki.pl, fed from the site's RSS feed.
    title = u'naczytniki.pl'
    __author__ = 'fenuks'
    masthead_url= 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
    cover_url = 'http://naczytniki.pl/wp-content/uploads/2010/08/logo_nc28.png'
    language = 'pl'
    description ='everything about e-readers'
    # Assigned once; an earlier 'readers' value was immediately overwritten.
    category='e-readers'
    no_stylesheets=True
    use_embedded_content=False
    oldest_article = 7
    max_articles_per_feed = 100
    remove_tags_after= dict(name='div', attrs={'class':'sociable'})
    # Cut everything from the "Zobacz także" block to the end of the body.
    # The pattern has no backslashes, so a plain u'' literal is equivalent
    # to ur'' and also parses on Python 3.
    preprocess_regexps = [(re.compile(u'<p><br><b>Zobacz także:</b></p>.*?</body>', re.DOTALL), lambda match: '</body>') ]
    keep_only_tags=[dict(name='div', attrs={'class':'post'})]
    remove_tags=[dict(name='span', attrs={'class':'comments'}), dict(name='div', attrs={'class':'sociable'})]
    feeds = [(u'Wpisy', u'http://naczytniki.pl/?feed=rss2')]
|
26
recipes/novinite_bg.recipe
Normal file
@ -0,0 +1,26 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1329123365(BasicNewsRecipe):
    # Recipe for Novinite.bg, one RSS feed per news category.
    title = u'Novinite.bg'
    __author__ = 'M3 Web'
    description = 'Real time provider of the latest news from Bulgaria and the world'
    category = 'Business, Politics, Society, Sports, Crime, Lifestyle, World, Health'
    oldest_article = 7
    max_articles_per_feed = 6
    language = 'bg'
    encoding = 'windows-1251'
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    # Both rules live in ONE list: two separate ``remove_tags = ...``
    # assignments would keep only the last one.
    remove_tags = [
        dict(name='div', attrs={'id':'text_options'}),
        dict(name='div', attrs={'id':'social_shares_top'}),
    ]
    remove_tags_after = dict(id='textsize')
    feeds = [(u'Business', u'http://novinite.bg/rss.php?category_id=1'),
             (u'Politics', u'http://novinite.bg/rss.php?category_id=2'),
             (u'Society', u'http://novinite.bg/rss.php?category_id=3'),
             (u'Sport', u'http://novinite.bg/rss.php?category_id=4'),
             (u'Crime', u'http://novinite.bg/rss.php?category_id=5'),
             (u'Lifestyle', u'http://novinite.bg/rss.php?category_id=6'),
             (u'Health', u'http://novinite.bg/rss.php?category_id=7'),
             (u'Other', u'http://novinite.bg/rss.php?category_id=10'),
             (u'World', u'http://novinite.bg/rss.php?category_id=9')]
|
@ -1,21 +1,33 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class Nowa_Fantastyka(BasicNewsRecipe):
|
||||
title = u'Nowa Fantastyka'
|
||||
oldest_article = 7
|
||||
__author__ = 'fenuks'
|
||||
__modified_by__ = 'zaslav'
|
||||
language = 'pl'
|
||||
encoding='latin2'
|
||||
description ='site for fantasy readers'
|
||||
category='fantasy'
|
||||
masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg'
|
||||
#extra_css='.tytul {font-size: 20px;}' #not working
|
||||
max_articles_per_feed = 100
|
||||
INDEX='http://www.fantastyka.pl/'
|
||||
no_stylesheets=True
|
||||
needs_subscription = 'optional'
|
||||
remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
|
||||
remove_tags_before=dict(attrs={'class':'naglowek2'})
|
||||
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
|
||||
remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
|
||||
remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]
|
||||
remove_tags_after=dict(name='form', attrs={'name':'form1'})
|
||||
remove_tags=[dict(attrs={'class':['avatar2', 'belka-margin', 'naglowek2']}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'}), dict(name='form')]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'\<table .*?\>'), lambda match: ''),
|
||||
(re.compile(r'\<td.*?\>'), lambda match: ''),
|
||||
(re.compile(r'\<center\>'), lambda match: '')]
|
||||
|
||||
|
||||
|
||||
|
||||
def find_articles(self, url):
|
||||
articles = []
|
||||
@ -41,10 +53,10 @@ class Nowa_Fantastyka(BasicNewsRecipe):
|
||||
|
||||
return feeds
|
||||
|
||||
|
||||
def get_cover_url(self):
    '''
    Look up the current cover on e-kiosk.pl, store it in ``self.cover_url``
    and return it.

    Only the e-kiosk page is fetched; a lookup on fantastyka.pl whose result
    was unconditionally overwritten has been dropped.
    '''
    soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka')
    self.cover_url='http://www.e-kiosk.pl' + soup.find(name='a', attrs={'class':'img'})['href']
    return self.cover_url
|
||||
|
||||
def get_browser(self):
|
||||
@ -56,3 +68,18 @@ class Nowa_Fantastyka(BasicNewsRecipe):
|
||||
br['pass'] = self.password
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def preprocess_html(self, soup):
    '''
    Strip presentational attributes, turn table rows into divs and give
    the title element an explicit bold 20px style. Returns the soup.
    '''
    for attribute in ('style', 'font', 'align'):
        for item in soup.findAll(**{attribute: True}):
            del item[attribute]
    for item in soup.findAll(name='tr'):
        item.name='div'
    title=soup.find(attrs={'class':'tytul'})
    if title is not None:
        # Set after the loop above, so this style is not stripped again.
        title['style']='font-size: 20px; font-weight: bold;'
    # The whole soup is no longer dumped to the log at warning level;
    # that was debugging output.
    return soup
|
||||
|
76
recipes/nrc_handelsblad.recipe
Normal file
@ -0,0 +1,76 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012'
|
||||
'''
|
||||
nrc.nl
|
||||
'''
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class NRC(BasicNewsRecipe):
    # Recipe for NRC Handelsblad (nrc.nl), one RSS feed per section.

    # --- identification -------------------------------------------------
    title = 'NRC Handelsblad'
    __author__ = 'veezh'
    description = 'Nieuws (no subscription needed)'
    publisher = 'nrc.nl'
    category = 'news, Netherlands, world'
    language = 'nl'
    #publication_type = 'newsportal'

    # --- fetching -------------------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    #delay = 1
    use_embedded_content = False
    encoding = 'utf-8'
    timefmt = ''
    remove_empty_feeds = True
    filterDuplicates = True

    # --- cleanup and styling --------------------------------------------
    no_stylesheets = True
    keep_only_tags = [dict(name='div', attrs={'class':'article'})]
    remove_tags_after = [dict(id='broodtekst')]
    extra_css = '''
h1{font-size:130%;}
#h2{font-size:100%;font-weight:normal;}
#.href{font-size:xx-small;}
.bijschrift{color:#666666; font-size:x-small;}
#.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''
    #preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    # keep_only_tags = [
    #     dict(name='div', attrs={'class':['label']})
    # ]

    # remove_tags_after = [dict(name='dl', attrs={'class':['tags']})]

    # def get_article_url(self, article):
    #     link = article.get('link')
    #     if 'blog' not in link and ('chat' not in link):
    #         return link

    feeds = [
        # ('Nieuws', 'http://www.nrc.nl/rss.php'),
        ('Binnenland', 'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php'),
        ('Buitenland', 'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php'),
        ('Economie', 'http://www.nrc.nl/nieuws/categorie/economie/rss.php'),
        ('Wetenschap', 'http://www.nrc.nl/nieuws/categorie/wetenschap/rss.php'),
        ('Cultuur', 'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php'),
        ('Boeken', 'http://www.nrc.nl/boeken/rss.php'),
        ('Tech', 'http://www.nrc.nl/tech/rss.php/'),
        ('Klimaat', 'http://www.nrc.nl/klimaat/rss.php/'),
    ]

    def preprocess_html(self, soup):
        '''Replace every link that has a single string child with that string.'''
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup
|
31
recipes/oclab_pl.recipe
Normal file
@ -0,0 +1,31 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class OCLab(BasicNewsRecipe):
    # Recipe for OCLab.pl; multi-page posts are stitched into one article.
    title = u'OCLab.pl'
    __author__ = 'fenuks'
    description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.'
    category = 'IT'
    language = 'pl'
    cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118'

    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    keep_only_tags=[dict(id='main')]
    remove_tags_after= dict(attrs={'class':'single-postmetadata'})
    remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})]
    feeds = [(u'Wpisy', u'http://oclab.pl/feed/')]

    def append_page(self, soup, appendtag):
        '''
        Append the entry of each page listed in the page-jump dropdown
        (skipping its first and last option) to ``appendtag``, then remove
        the bottom post navigation lists.
        '''
        jump_list=soup.find(attrs={'class':'contentjumpddl'})
        if jump_list:
            options=jump_list.findAll('option')
            for option in options[1:-1]:
                page_soup = self.index_to_soup(option['value'])
                entry = page_soup.find(attrs={'class':'single-entry'})
                appendtag.insert(len(appendtag.contents), entry)
            for nav in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}):
                nav.extract()

    def preprocess_html(self, soup):
        '''Pull the remaining pages into the body and return the soup.'''
        self.append_page(soup, soup.body)
        return soup
|
21
recipes/onda_rock.recipe
Normal file
@ -0,0 +1,21 @@
|
||||
__license__ = 'GPL v3'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1328535130(BasicNewsRecipe):
    # Recipe for the Onda Rock webzine, fed from its single RSS feed.
    title = u'Onda Rock'
    __author__ = 'faber1971'
    description = 'Italian rock webzine'
    language = 'it'
    masthead_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/71135_45820579767_4993043_n.jpg'

    oldest_article = 7
    max_articles_per_feed = 100
    feeds = [(u'Onda Rock', u'http://www.ondarock.it/feed.php')]

    # Cleanup is done by the explicit rules below rather than auto_cleanup.
    auto_cleanup = False
    no_stylesheets = True
    _removed_div_ids = ['boxHeader','boxlinks_med','footer','boxinterviste','box_special_med','boxdiscografia_head','path']
    remove_tags = [
        dict(name='div', attrs={'id':_removed_div_ids}),
        dict(name='div', attrs={'align':'left'}),
        dict(name='div', attrs={'style':'text-align: center'}),
    ]
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
|
||||
@ -6,20 +7,72 @@ __license__ = 'GPL v3'
|
||||
www.canada.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
|
||||
class CanWestPaper(BasicNewsRecipe):
|
||||
|
||||
# un-comment the following three lines for the Ottawa Citizen
|
||||
# un-comment the following four lines for the Victoria Times Colonist
|
||||
## title = u'Victoria Times Colonist'
|
||||
## url_prefix = 'http://www.timescolonist.com'
|
||||
## description = u'News from Victoria, BC'
|
||||
## fp_tag = 'CAN_TC'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Province
|
||||
## title = u'Vancouver Province'
|
||||
## url_prefix = 'http://www.theprovince.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VP'
|
||||
|
||||
# un-comment the following four lines for the Vancouver Sun
|
||||
## title = u'Vancouver Sun'
|
||||
## url_prefix = 'http://www.vancouversun.com'
|
||||
## description = u'News from Vancouver, BC'
|
||||
## fp_tag = 'CAN_VS'
|
||||
|
||||
# un-comment the following four lines for the Edmonton Journal
|
||||
## title = u'Edmonton Journal'
|
||||
## url_prefix = 'http://www.edmontonjournal.com'
|
||||
## description = u'News from Edmonton, AB'
|
||||
## fp_tag = 'CAN_EJ'
|
||||
|
||||
# un-comment the following four lines for the Calgary Herald
|
||||
## title = u'Calgary Herald'
|
||||
## url_prefix = 'http://www.calgaryherald.com'
|
||||
## description = u'News from Calgary, AB'
|
||||
## fp_tag = 'CAN_CH'
|
||||
|
||||
# un-comment the following four lines for the Regina Leader-Post
|
||||
## title = u'Regina Leader-Post'
|
||||
## url_prefix = 'http://www.leaderpost.com'
|
||||
## description = u'News from Regina, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Saskatoon Star-Phoenix
|
||||
## title = u'Saskatoon Star-Phoenix'
|
||||
## url_prefix = 'http://www.thestarphoenix.com'
|
||||
## description = u'News from Saskatoon, SK'
|
||||
## fp_tag = ''
|
||||
|
||||
# un-comment the following four lines for the Windsor Star
|
||||
## title = u'Windsor Star'
|
||||
## url_prefix = 'http://www.windsorstar.com'
|
||||
## description = u'News from Windsor, ON'
|
||||
## fp_tag = 'CAN_'
|
||||
|
||||
# un-comment the following four lines for the Ottawa Citizen
|
||||
title = u'Ottawa Citizen'
|
||||
url_prefix = 'http://www.ottawacitizen.com'
|
||||
description = u'News from Ottawa, ON'
|
||||
fp_tag = 'CAN_OC'
|
||||
|
||||
# un-comment the following three lines for the Montreal Gazette
|
||||
#title = u'Montreal Gazette'
|
||||
#url_prefix = 'http://www.montrealgazette.com'
|
||||
#description = u'News from Montreal, QC'
|
||||
# un-comment the following four lines for the Montreal Gazette
|
||||
## title = u'Montreal Gazette'
|
||||
## url_prefix = 'http://www.montrealgazette.com'
|
||||
## description = u'News from Montreal, QC'
|
||||
## fp_tag = 'CAN_MG'
|
||||
|
||||
|
||||
language = 'en_CA'
|
||||
@ -43,14 +96,80 @@ class CanWestPaper(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'rule_grey_solid'}),
|
||||
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
|
||||
|
||||
def preprocess_html(self, soup):
    '''
    Delete empty ``id`` attributes from ``div`` tags and return the soup.

    Empty ids screw up the TOC for unknown reasons. The soup is returned so
    the caller gets the processed document back rather than ``None``.
    '''
    for div in soup.findAll('div', attrs={'id': ''}):
        del div['id']
    return soup
|
||||
def get_cover_url(self):
    '''
    Return the URL of the newest available Newseum front-page image.

    Today's image is tried first, then each of the six previous days.
    Returns None when ``fp_tag`` is empty (no request is made) or when none
    of the seven candidate URLs could be opened.
    '''
    from datetime import timedelta, date
    if self.fp_tag == '':
        return None
    today = date.today()
    for daysback in range(7):
        # The image directory is keyed on the day of the month.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try an older day".
            # Unlike a bare except, this lets KeyboardInterrupt through.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
|
||||
|
||||
def fixChars(self, string):
    '''
    Replace the \\x91-\\x97 quote and dash characters in ``string`` with
    their typographic equivalents and return the result.
    '''
    substitutions = (
        ("\x91", "‘"),  # lsquo
        ("\x92", "’"),  # rsquo
        ("\x93", "“"),  # ldquo
        ("\x94", "”"),  # rdquo
        ("\x96", "–"),  # ndash
        ("\x97", "—"),  # mdash
        ("’", "’"),
    )
    fixed = string
    for pattern, replacement in substitutions:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
|
||||
|
||||
def massageNCXText(self, description):
    '''
    Prepare a description for the Kindle TOC, which won't render certain
    characters. A falsy description is returned unchanged.
    '''
    if not description:
        return description
    parsed = BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    massaged = unicode(parsed)
    # Replace '&' with '&'
    massaged = re.sub("&", "&", massaged)
    return self.fixChars(massaged)
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    '''
    Attach a TOC thumbnail and a fallback summary to ``article``.

    For the first page, the first ``<img>`` inside ``<body>`` (if any) is
    registered as the TOC thumbnail. When the article has no text summary,
    the page's ``og:description`` meta content is used for both ``summary``
    and ``text_summary``.
    '''
    if first:
        body = soup.find('body')
        # A document without <body> simply has no thumbnail.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
    xtitle = article.text_summary.strip()
    if not xtitle:
        desc = soup.find('meta', attrs={'property': 'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
|
||||
|
||||
def strip_anchors(self, soup):
    '''
    Replace each ``<a>`` that does not wrap an image with its rendered
    contents (decoded as cp1252) and return the soup.
    '''
    for container in soup.findAll(True):
        for anchor in container.findAll('a'):
            if anchor.img is not None:
                # Anchors around images are left in place.
                continue
            rendered = anchor.renderContents()
            anchor.replaceWith(rendered.decode('cp1252', 'replace'))
    return soup
|
||||
|
||||
def preprocess_html(self, soup):
    '''Return the result of running ``strip_anchors`` on the soup.'''
    stripped = self.strip_anchors(soup)
    return stripped
|
||||
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
|
||||
|
@ -14,6 +14,7 @@ class OurDailyBread(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
use_embedded_content = False
|
||||
category = 'ODB, Daily Devotional, Bible, Christian Devotional, Devotional, RBC Ministries, Our Daily Bread, Devotionals, Daily Devotionals, Christian Devotionals, Faith, Bible Study, Bible Studies, Scripture, RBC, religion'
|
||||
encoding = 'utf-8'
|
||||
@ -25,12 +26,12 @@ class OurDailyBread(BasicNewsRecipe):
|
||||
,'linearize_tables' : True
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':'module-content'})]
|
||||
remove_tags = [
|
||||
dict(attrs={'id':'article-zoom'})
|
||||
,dict(attrs={'class':'listen-now-box'})
|
||||
]
|
||||
remove_tags_after = dict(attrs={'class':'readable-area'})
|
||||
#keep_only_tags = [dict(attrs={'class':'module-content'})]
|
||||
#remove_tags = [
|
||||
#dict(attrs={'id':'article-zoom'})
|
||||
#,dict(attrs={'class':'listen-now-box'})
|
||||
#]
|
||||
#remove_tags_after = dict(attrs={'class':'readable-area'})
|
||||
|
||||
extra_css = '''
|
||||
.text{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
|
||||
|
24
recipes/overclock_pl.recipe
Normal file
@ -0,0 +1,24 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Overclock_pl(BasicNewsRecipe):
    # Recipe for Overclock.pl (news plus tests and reviews feeds).
    title = u'Overclock.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).'
    category = 'IT'
    language = 'pl'
    masthead_url='http://www.overclock.pl/gfx/logo_m.png'
    cover_url='http://www.overclock.pl/gfx/logo_m.png'
    no_stylesheets = True
    remove_empty_feeds = True
    # Drop the comments teaser and the navigation heading. Neither pattern
    # has a backslash, so plain u'' literals are equivalent to ur'' and
    # also parse on Python 3.
    preprocess_regexps = [
        (re.compile(u'<b>Komentarze do aktualności:.*?</a>', re.DOTALL), lambda match: ''),
        (re.compile(u'<h3>Nawigacja</h3>', re.DOTALL), lambda match: ''),
    ]
    keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')]
    remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
    feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]

    def print_version(self, url):
        '''Return the ``showall`` form of an ``articles/show`` URL; other URLs are returned unchanged.'''
        if 'articles/show' in url:
            return url.replace('show', 'showall')
        return url
|
16
recipes/palmtop_pl.recipe
Normal file
@ -0,0 +1,16 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class palmtop_pl(BasicNewsRecipe):
    """Calibre news recipe for Palmtop.pl, fed from the site's Atom feed."""

    # Identity / catalogue metadata.
    title = u'Palmtop.pl'
    __author__ = 'fenuks'
    description = 'wortal technologii mobilnych'
    category = 'mobile'
    language = 'pl'

    # The same header logo is used for both the cover and the masthead.
    cover_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'
    masthead_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'

    # Download limits and content handling.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True

    # Page-scraping filters kept for reference; currently switched off.
    #remove_tags_before=dict(name='h2')
    #remove_tags_after=dict(attrs={'class':'entry clearfix'})

    feeds = [
        (u'Newsy', u'http://palmtop.pl/feed/atom/'),
    ]
|
25
recipes/pc_arena.recipe
Normal file
@ -0,0 +1,25 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class PC_Arena(BasicNewsRecipe):
    """Calibre news recipe for PCArena (pcarena.pl)."""

    title = u'PCArena'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pcarena.pl/pcarena/img/logo.png'
    cover_url = 'http://pcarena.pl/pcarena/img/logo.png'
    no_stylesheets = True
    remove_empty_feeds = True
    #keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})]
    #remove_tags=[dict(attrs={'class':'pages'})]
    feeds = [
        (u'Aktualności', u'http://pcarena.pl/aktualnosci/feeds.rss'),
        (u'Testy', u'http://pcarena.pl/testy/feeds.rss'),
        (u'Software', u'http://pcarena.pl/oprogramowanie/feeds.rss'),
        (u'Poradniki', u'http://pcarena.pl/poradniki/feeds.rss'),
        (u'Mobile', u'http://pcarena.pl/mobile/feeds.rss'),
    ]

    def print_version(self, url):
        """Return ``url`` with every ``show`` replaced by ``print``."""
        return url.replace('show', 'print')

    def image_url_processor(self, baseurl, url):
        """Return an absolute image URL.

        URLs that already start with an http/https scheme are returned
        untouched; everything else is treated as a path on pcarena.pl and
        prefixed with the site root. The scheme is tested at the start of the
        URL (ignoring leading whitespace and case) rather than anywhere inside
        it, so a relative path that merely contains "http" is still made
        absolute.
        """
        probe = url.lstrip().lower()
        if probe.startswith(('http://', 'https://')):
            return url
        return 'http://pcarena.pl' + url
|
20
recipes/pc_centre_pl.recipe
Normal file
@ -0,0 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class PC_Centre(BasicNewsRecipe):
    """Calibre news recipe for PC Centre (pccentre.pl)."""

    title = u'PC Centre'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pccentre.pl/views/images/logo.gif'
    cover_url = 'http://pccentre.pl/views/images/logo.gif'
    no_stylesheets = True
    remove_empty_feeds = True
    #keep_only_tags= [dict(id='content')]
    #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
    remove_tags = [dict(attrs={'class':'logo_print'})]

    # The per-section feeds take two query parameters, "mode" and "section".
    # They must be joined with a literal "&": an HTML-entity decoding pass
    # turns "&sect" into the section sign and yields unusable URLs.
    feeds = [
        (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'),
        (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'),
        (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'),
        (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'),
        (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'),
        (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'),
        (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'),
        (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'),
        (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9'),
    ]

    def print_version(self, url):
        """Return ``url`` with every ``show`` replaced by ``print``."""
        return url.replace('show', 'print')
|
35
recipes/pc_foster.recipe
Normal file
@ -0,0 +1,35 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class PC_Foster(BasicNewsRecipe):
    """Calibre news recipe for PC Foster (pcfoster.pl) with multi-page stitching."""

    title = u'PC Foster'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pcfoster.pl/public/images/logo.png'
    cover_url = 'http://pcfoster.pl/public/images/logo.png'
    no_stylesheets = True
    remove_empty_feeds = True
    # The pager is kept on purpose: append_page() needs it to find the
    # "next page" link and removes it afterwards.
    keep_only_tags = [
        dict(id=['news_details', 'review_details']),
        dict(attrs={'class':'pager more_top'}),
    ]
    remove_tags = [dict(name='p', attrs={'class':'right'})]
    feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')]

    def append_page(self, soup, appendtag):
        """Append the content of all follow-up pages to ``appendtag``.

        The "next page" control is located by its ``alt`` text; the link
        target is read from the ``href`` of that element's parent. For every
        follow-up page the element with class ``content`` is appended to
        ``appendtag``. Nothing is done when the article has no next-page
        control.

        Missing nodes (pager, link target, page content) are skipped instead
        of raising, and a page URL that was already fetched ends the walk so a
        pager that links back on itself cannot loop forever.
        """
        next_alt = {'alt': u'Następna strona'}
        marker = appendtag.find(attrs=next_alt)
        if not marker:
            return

        pager = appendtag.find(attrs={'class':'pager more_top'})
        if pager is not None:
            pager.extract()

        fetched = set()
        while marker:
            anchor = marker.parent
            href = anchor.get('href') if anchor is not None else None
            if not href:
                break
            page_url = 'http://pcfoster.pl' + href
            if page_url in fetched:
                break
            fetched.add(page_url)

            page = self.index_to_soup(page_url)
            marker = page.find(attrs=next_alt)
            content = page.find(attrs={'class':'content'})
            if content is not None:
                appendtag.insert(len(appendtag.contents), content)

        # NOTE(review): nesting of this cleanup was reconstructed from a
        # layout-stripped source; it runs once after all pages are appended.
        for box in appendtag.findAll(attrs={'class':'review_content double'}):
            box.extract()

    def preprocess_html(self, soup):
        """Stitch follow-up pages into the article body and return ``soup``."""
        self.append_page(soup, soup.body)
        return soup
|
@ -1,18 +1,18 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
pescanik.net
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
|
||||
class Pescanik(BasicNewsRecipe):
|
||||
title = 'Pescanik'
|
||||
title = 'Peščanik'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Pescanik'
|
||||
publisher = 'Pescanik'
|
||||
description = 'Peščanik je udruženje građana osnovano 2006. godine. Glavni proizvod Peščanika je radio emisija koja je emitovana na Radiju B92 od 02.02.2000. do 16.06.2011, a od septembra 2011. se emituje na osam radio stanica u Srbiji, Crnoj Gori i BiH'
|
||||
publisher = 'Peščanik'
|
||||
category = 'news, politics, Serbia'
|
||||
oldest_article = 10
|
||||
max_articles_per_feed = 100
|
||||
@ -21,7 +21,12 @@ class Pescanik(BasicNewsRecipe):
|
||||
encoding = 'utf-8'
|
||||
language = 'sr'
|
||||
publication_type = 'newsportal'
|
||||
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,"Lucida Grande",Tahoma,Verdana,sans1,sans-serif} .contentheading{font-size: x-large; font-weight: bold} .small{font-size: small} .createdate{font-size: x-small; font-weight: bold} '
|
||||
masthead_url = 'http://pescanik.net/wp-content/uploads/2011/10/logo1.png'
|
||||
extra_css = """
|
||||
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
|
||||
body{font-family: Verdana,Arial,Tahoma,sans1,sans-serif}
|
||||
#BlogTitle{font-size: xx-large; font-weight: bold}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -32,29 +37,12 @@ class Pescanik(BasicNewsRecipe):
|
||||
|
||||
|
||||
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
|
||||
|
||||
remove_attributes = ['valign','colspan','width','height','align','alt']
|
||||
|
||||
remove_tags = [dict(name=['object','link','meta','script'])]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(attrs={'class':['contentheading','small','createdate']})
|
||||
,dict(name='td', attrs={'valign':'top','colspan':'2'})
|
||||
]
|
||||
|
||||
feeds = [(u'Pescanik Online', u'http://www.pescanik.net/index.php?option=com_rd_rss&id=12')]
|
||||
remove_tags = [dict(name=['object','link','meta','script','iframe','embed'])]
|
||||
keep_only_tags = [dict(attrs={'id':['BlogTitle','BlogDate','BlogContent']})]
|
||||
feeds = [
|
||||
(u'Autori' , u'http://pescanik.net/category/autori/feed/'),
|
||||
(u'Prevodi', u'http://pescanik.net/category/prevodi/feed/')
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
nurl = url.replace('/index.php','/index2.php')
|
||||
return nurl + '&pop=1&page=0'
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
st = soup.findAll('td')
|
||||
for it in st:
|
||||
it.name='p'
|
||||
for pt in soup.findAll('img'):
|
||||
brtag = Tag(soup,'br')
|
||||
brtag2 = Tag(soup,'br')
|
||||
pt.append(brtag)
|
||||
pt.append(brtag2)
|
||||
return soup
|
||||
return url + 'print/'
|
81
recipes/polska_times.recipe
Normal file
@ -0,0 +1,81 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
class Polska_times(BasicNewsRecipe):
    """Calibre news recipe for Polska The Times (polskatimes.pl).

    Handles feed-portal interstitial pages, multi-page articles and image
    galleries, and picks the cover from prasa24.pl.
    """

    title = u'Polska Times'
    __author__ = 'fenuks'
    description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.'
    category = 'newspaper'
    language = 'pl'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17'
    oldest_article = 7
    max_articles_per_feed = 100
    # Was misspelled "remove_emty_feeds", which only created an unused
    # attribute and left the real option at its default.
    remove_empty_feeds = True
    no_stylesheets = True

    # (pattern, replacement) pairs. The first seven strip "read also" style
    # cross-links; the last two cut everything from the marker to the end of
    # the document and re-close the body. u'' literals replace the original
    # ur'' ones; the single backslash in the "CZYTAJ KONIECZNIE" pattern is
    # written as '\\' so the resulting pattern text is unchanged.
    preprocess_regexps = [
        (re.compile(u'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''),
        (re.compile(u'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''),
        (re.compile(u'<b>\\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'),
        (re.compile(u'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>'),
    ]
    keep_only_tags = [dict(id=['tytul-artykulu', 'kontent'])]
    remove_tags_after = dict(id='material-tagi')
    remove_tags = [
        dict(attrs={'id':'reklama_srodtekst_0'}),
        dict(attrs={'id':'material-tagi'}),
        dict(name='div', attrs={'class':'zakladki'}),
        dict(attrs={'title':u'CZYTAJ TAKŻE'}),
        dict(attrs={'id':'podobne'}),
        dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'}),
    ]
    feeds = [
        (u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'),
        (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'),
        (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'),
        (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'),
        (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'),
        (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'),
        (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml'),
    ]

    def skip_ad_pages(self, soup):
        """Follow the first link of an interstitial "Advertisement" page.

        Returns the raw content fetched from the first anchor's ``href`` when
        the page title matches, otherwise ``None``. Pages without a title or
        without a usable link are left alone instead of raising.
        """
        title = soup.title
        if title is None or 'Advertisement' not in title:
            return None
        anchor = soup.find('a')
        target = anchor.get('href') if anchor is not None else None
        if not target:
            return None
        return self.index_to_soup(target, raw=True)

    def _strip_page_clutter(self, fragment):
        """Remove unwanted elements from a follow-up page fragment in place."""
        # Apply the attribute filters of remove_tags; entries without an
        # 'attrs' filter are skipped (an empty filter would match every tag).
        for spec in self.remove_tags:
            wanted = spec.get('attrs')
            if not wanted:
                continue
            for tag in fragment.findAll(attrs=wanted):
                tag.extract()

        # Drop bold "read also" teasers.
        teasers = (u'CZYTAJ TEŻ', u'Czytaj także', u'Czytaj też', u'Zobacz także')
        for bold in fragment.findAll(name='b'):
            text = bold.string
            if text and any(teaser in text for teaser in teasers):
                bold.extract()

        # Drop <center><h4><a>...</a></h4></center> promo blocks.
        for center in fragment.findAll(name='center'):
            if center.h4 and center.h4.a:
                center.extract()

    def append_page(self, soup, appendtag):
        """Append the text of every follow-up page of an article.

        Follows the ``nastepna_strona`` link chain; from each page the
        ``tresc`` element is cleaned and appended to ``appendtag``. Pages
        lacking that element are skipped. Pagination widgets are removed from
        ``appendtag`` afterwards.
        """
        next_link = soup.find(id='nastepna_strona')
        while next_link:
            page = self.index_to_soup(next_link['href'])
            next_link = page.find(id='nastepna_strona')
            fragment = page.find(id='tresc')
            if fragment is None:
                continue
            self._strip_page_clutter(fragment)
            appendtag.insert(len(appendtag.contents), fragment)

        for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
            paginator.extract()

    def image_article(self, soup, appendtag):
        """Append every page of an image gallery to ``appendtag``.

        Follows the ``nastepna`` link chain. Links already seen end the walk,
        both before fetching and right after fetching (so a gallery that wraps
        around does not re-append a page). Each appended ``galeria-material``
        element is preceded by a ``<br />`` string; pages lacking that element
        are skipped. Gallery navigation and pagination widgets are removed
        afterwards.
        """
        next_link = soup.find('a', attrs={'class':'nastepna'})
        seen = []
        while next_link:
            if next_link in seen:
                break
            seen.append(next_link)
            page = self.index_to_soup('http://www.polskatimes.pl/artykul/' + next_link['href'])
            next_link = page.find('a', attrs={'class':'nastepna'})
            if next_link in seen:
                break
            gallery = page.find(id='galeria-material')
            if gallery is None:
                continue
            appendtag.insert(len(appendtag.contents), '<br />')
            appendtag.insert(len(appendtag.contents), gallery)

        for widget in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}):
            widget.extract()
        for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
            paginator.extract()

    def preprocess_html(self, soup):
        """Stitch galleries or multi-page articles together and return ``soup``."""
        if soup.find('a', attrs={'class':'nastepna'}):
            self.image_article(soup, soup.body)
        elif soup.find(id='nastepna_strona'):
            self.append_page(soup, soup.body)
        return soup

    def get_cover_url(self):
        """Return the cover image URL scraped from prasa24.pl.

        The ``src`` of the image inside the ``pojemnik`` element is stored on
        ``self.cover_url``. When that image cannot be found, the previously
        set ``cover_url`` (or ``None`` if there is none) is returned instead
        of raising.
        """
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
        holder = soup.find(id='pojemnik')
        image = holder.img if holder is not None else None
        source = image.get('src') if image is not None else None
        if source:
            self.cover_url = source
        return getattr(self, 'cover_url', None)
|