Merge from trunk

Changelog.yaml | 171 lines
@@ -4,6 +4,176 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.

+- version: 0.7.29
+  date: 2010-11-19
+
+  new features:
+    - title: "OSX binary build is now based on Qt 4.7. Also, the build is now Intel only and requires at least OS X 10.5.2. If you are on a PowerPC machine or an older OS X version, do not upgrade"
+
+    - title: "Content server: Allow direct navigation to a set of books in the book list."
+      tickets: [7453]
+
+    - title: "OS X: When deleting books, put the files into the recycle bin instead of deleting them permanently"
+
+    - title: "Add a button to easily configure Hotmail as an email relay. Also improve usability of the easy-config buttons"
+
+    - title: "Kobo driver: Support Currently_Reading category"
+
+    - title: "Catalog generation: Thumbnail caching, wishlist, improved description layout."
+      tickets: [7376]
+
+    - title: "Support for the Cybook Orizon"
+
+  bug fixes:
+    - title: "Fix restore to defaults in preferences incorrectly setting PDF unwrap factor to 0.0"
+
+    - title: "PDF Input: Fix unwrapping of accented characters"
+
+    - title: "Do not display dialogs asking for confirmation or showing conversion errors when calibre is minimized to the system tray"
+      tickets: [7549]
+
+    - title: "calibre server: Fix regression that broke digest authentication when the calibre interface language was set to non-English"
+
+    - title: "EPUB Output: Do not raise an error for invalid embedded fonts in the input document."
+      tickets: [7567]
+
+    - title: "RTF Input: Improved conversion of tables, with support for border styles on table cells"
+
+    - title: "E-book viewer: Fix regression that broke hyphenation. Also add more language patterns for hyphenation"
+
+    - title: "SONY driver: Fix cover thumbnails being uploaded to the wrong directory on Windows"
+
+    - title: "Fix UnicodeDecodeError when displaying a failed metadata fetch message"
+      tickets: [7560]
+
+    - title: "Bulk metadata edit: Speed up the remove-all-tags operation"
+
+    - title: "MOBI Output: Specify image sizes in pixels instead of em to accommodate Amazon's @#$%#@! MOBI renderer"
+
+    - title: "Fix bug preventing customization of builtin recipes if they are not ASCII encoded"
+
+    - title: "SONY XML cache: Gracefully handle the case where the XML db contains a reference to a file that does not exist"
+
+  improved recipes:
+    - Al Jazeera
+    - The Moscow Times
+    - Globe and Mail
+    - Washington Post
+
+  new recipes:
+    - title: "Hannoversche Allgemeine Zeitung"
+      author: "Artemis"
+
+    - title: "globes.co.il"
+      author: "marbs"
+
+    - title: "THN and RDS"
+      author: "Nexus"
+
+    - title: "pclab.pl"
+      author: "ravcio"
+
+    - title: "Now Toronto"
+      author: "Starson17"
+
+    - title: "Press releases of the German government and EU Commission"
+      author: "malfi"
+
+
+- version: 0.7.28
+  date: 2010-11-12
+
+  new features:
+    - title: "Update the version of the graphical toolkit (Qt 4.7.1) used in the calibre binary builds on Windows and Linux. This should result in a significant speed up for the calibre ebook viewer"
+
+    - title: "Driver for Nook Color, Eken M001"
+
+    - title: "Add a tweak to turn off double-clicking to open the viewer"
+
+    - title: "Catalog generation: Add an indication when a book has no formats"
+      tickets: [7376]
+
+    - title: "Advanced search dialog: Add a tab to allow searching particular metadata fields easily"
+
+    - title: "Conversion pipeline: When using the Level x Table of Contents expressions, if a tag is empty but has a non-empty title attribute, use that instead of ignoring the tag"
+
+  bug fixes:
+    - title: "Comic metadata reader: Sort filenames alphabetically when choosing an image for the cover"
+      tickets: [7488]
+
+    - title: "Bulk convert dialog: Hide the useless restore defaults button."
+      tickets: [7471]
+
+    - title: "Conversion pipeline: Handle input documents that encode null bytes as HTML entities correctly"
+      tickets: [7355]
+
+    - title: "Fix some SONY readers not being detected on Windows"
+      tickets: [7413]
+
+    - title: "MOBI Input: Fix images missing when converting MOBI news downloads created with Mobipocket reader"
+      tickets: [7455]
+
+    - title: "ODT Input: Handle hyperlinks to headings that have truncated destination specifiers correctly"
+      tickets: [7506]
+
+    - title: "Sony driver: Ignore invalid strings when updating the XML database"
+
+    - title: "Content Server: Add the day to the displayed date in the /mobile book listing"
+
+    - title: "MOBI Input: Do not generate filenames with only extensions if the MOBI file has no internal name"
+      tickets: [7481]
+
+    - title: "MOBI Input: Handle files that have the record size incorrectly set to a long integer"
+      tickets: [7472]
+
+    - title: "Fix not enough vertical space for text in the preferences dialog category listing"
+
+    - title: "Remove 'sort' from the Search and replace destination fields and add it to the source fields. S&R is no longer marked experimental"
+
+    - title: "Edit metadata dialog: Save dialog geometry on reject as well as on accept"
+
+    - title: "E-book viewer: Fix clicking entries in the TOC that point to the currently loaded flow not scrolling the view to the top of the document"
+
+    - title: "Fix bug in the regex used to extract the charset from <meta> tags"
+
+    - title: "MOBI Output: Add support for the <q> tag"
+
+  improved recipes:
+    - Zeit Online
+    - Gamespot Review
+    - Politika
+    - Pagina12
+    - Irish Times
+    - elektrolese
+
+  new recipes:
+    - title: "Handelsblatt and European Voice"
+      author: "malfi"
+
+    - title: "Polityka and Newsweek"
+      author: "Mateusz Kielar"
+
+    - title: "MarcTV"
+      author: "Marc Toensings"
+
+    - title: "Rolling Stone"
+      author: "Darko Miletic"
+
+    - title: "Vedomosti"
+      author: "Nikolai Kotchetkov"
+
+    - title: "Hola.com"
+      author: "bmsleight"
+
+    - title: "Dnevnik, Siol.net, MMC-RTV and Avto-magazon"
+      author: "BlonG"
+
+    - title: "SC Print Magazine"
+      author: "Tony Maro"
+
+    - title: "Diario Sport"
+      author: "Jefferson Frantz"
+
 - version: 0.7.27
   date: 2010-11-05
@@ -44,6 +214,7 @@
       tickets: [7356]

   - title: "News download: Workaround lack of thread safety in python mechanize, causing corrupted network packets (degrading network performance) on Ubuntu Maverick 64bit kernels"
+    tickets: [7321]

   - title: "Convert comments to HTML for book details panel in separate thread to make scrolling through the book list faster when large comments are present"
@@ -12,8 +12,8 @@ p.title {
 p.author {
     margin-top:0em;
     margin-bottom:0em;
-    text-align: left;
-    text-indent: 1em;
+    text-align: center;
+    text-indent: 0em;
     font-size:large;
 }

@@ -27,17 +27,28 @@ p.author_index {
 }

 p.tags {
-    margin-top:0em;
+    margin-top:0.5em;
     margin-bottom:0em;
     text-align: left;
-    text-indent: 1em;
-    font-size:small;
+    text-indent: 0.0in;
 }

-p.description {
-    text-align:left;
-    font-style:normal;
-    margin-top: 0em;
+p.formats {
+    font-size:90%;
+    margin-top:0em;
+    margin-bottom:0.5em;
+    text-align: left;
+    text-indent: 0.0in;
+}
+
+div.description > p:first-child {
+    margin: 0 0 0 0;
+    text-indent: 0em;
+}
+
+div.description {
+    margin: 0 0 0 0;
+    text-indent: 1em;
 }

 p.date_index {

@@ -81,6 +92,14 @@ p.unread_book {
     text-indent:-2em;
 }

+p.wishlist_item {
+    text-align:left;
+    margin-top:0px;
+    margin-bottom:0px;
+    margin-left:2em;
+    text-indent:-2em;
+}
+
 p.date_read {
     text-align:left;
     margin-top:0px;

@@ -104,3 +123,14 @@ hr.annotations_divider {
     margin-top:0em;
     margin-bottom:0em;
 }
+
+td.publisher, td.date {
+    font-weight:bold;
+    text-align:center;
+}
+td.rating {
+    text-align: center;
+}
+td.thumbnail img {
+    -webkit-box-shadow: 4px 4px 12px #999;
+}
@@ -355,6 +355,25 @@ h2.library_name {
     color: red;
 }

+#booklist > #pagelist { display: none; }
+
+#goto_page_dialog ul {
+    list-style-type: none;
+    font-size: medium;
+}
+
+#goto_page_dialog li {
+    margin-bottom: 1.5ex;
+}
+
+#goto_page_dialog a {
+    text-decoration: none;
+    color: blue;
+}
+
+#goto_page_dialog a:hover {
+    color: red;
+}
+
 #booklist .left .ui-button-text {
     font-size: medium;

@@ -96,5 +96,6 @@
         </div>
     </div>
     <div id="book_details_dialog"></div>
+    <div id="goto_page_dialog"></div>
 </body>
 </html>
@@ -202,6 +202,23 @@ function previous_page() {
     else last_page();
 }

+function gp_internal(id) {
+    var gp = $('#goto_page_dialog');
+    gp.dialog('close');
+    var elem = $("#booklist #" + id);
+    load_page(elem);
+}
+
+function goto_page() {
+    var gp = $('#goto_page_dialog');
+    var pl = $('#booklist > #pagelist');
+    gp.html(pl.html());
+    gp.dialog('option', 'title', pl.attr('title'));
+    gp.dialog('option', 'height', $(window).height() - 100);
+    gp.dialog('open');
+}
+
 function load_page(elem) {
     if (elem.is(":visible")) return;
     var ld = elem.find('.load_data');

@@ -251,6 +268,12 @@ function booklist(hide_sort) {
         modal: true,
         show: 'slide'
     });
+    $("#goto_page_dialog").dialog({
+        autoOpen: false,
+        modal: true,
+        show: 'slide'
+    });

     first_page();
 }
@@ -38,6 +38,7 @@ Monocle.Browser.on = {
   iPad: navigator.userAgent.indexOf("iPad") != -1,
   BlackBerry: navigator.userAgent.indexOf("BlackBerry") != -1,
   Android: navigator.userAgent.indexOf('Android') != -1,
+  MacOSX: navigator.userAgent.indexOf('Mac OS X') != -1,
   Kindle3: navigator.userAgent.match(/Kindle\/3/)
 }

@@ -162,12 +163,23 @@ Monocle.Browser.has.transform3d = Monocle.Browser.CSSProps.isSupported([
   'OPerspective',
   'msPerspective'
 ]) && Monocle.Browser.CSSProps.supportsMediaQueryProperty('transform-3d');

+Monocle.Browser.has.embedded = (top != self);
+
 Monocle.Browser.has.iframeTouchBug = Monocle.Browser.iOSVersionBelow("4.2");

 Monocle.Browser.has.selectThruBug = Monocle.Browser.iOSVersionBelow("4.2");

 Monocle.Browser.has.mustScrollSheaf = Monocle.Browser.is.MobileSafari;
 Monocle.Browser.has.iframeDoubleWidthBug = Monocle.Browser.has.mustScrollSheaf;

 Monocle.Browser.has.floatColumnBug = Monocle.Browser.is.WebKit;

+Monocle.Browser.has.relativeIframeWidthBug = Monocle.Browser.on.Android;
+
+Monocle.Browser.has.jumpFlickerBug =
+  Monocle.Browser.on.MacOSX && Monocle.Browser.is.WebKit;
+
 if (typeof window.console == "undefined") {
   window.console = {

@@ -1091,11 +1103,29 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
       cmpt.dom.setStyles(Monocle.Styles.component);
       Monocle.Styles.applyRules(cmpt.contentDocument.body, Monocle.Styles.body);
     }
+    lockFrameWidths();
     dom.find('overlay').dom.setStyles(Monocle.Styles.overlay);
     dispatchEvent('monocle:styles');
   }

+  function lockingFrameWidths() {
+    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
+    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
+      cmpt.style.display = "none";
+    }
+  }
+
+  function lockFrameWidths() {
+    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
+    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
+      cmpt.style.width = cmpt.parentNode.offsetWidth+"px";
+      cmpt.style.display = "block";
+    }
+  }
+
   function setBook(bk, place, callback) {
     p.book = bk;
     var pageCount = 0;

@@ -1121,12 +1151,14 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
     if (!p.initialized) {
       console.warn('Attempt to resize book before initialization.');
     }
+    lockingFrameWidths();
     if (!dispatchEvent("monocle:resizing", {}, true)) {
       return;
     }
     clearTimeout(p.resizeTimer);
     p.resizeTimer = setTimeout(
       function () {
+        lockFrameWidths();
         p.flipper.moveTo({ page: pageNumber() });
         dispatchEvent("monocle:resize");
       },

@@ -1765,12 +1797,7 @@ Monocle.Book = function (dataSource) {
   function componentIdMatching(str) {
-    for (var i = 0; i < p.componentIds.length; ++i) {
-      if (str.indexOf(p.componentIds[i]) > -1) {
-        return p.componentIds[i];
-      }
-    }
-    return null;
+    return p.componentIds.indexOf(str) >= 0 ? str : null;
   }

@@ -2018,6 +2045,12 @@ Monocle.Component = function (book, id, index, chapters, source) {
   function loadFrameFromURL(url, frame, callback) {
+    if (!url.match(/^\//)) {
+      var link = document.createElement('a');
+      link.setAttribute('href', url);
+      url = link.href;
+      delete(link);
+    }
     frame.onload = function () {
       frame.onload = null;
       Monocle.defer(callback);

@@ -2460,7 +2493,7 @@ Monocle.Flippers.Legacy = function (reader) {
   function moveTo(locus, callback) {
     var fn = frameToLocus;
     if (typeof callback == "function") {
-      fn = function () { frameToLocus(); callback(); }
+      fn = function (locus) { frameToLocus(locus); callback(locus); }
     }
     p.reader.getBook().setOrLoadPageAt(page(), locus, fn);
   }

@@ -2794,7 +2827,9 @@ Monocle.Dimensions.Columns = function (pageDiv) {
   function scrollerWidth() {
     var bdy = p.page.m.activeFrame.contentDocument.body;
     if (Monocle.Browser.has.iframeDoubleWidthBug) {
-      if (Monocle.Browser.iOSVersion < "4.1") {
+      if (Monocle.Browser.on.Android) {
+        return bdy.scrollWidth * 1.5; // I actually have no idea why 1.5.
+      } else if (Monocle.Browser.iOSVersion < "4.1") {
        var hbw = bdy.scrollWidth / 2;
        var sew = scrollerElement().scrollWidth;
        return Math.max(sew, hbw);

@@ -2969,6 +3004,7 @@ Monocle.Flippers.Slider = function (reader) {
   function setPage(pageDiv, locus, callback) {
+    ensureWaitControl();
     p.reader.getBook().setOrLoadPageAt(
       pageDiv,
       locus,

@@ -3048,6 +3084,7 @@ Monocle.Flippers.Slider = function (reader) {
     checkPoint(boxPointX);

     p.turnData.releasing = true;
+    showWaitControl(lowerPage());

     if (dir == k.FORWARDS) {
       if (

@@ -3088,14 +3125,18 @@ Monocle.Flippers.Slider = function (reader) {
   function onGoingBackward(x) {
-    var lp = lowerPage();
+    var lp = lowerPage(), up = upperPage();
+    showWaitControl(up);
     jumpOut(lp, // move lower page off-screen
       function () {
         flipPages(); // flip lower to upper
         setPage( // set upper page to previous
           lp,
           getPlace(lowerPage()).getLocus({ direction: k.BACKWARDS }),
-          function () { lifted(x); }
+          function () {
+            lifted(x);
+            hideWaitControl(up);
+          }
         );
       }
     );

@@ -3103,8 +3144,10 @@ Monocle.Flippers.Slider = function (reader) {
   function afterGoingForward() {
-    var up = upperPage();
+    var up = upperPage(), lp = lowerPage();
     if (p.interactive) {
+      showWaitControl(up);
+      showWaitControl(lp);
       setPage( // set upper (off screen) to current
         up,
         getPlace().getLocus({ direction: k.FORWARDS }),

@@ -3113,6 +3156,7 @@ Monocle.Flippers.Slider = function (reader) {
         }
       );
     } else {
+      showWaitControl(lp);
       flipPages();
       jumpIn(up, function () { prepareNextPage(announceTurn); });
     }

@@ -3171,6 +3215,8 @@ Monocle.Flippers.Slider = function (reader) {
   function announceTurn() {
+    hideWaitControl(upperPage());
+    hideWaitControl(lowerPage());
     p.reader.dispatchEvent('monocle:turn');
     resetTurnData();
   }

@@ -3319,12 +3365,14 @@ Monocle.Flippers.Slider = function (reader) {
   function jumpIn(pageDiv, callback) {
-    setX(pageDiv, 0, { duration: 1 }, callback);
+    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
+    setX(pageDiv, 0, { duration: dur }, callback);
   }

   function jumpOut(pageDiv, callback) {
-    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: 1 }, callback);
+    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
+    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: dur }, callback);
   }

@@ -3357,6 +3405,28 @@ Monocle.Flippers.Slider = function (reader) {
   }

+  function ensureWaitControl() {
+    if (p.waitControl) { return; }
+    p.waitControl = {
+      createControlElements: function (holder) {
+        return holder.dom.make('div', 'flippers_slider_wait');
+      }
+    }
+    p.reader.addControl(p.waitControl, 'page');
+  }
+
+  function showWaitControl(page) {
+    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
+    ctrl.style.opacity = 0.5;
+  }
+
+  function hideWaitControl(page) {
+    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
+    ctrl.style.opacity = 0;
+  }
+
   API.pageCount = p.pageCount;
   API.addPage = addPage;
   API.getPlace = getPlace;
BIN  resources/images/format-text-bold.png (new file, 5.0 KiB)
BIN  resources/images/format-text-italic.png (new file, 4.1 KiB)
BIN  resources/images/format-text-strikethrough.png (new file, 5.9 KiB)
BIN  resources/images/format-text-underline.png (new file, 4.4 KiB)
BIN  resources/images/hotmail.png (new file, 2.6 KiB)
BIN  resources/images/news/cnetjapan.png (new file, 892 B)
BIN  resources/images/news/deutsche_welle_bs.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_en.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_es.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_hr.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_pt.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_sr.png (new file, 445 B)
BIN  resources/images/news/endgadget_ja.png (new file, 698 B)
BIN  resources/images/news/jijinews.png (new file, 919 B)
BIN  resources/images/news/moscow_times.png (new file, 1.0 KiB)
BIN  resources/images/news/msnsankei.png (new file, 543 B)
BIN  resources/images/news/nikkei_free.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_economy.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_industory.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_life.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_main.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_sports.png (new file, 948 B)
BIN  resources/images/news/reuters.png (new file, 693 B)
BIN  resources/images/news/reuters_ja.png (new file, 693 B)
BIN  resources/images/news/rollingstone.png (new file, 1.3 KiB)
BIN  resources/images/news/the_workingham_times.png (new file, 1011 B)
resources/recipes/180.recipe | 50 lines (new file)
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
+'''
+180.com.uy
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Noticias(BasicNewsRecipe):
+    title = '180.com.uy'
+    __author__ = 'Gustavo Azambuja'
+    description = 'Noticias de Uruguay'
+    language = 'es'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 5
+    encoding = 'utf-8'
+    remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(name='div', attrs={'class':'tef-md tef-md-seccion-sociedad'})]
+    remove_tags = [
+        dict(name=['object','link'])
+    ]
+
+    remove_attributes = ['width','height', 'style', 'font', 'color']
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Titulares', u'http://www.180.com.uy/feed.php')
+    ]
+
+    def get_cover_url(self):
+        return 'http://www.180.com.uy/tplef/img/logo.gif'
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
@@ -1,10 +1,8 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

 '''
-aljazeera.net
+english.aljazeera.net
 '''
 from calibre.web.feeds.news import BasicNewsRecipe

@@ -12,41 +10,59 @@ class AlJazeera(BasicNewsRecipe):
     title = 'Al Jazeera in English'
     __author__ = 'Darko Miletic'
     description = 'News from Middle East'
     language = 'en'

     publisher = 'Al Jazeera'
     category = 'news, politics, middle east'
-    simultaneous_downloads = 1
-    delay = 4
-    oldest_article = 1
+    delay = 1
+    oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     encoding = 'iso-8859-1'
-    remove_javascript = True
     use_embedded_content = False
+    extra_css = """
+        body{font-family: Arial,sans-serif}
+        #ctl00_cphBody_dvSummary{font-weight: bold}
+        #dvArticleDate{font-size: small; color: #999999}
+    """
+    conversion_options = {
+          'comment'   : description
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }

-    html2lrf_options = [
-        '--comment', description
-        , '--category', category
-        , '--publisher', publisher
-        , '--ignore-tables'
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True'
-
-    keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})]
+    keep_only_tags = [
+        dict(attrs={'id':['DetailedTitle','ctl00_cphBody_dvSummary','dvArticleDate']})
+        ,dict(name='td',attrs={'class':'DetailedSummary'})
+    ]

     remove_tags = [
-        dict(name=['object','link'])
+        dict(name=['object','link','table','meta','base','iframe','embed'])
         ,dict(name='td', attrs={'class':['MostActiveDescHeader','MostActiveDescBody']})
     ]

     feeds = [(u'AL JAZEERA ENGLISH (AJE)', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989' )]

+    def get_article_url(self, article):
+        artlurl = article.get('link', None)
+        return artlurl.replace('http://english.aljazeera.net//','http://english.aljazeera.net/')
+
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         for item in soup.findAll(face=True):
             del item['face']
+        td = soup.find('td',attrs={'class':'DetailedSummary'})
+        if td:
+            td.name = 'div'
+        spn = soup.find('span',attrs={'id':'DetailedTitle'})
+        if spn:
+            spn.name = 'h1'
+        for itm in soup.findAll('span', attrs={'id':['dvArticleDate','ctl00_cphBody_lblDate']}):
+            itm.name = 'div'
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                tstr = alink.string
+                alink.replaceWith(tstr)
         return soup
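Since remove_tags now strips table, meta and iframe elements, preprocess_html above first rescues the parts worth keeping by renaming the summary td (and the title span) to plain block tags. A minimal standalone illustration of that renaming trick, assuming calibre's bundled BeautifulSoup module; the markup is a made-up fragment, not real Al Jazeera HTML:

    # Hypothetical fragment; shows why td.name = 'div' saves content from
    # a later name-based tag filter such as remove_tags' 'table'/'td' entries.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<td class="DetailedSummary">article summary</td>')
    td = soup.find('td', attrs={'class': 'DetailedSummary'})
    if td:
        td.name = 'div'  # same element, same children, new tag name
    print soup           # -> <div class="DetailedSummary">article summary</div>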
resources/recipes/arcamax.recipe | 110 lines (new file)
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = 'Copyright 2010 Starson17'
+'''
+www.arcamax.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Arcamax(BasicNewsRecipe):
+    title = 'Arcamax'
+    __author__ = 'Starson17'
+    __version__ = '1.03'
+    __date__ = '25 November 2010'
+    description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
+    category = 'news, comics'
+    language = 'en'
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+    cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'
+
+    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
+    num_comics_to_get = 7
+    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
+
+    conversion_options = {'linearize_tables' : True
+                        , 'comment' : description
+                        , 'tags' : category
+                        , 'language' : language
+                        }
+
+    keep_only_tags = [dict(name='div', attrs={'class':['toon']}),
+                     ]
+
+    def parse_index(self):
+        feeds = []
+        for title, url in [
+                ######## COMICS - GENERAL ########
+                #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
+                #(u"Agnes", u"http://www.arcamax.com/agnes"),
+                #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
+                (u"BC", u"http://www.arcamax.com/bc"),
+                #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
+                #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
+                (u"Blondie", u"http://www.arcamax.com/blondie"),
+                #(u"Boondocks", u"http://www.arcamax.com/boondocks"),
+                #(u"Cathy", u"http://www.arcamax.com/cathy"),
+                #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
+                (u"Dilbert", u"http://www.arcamax.com/dilbert"),
+                #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
+                (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
+                (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
+                #(u"Dustin", u"http://www.arcamax.com/dustin"),
+                (u"Family Circus", u"http://www.arcamax.com/familycircus"),
+                (u"Garfield", u"http://www.arcamax.com/garfield"),
+                #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
+                #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
+                #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
+                #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
+                #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
+                #(u"Luann", u"http://www.arcamax.com/luann"),
+                #(u"Momma", u"http://www.arcamax.com/momma"),
+                #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
+                (u"Mutts", u"http://www.arcamax.com/mutts"),
+                #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
+                #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
+                #(u"Pickles", u"http://www.arcamax.com/pickles"),
+                #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
+                #(u"Rubes", u"http://www.arcamax.com/rubes"),
+                #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
+                (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
+                (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
+                (u"Dilbert", u"http://www.arcamax.com/dilbert"),
+                (u"Zits", u"http://www.arcamax.com/zits"),
+            ]:
+            articles = self.make_links(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def make_links(self, url):
+        title = 'Temp'
+        current_articles = []
+        pages = range(1, self.num_comics_to_get+1)
+        for page in pages:
+            page_soup = self.index_to_soup(url)
+            if page_soup:
+                title = page_soup.find(name='div', attrs={'class':'toon'}).p.img['alt']
+                page_url = url
+                prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'next'}, text='Previous').parent['href']
+                current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
+                url = prev_page_url
+        current_articles.reverse()
+        return current_articles
+
+    def preprocess_html(self, soup):
+        main_comic = soup.find('p',attrs={'class':'m0'})
+        if main_comic.a['target'] == '_blank':
+            main_comic.a.img['id'] = 'main_comic'
+        return soup
+
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+        img#main_comic {max-width:100%; min-width:100%;}
+        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+    '''
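make_links above collects a strip's history by fetching the current page, hopping to the 'Previous' link, and repeating num_comics_to_get times, then reversing the list so the feed reads oldest to newest. A standalone sketch of that same pattern, assuming only the BasicNewsRecipe.index_to_soup helper used throughout these recipes (collect_history and its arguments are illustrative names, not part of the recipe):

    # Sketch: generic walk-the-'Previous'-link pagination, oldest strip first.
    def collect_history(recipe, url, days=7):
        items = []
        for _ in range(days):
            soup = recipe.index_to_soup(url)       # fetch and parse one strip page
            if soup is None:
                break
            items.append({'title': url, 'url': url, 'description': '', 'date': ''})
            # the 'Previous' text node's parent is the anchor, as in make_links
            prev = soup.find('a', attrs={'class': 'next'}, text='Previous')
            if prev is None:                       # no older strip to walk to
                break
            url = 'http://www.arcamax.com' + prev.parent['href']
        items.reverse()                            # feed order: oldest to newest
        return items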
@@ -13,6 +13,7 @@ class Dnevnik(BasicNewsRecipe):
     labguage = 'sl'
     no_stylesheets = True
     use_embedded_content = False
+    language = 'sl'

     conversion_options = {'linearize_tables' : True}
resources/recipes/bangkok_biz.recipe | 25 lines (new file)
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1290689337(BasicNewsRecipe):
+    __author__ = 'Anat R.'
+    language = 'th'
+    title = u'Bangkok Biz News'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+    feeds = [(u'Headlines',
+              u'http://www.bangkokbiznews.com/home/services/rss/home.xml'),
+             (u'Politics', u'http://www.bangkokbiznews.com/home/services/rss/politics.xml'),
+             (u'Business', u'http://www.bangkokbiznews.com/home/services/rss/business.xml'),
+             (u'Finance', u'http://www.bangkokbiznews.com/home/services/rss/finance.xml'),
+             (u'Technology', u'http://www.bangkokbiznews.com/home/services/rss/it.xml')]
+    remove_tags_before = dict(name='div', attrs={'class':'box-Detailcontent'})
+    remove_tags_after = dict(name='p', attrs={'class':'allTags'})
+    remove_tags = []
+    remove_tags.append(dict(name = 'div', attrs = {'id': 'content-tools'}))
+    remove_tags.append(dict(name = 'p', attrs = {'class':'allTags'}))
+    remove_tags.append(dict(name = 'div', attrs = {'id':'morePic'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class':'tabs-nav'}))
resources/recipes/bitacora.recipe | 58 lines (new file)
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
+'''
+bitacora.com.uy
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class General(BasicNewsRecipe):
+    title = 'bitacora.com.uy'
+    __author__ = 'Gustavo Azambuja'
+    description = 'Noticias de Uruguay'
+    language = 'es'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 5
+    encoding = 'iso-8859-1'
+    remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(id=['txt'])]
+    remove_tags = [
+        dict(name='div', attrs={'class':'tablafoot'}),
+        dict(name=['object','h4']),
+        dict(name=['object','link'])
+    ]
+
+    remove_attributes = ['width','height', 'style', 'font', 'color']
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Titulares', u'http://www.bitacora.com.uy/anxml.cgi?15')
+    ]
+
+    def get_cover_url(self):
+        cover_url = None
+        index = 'http://www.bitacora.com.uy'
+        soup = self.index_to_soup(index)
+        link_item = soup.find('img',attrs={'class':'imgtapa'})
+        if link_item:
+            cover_url = "http://www.bitacora.com.uy/"+link_item['src']
+        return cover_url
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
resources/recipes/biz_portal.recipe | 40 lines (new file)
@@ -0,0 +1,40 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1283848012(BasicNewsRecipe):
+    description = 'This is a recipe of BizPortal.co.il.'
+    cover_url = 'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'
+    title = u'BizPortal'
+    language = 'he'
+    __author__ = 'marbs'
+    extra_css = 'img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
+    simultaneous_downloads = 5
+    remove_javascript = True
+    timefmt = '[%a, %d %b, %Y]'
+    remove_empty_feeds = True
+    oldest_article = 1
+    max_articles_per_feed = 100
+    remove_attributes = ['width']
+    # keep_only_tags = dict(name='div', attrs={'id':'articleContainer'})
+    remove_tags = [dict(name='img', attrs={'src':['images/bizlogo_nl.gif']})]
+    #preprocess_regexps = [
+    #    (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: '')
+    #    ]
+
+    feeds = [(u'חדשות שוק ההון', u'http://www.bizportal.co.il/shukhahon/messRssUTF2.xml'),
+             (u'חדשות וול סטריט בעברית', u'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'),
+             (u'שיווק ופרסום', u'http://www.bizportal.co.il/shukhahon/messRssUTF145.xml'),
+             (u'משפט', u'http://www.bizportal.co.il/shukhahon/messRssUTF3.xml'),
+             (u'ניתוח טכני', u'http://www.bizportal.co.il/shukhahon/messRssUTF5.xml'),
+             (u'דיני עבודה ושכר', u'http://www.bizportal.co.il/shukhahon/messRssUTF6.xml'),
+             (u'מיסוי', u'http://www.bizportal.co.il/shukhahon/messRssUTF7.xml'),
+             (u'טאבו', u'http://www.bizportal.co.il/shukhahon/messRssUTF8.xml'),
+             (u'נדל"ן', u'http://www.bizportal.co.il/shukhahon/messRssUTF160.xml'),
+            ]
+
+    def print_version(self, url):
+        split1 = url.split("=")
+        print_url = 'http://www.bizportal.co.il/web/webnew/shukhahon/biznews02print.shtml?mid=' + split1[1]
+        return print_url
@@ -1,18 +1,22 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 mode: python -*-
+
+# Find the newest version of this recipe here:
+# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe

 __license__ = 'GPL v3'
-__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
-__version__ = '0.95'
+__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
+__version__ = '0.96'

 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
 import string
 from calibre.web.feeds.recipes import BasicNewsRecipe


 class BrandEins(BasicNewsRecipe):

-    title = u'Brand Eins'
+    title = u'brand eins'
     __author__ = 'Constantin Hofstetter'
     description = u'Wirtschaftsmagazin'
     publisher ='brandeins.de'

@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'utf-8'
     language = 'de'
+    publication_type = 'magazine'
+    needs_subscription = 'optional'

     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
     # 3 is one before 2 etc.
-    which_ausgabe = 2
+    # This value can be set via the username field.
+    default_issue = 2

     keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):

         return soup

+    def get_cover(self, soup):
+        cover_url = None
+        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
+        if cover_item:
+            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
+        return cover_url
+
     def parse_index(self):
         feeds = []

         archive = "http://www.brandeins.de/archiv.html"

+        issue = self.default_issue
+        if self.username:
+            try:
+                issue = int(self.username)
+            except:
+                pass
+
         soup = self.index_to_soup(archive)
         latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
         url = pre_latest_issue.get('href', False)
-        # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
-        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+        # Get month and year of the magazine issue - build it out of the title of the cover
+        self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url

         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"

@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):

     def brand_eins_parse_latest_issue(self, url):
         soup = self.index_to_soup(url)
+        self.cover_url = self.get_cover(soup)
         article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

         titles_and_articles = []

@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
             current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
         titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
+
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'en'
|
|
||||||
masthead_url = 'http://www.cbc.ca/includes/gfx/cbcnews_logo_09.gif'
|
masthead_url = 'http://www.cbc.ca/includes/gfx/cbcnews_logo_09.gif'
|
||||||
cover_url = 'http://img692.imageshack.us/img692/2814/cbc.png'
|
cover_url = 'http://img692.imageshack.us/img692/2814/cbc.png'
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':['storyhead','storybody']})]
|
keep_only_tags = [dict(name='div', attrs={'id':['storyhead','storybody']})]
|
||||||
|
resources/recipes/cnetjapan.recipe | 32 lines (new file)
@@ -0,0 +1,32 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CNetJapan(BasicNewsRecipe):
+    title = u'CNET Japan'
+    oldest_article = 3
+    max_articles_per_feed = 30
+    __author__ = 'Hiroshi Miura'
+
+    feeds = [(u'cnet rss', u'http://feeds.japan.cnet.com/cnet/rss')]
+    language = 'ja'
+    encoding = 'Shift_JIS'
+    remove_javascript = True
+
+    preprocess_regexps = [
+        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
+         lambda match: '</body>'),
+        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
+         lambda match: '</body>'),
+        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
+         lambda match: '<!-- removed -->'),
+    ]
+
+    remove_tags_before = dict(name="h2")
+    remove_tags = [
+        {'class':"social_bkm_share"},
+        {'class':"social_bkm_print"},
+        {'class':"block20 clearfix"},
+        dict(name="div",attrs={'id':'bookreview'}),
+    ]
+    remove_tags_after = {'class':"block20"}
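Each preprocess_regexps pair above deletes everything from a marker comment through </body>, trimming ad and footer markup before the HTML is parsed. A standalone sketch of how one such rule fires, on toy HTML with the same compile flags as the recipe:

    import re

    html = '<html><body>article text<!--AD_ELU_HEADER-->ads and footer</body></html>'
    rules = [(re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
              lambda match: '</body>')]
    for pattern, replacement in rules:
        html = pattern.sub(replacement, html)  # replacement callable gets the match
    print html  # -> <html><body>article text</body></html>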
@@ -347,6 +347,7 @@ class Comics(BasicNewsRecipe):
                 title = strip_tag['title']
                 print 'title: ', title
                 current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
+        current_articles.reverse()
         return current_articles

     extra_css = '''
resources/recipes/cosmopolitan.recipe | 69 lines (new file)
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
+'''
+Cosmopolitan
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class General(BasicNewsRecipe):
+    title = 'Cosmopolitan'
+    __author__ = 'Gustavo Azambuja'
+    description = 'Revista Cosmopolitan, Edicion Espanola'
+    language = 'es'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 1
+    encoding = 'utf8'
+    remove_javascript = True
+    no_stylesheets = True
+    conversion_options = {'linearize_tables': True}
+
+    oldest_article = 180
+    max_articles_per_feed = 100
+    keep_only_tags = [
+        dict(id=['contenido']),
+        dict(name='td', attrs={'class':['contentheading', 'txt_articulo']})
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'class':['breadcrumb', 'bloque1', 'article', 'bajo_title', 'tags_articles', 'otrosenlaces_title', 'otrosenlaces_parent', 'compartir']}),
+        dict(name='div', attrs={'id':'comment'}),
+        dict(name='table', attrs={'class':'pagenav'}),
+        dict(name=['object','link'])
+    ]
+    remove_attributes = ['width','height', 'style', 'font', 'color']
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        img {float:left; clear:both; margin:10px}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Articulos', u'http://feeds.feedburner.com/cosmohispano/FSSt')
+    ]
+
+    def preprocess_html(self, soup):
+        attribs = [ 'style','font','valign'
+                    ,'colspan','width','height'
+                    ,'rowspan','summary','align'
+                    ,'cellspacing','cellpadding'
+                    ,'frames','rules','border'
+                  ]
+        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
+            item.name = 'div'
+            for attrib in attribs:
+                if item.has_key(attrib):
+                    del item[attrib]
+        return soup
+
+    def get_cover_url(self):
+        cover_url = None
+        index = 'http://www.cosmohispano.com/revista'
+        soup = self.index_to_soup(index)
+        link_item = soup.find('img',attrs={'class':'img_portada'})
+        if link_item:
+            cover_url = "http://www.cosmohispano.com"+link_item['src']
+        return cover_url
61
resources/recipes/deredactie.recipe
Normal file
@ -0,0 +1,61 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class deredactie(BasicNewsRecipe):
    title = u'Deredactie.be'
    __author__ = 'malfi'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://www.deredactie.be/polopoly_fs/1.510827!image/2710428628.gif'
    language = 'de'
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'articlehead'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'articlebody'}))
    remove_tags = []
    remove_tags.append(dict(name = 'div', attrs = {'id': 'story'}))
    remove_tags.append(dict(name = 'div', attrs = {'id': 'useractions'}))
    remove_tags.append(dict(name = 'hr'))

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
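    # Build the section list from the German-language navigation menu, then
    # collect each section's article links, skipping duplicate URLs.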
    def parse_index(self):
        categories = []
        catnames = {}
        soup = self.index_to_soup('http://www.deredactie.be/cm/vrtnieuws.deutsch')
        for elem in soup.findAll('li', attrs={'id' : re.compile("^navItem[2-9]") }):
            a = elem.find('a', href=True)
            m = re.search('(?<=/)[^/]*$', a['href'])
            cat = str(m.group(0))
            categories.append(cat)
            catnames[cat] = a['title']
            self.log("found cat %s\n" % catnames[cat])

        feeds = []

        for cat in categories:
            articles = []
            soup = self.index_to_soup('http://www.deredactie.be/cm/vrtnieuws.deutsch/'+cat)
            for a in soup.findAll('a',attrs={'href' : re.compile("deutsch.*/[0-9][0-9][0-9][0-9][0-9][0-9]_")}):
                skip_this_article = False
                url = a['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.deredactie.be' + url
                myarticle = ({'title':self.tag_to_string(a), 'url':url, 'description':'', 'date':''})
                for article in articles:
                    if article['url'] == url:
                        skip_this_article = True
                        self.log("SKIPPING DUP %s" % url)
                        break
                if skip_this_article:
                    continue
                articles.append(myarticle)
                self.log("Adding URL %s\n" % url)
            if articles:
                feeds.append((catnames[cat], articles))
        return feeds
76
resources/recipes/deutsche_welle_bs.recipe
Normal file
@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_bs(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'Vijesti iz Njemacke i svijeta'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'bs'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                               body{font-family: Arial,sans1,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """
    # Map U+0110 to U+00D0 so fonts lacking the former still render it
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'          , u'http://rss.dw-world.de/rdf/rss-bos-pol')
       ,(u'Evropa'            , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
       ,(u'Kiosk'             , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
       ,(u'Ekonomija i Nauka' , u'http://rss.dw-world.de/rdf/rss-bos-eco')
       ,(u'Kultura'           , u'http://rss.dw-world.de/rdf/rss-bos-cul')
       ,(u'Sport'             , u'http://rss.dw-world.de/rdf/rss-bos-sp' )
    ]
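    # DW serves a chrome-free print view keyed by the trailing article id.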
    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        # Unwrap anchors: keep plain text for text links and keep the image
        # (as a div) for image links, so no dead links remain in the output.
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
66
resources/recipes/deutsche_welle_en.recipe
Normal file
@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_en(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'News from Germany and World'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'en'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               body{font-family: Arial,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'All news', u'http://rss.dw-world.de/rdf/rss-en-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
66
resources/recipes/deutsche_welle_es.recipe
Normal file
@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_es(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'Noticias desde Alemania y mundo'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'es'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               body{font-family: Arial,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-sp-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
74
resources/recipes/deutsche_welle_hr.recipe
Normal file
@ -0,0 +1,74 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_hr(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'Vesti iz Njemacke i svijeta'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'hr'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                               body{font-family: Arial,sans1,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Svijet'    , u'http://rss.dw-world.de/rdf/rss-cro-svijet')
       ,(u'Europa'    , u'http://rss.dw-world.de/rdf/rss-cro-eu'    )
       ,(u'Njemacka'  , u'http://rss.dw-world.de/rdf/rss-cro-ger'   )
       ,(u'Vijesti'   , u'http://rss.dw-world.de/rdf/rss-cro-all'   )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
66
resources/recipes/deutsche_welle_pt.recipe
Normal file
@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_pt(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'Noticias desde Alemania y mundo'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'pt'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               body{font-family: Arial,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-br-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
79
resources/recipes/deutsche_welle_sr.recipe
Normal file
@ -0,0 +1,79 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_sr(BasicNewsRecipe):
    title                 = 'Deutsche Welle'
    __author__            = 'Darko Miletic'
    description           = 'Vesti iz Nemacke i sveta'
    publisher             = 'Deutsche Welle'
    category              = 'news, politics, Germany'
    oldest_article        = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'sr'
    publication_type      = 'newsportal'
    remove_empty_feeds    = True
    masthead_url          = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css             = """
                               @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                               body{font-family: Arial,sans1,sans-serif}
                               img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                               .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
                            """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'  : description
      , 'tags'     : category
      , 'publisher': publisher
      , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
       ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags    = [dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'               , u'http://rss.dw-world.de/rdf/rss-ser-pol'       )
       ,(u'Srbija'                 , u'http://rss.dw-world.de/rdf/rss-ser-pol-ser'   )
       ,(u'Region'                 , u'http://rss.dw-world.de/rdf/rss-ser-pol-region')
       ,(u'Evropa'                 , u'http://rss.dw-world.de/rdf/rss-ser-pol-eu'    )
       ,(u'Nemacka'                , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'   )
       ,(u'Svet'                   , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'   )
       ,(u'Pregled stampe'         , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger'   )
       ,(u'Nauka Tehnika Medicina' , u'http://rss.dw-world.de/rdf/rss-ser-science'   )
       ,(u'Kultura'                , u'http://rss.dw-world.de/rdf/rss-ser-cul'       )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
80
resources/recipes/el_pais_uy.recipe
Normal file
@ -0,0 +1,80 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.elpais.com.uy/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'El Pais - Uruguay'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay y el resto del mundo'
    publisher = 'EL PAIS S.A.'
    category = 'news, politics, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 2
    encoding = 'iso-8859-1'
    masthead_url = 'http://www.elpais.com.uy/Images/09/cabezal/logo_PDEP.png'
    publication_type = 'newspaper'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 200
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'id':'Contenido'})
    ]

    conversion_options = {
        'comment'   : description
      , 'tags'      : category
      , 'publisher' : publisher
      , 'language'  : language
    }
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object','form']),
        dict(name=['object','table'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        body{font-family: Verdana,Arial,Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block;}
    '''
    feeds = [
        (u'Ultimo Momento', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=umomento'),
        (u'Editorial', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=editorial'),
        (u'Nacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=nacional'),
        (u'Internacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=internacional'),
        (u'Espectaculos', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=espectaculos'),
        (u'Deportes', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=deportes'),
        (u'Ciudades', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=ciudades'),
        (u'Economia', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=economia')
    ]
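    # The cover is scraped from the front-page box that shows the day's
    # printed edition.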
    def get_cover_url(self):
        cover_url = None
        index = 'http://www.elpais.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('div',attrs={'class':'boxmedio box257'})
        if link_item:
            cover_url = 'http://www.elpais.com.uy'+link_item.img['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
22
resources/recipes/endgadget_ja.recipe
Normal file
@ -0,0 +1,22 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
japan.engadget.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class EndgadgetJapan(BasicNewsRecipe):
    title = u'Endgadget\u65e5\u672c\u7248'
    __author__ = 'Hiroshi Miura'
    language = 'ja'
    encoding = 'utf-8'
    cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg'
    masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
58
resources/recipes/eu_commission.recipe
Normal file
@ -0,0 +1,58 @@
from calibre.web.feeds.news import BasicNewsRecipe

LANGUAGE = 'de'

def feedlink(num):
    # Each policy area has a numeric feed id on europa.eu/rapid; the lang
    # parameter selects the localized version of that feed.
    return u'http://europa.eu/rapid/syndication/QuickRSSAction.do?id='+\
           str(num)+'&lang='+ LANGUAGE

class EUCommissionPress(BasicNewsRecipe):
    title = u'Pressemitteilungen der EU Kommission pro Politikbereich'
    __author__ = 'malfi'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://ec.europa.eu/wel/template_2007/images/banners/banner-background.jpg'
    language = LANGUAGE
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'pressReleaseContentMain'}))
    remove_tags = []

    feeds = [
        (u'Pressemitteilung des Tages',feedlink(64)),
        (u'Presidency',feedlink(137)),
        (u'Foreign affairs and security policy',feedlink(138)),
        (u'Agriculture and rural development',feedlink(139)),
        (u'Budget and financial programming',feedlink(140)),
        (u'Climate action',feedlink(141)),
        (u'Competition',feedlink(142)),
        (u'Development',feedlink(143)),
        (u'Digital agenda',feedlink(144)),
        (u'Economic and monetary affairs',feedlink(145)),
        (u'Education, culture, multilingualism and youth',feedlink(146)),
        (u'Employment, social affairs and inclusion',feedlink(147)),
        (u'Energy',feedlink(148)),
        (u'Enlargement and European neighbourhood policy',feedlink(149)),
        (u'Environment',feedlink(150)),
        (u'Health and consumer policy',feedlink(151)),
        (u'Home affairs',feedlink(152)),
        (u'Industry and entrepreneurship',feedlink(153)),
        (u'Inter-Institutional relations and administration',feedlink(154)),
        (u'Internal market and services',feedlink(155)),
        (u'International cooperation, humanitarian aid and crisis response',feedlink(156)),
        (u'Justice, fundamental rights and citizenship',feedlink(157)),
        (u'Maritime affairs and fisheries',feedlink(158)),
        (u'Regional policy',feedlink(159)),
        (u'Research and innovation',feedlink(160)),
        (u'Taxation and customs union, audit and anti-fraud',feedlink(161)),
        (u'Trade',feedlink(162)),
        (u'Transport',feedlink(163))
    ]
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
51
resources/recipes/european_voice.recipe
Normal file
@ -0,0 +1,51 @@
from calibre.web.feeds.news import BasicNewsRecipe

class EuropeanVoice(BasicNewsRecipe):
    title = u'European Voice'
    __author__ = 'malfi'
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://www.europeanvoice.com/Css/images/logo.gif'
    language = 'en'
    keep_only_tags = [dict(name='div', attrs={'id':'articleLeftColumn'})]
    remove_tags = [dict(name='div', attrs={'id':'BreadCrump'})]
    feeds = [
        (u'Whole site', u'http://www.europeanvoice.com/Rss/2.xml'),
        (u'News and analysis', u'http://www.europeanvoice.com/Rss/6.xml'),
        (u'Comment', u'http://www.europeanvoice.com/Rss/7.xml'),
        (u'Special reports', u'http://www.europeanvoice.com/Rss/5.xml'),
        (u'People', u'http://www.europeanvoice.com/Rss/8.xml'),
        (u'Career', u'http://www.europeanvoice.com/Rss/11.xml'),
        (u'Policies', u'http://www.europeanvoice.com/Rss/4.xml'),
        (u'EVents', u'http://www.europeanvoice.com/Rss/10.xml'),
        (u'Policies - Economics', u'http://www.europeanvoice.com/Rss/31.xml'),
        (u'Policies - Business', u'http://www.europeanvoice.com/Rss/19.xml'),
        (u'Policies - Trade', u'http://www.europeanvoice.com/Rss/25.xml'),
        (u'Policies - Information society', u'http://www.europeanvoice.com/Rss/20.xml'),
        (u'Policies - Energy', u'http://www.europeanvoice.com/Rss/15.xml'),
        (u'Policies - Transport', u'http://www.europeanvoice.com/Rss/18.xml'),
        (u'Policies - Climate change', u'http://www.europeanvoice.com/Rss/16.xml'),
        (u'Policies - Environment', u'http://www.europeanvoice.com/Rss/17.xml'),
        (u'Policies - Farming & food', u'http://www.europeanvoice.com/Rss/23.xml'),
        (u'Policies - Health & society', u'http://www.europeanvoice.com/Rss/24.xml'),
        (u'Policies - Justice', u'http://www.europeanvoice.com/Rss/29.xml'),
        (u'Policies - Foreign affairs', u'http://www.europeanvoice.com/Rss/27.xml')
    ]
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    def print_version(self, url):
        return url + '?bPrint=1'
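    # Skip subscriber-only articles instead of downloading a paywall stub.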
    def preprocess_html(self, soup):
        denied = soup.findAll(True, text='Subscribers')
        if denied:
            raise Exception('Article skipped: content is available to subscribers only')
        return soup
100
resources/recipes/freeway.recipe
Normal file
@ -0,0 +1,100 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://freeway.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class':'titulo_art_ppal'}),
        dict(name='img', attrs={'class':'recuadro'}),
        dict(name='td', attrs={'class':'txt_art_ppal'})
    ]
    remove_tags = [
        dict(name=['object','link'])
    ]
    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    def parse_index(self):
        feeds = []
        for title, url in [('Articulos', 'http://freeway.com.uy/revista/')]:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
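    # Articles live in td cells inside the 'ancho_articulos' blocks that
    # follow the #tbl_1 container on the magazine index page.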
    def art_parse_section(self, url):
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'id': 'tbl_1'})

        current_articles = []
        for tag in div.findAllNext(attrs = {'class': 'ancho_articulos'}):
            if tag.get('class') == 'link-list-heading':
                break
            for td in tag.findAll('td'):
                a = td.find('a', attrs= {'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://freeway.com.uy'+url
                p = td.find('p', attrs= {'class': 'txt_articulos'})
                description = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', description)
                current_articles.append({'title': title, 'url': url, 'description':description, 'date':''})

        return current_articles

    def preprocess_html(self, soup):
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'
@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__author__ = u'Marc T\xf6nsing'
+__author__ = u'Marc Toensing'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
@ -17,6 +17,7 @@ class GamespotCom(BasicNewsRecipe):
     no_javascript = True
 
     feeds = [
+        ('All Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5'),
         ('PC Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=5'),
         ('XBOX 360 Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1029'),
         ('Wii Reviews', 'http://www.gamespot.com/rss/game_updates.php?type=5&platform=1031'),
@ -37,5 +38,3 @@ class GamespotCom(BasicNewsRecipe):
 
     def get_article_url(self, article):
         return article.get('link') + '?print=1'
-
-
28
resources/recipes/german_gov.recipe
Normal file
@ -0,0 +1,28 @@
import re

from calibre.web.feeds.news import BasicNewsRecipe

class GermanGovermentPress(BasicNewsRecipe):
    title = u'Pressemitteilungen der Bundesregierung'
    __author__ = 'malfi'
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://www.bundesregierung.de/static/images/logoBR.gif'
    language = 'de'
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'h2'))
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'textblack'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'subtitle'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text'}))
    remove_tags = []
    feeds = [(u'Pressemitteilungen', u'http://www.bundesregierung.de/Webs/Breg/DE/Service/RSS/Functions/bundesregierungPressemitteilungenRSS20,templateId=renderNewsfeed.rdf')]
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    def print_version(self, url):
        # The print view is reached by swapping the .html suffix for the
        # Druckansicht layout variant.
        m = re.search(r'^(.*).html$', url)
        return str(m.group(1)) + ',layoutVariant=Druckansicht.html'
@ -1,7 +1,7 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
 
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__copyright__ = '2010, Szing'
 __docformat__ = 'restructuredtext en'
 
 '''
@ -10,49 +10,52 @@ globeandmail.com
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
-class GlobeAndMail(BasicNewsRecipe):
-    title = u'Globe and Mail'
-    language = 'en_CA'
-    __author__ = 'Kovid Goyal'
+class AdvancedUserRecipe1287083651(BasicNewsRecipe):
+    title = u'Globe & Mail'
+    __license__ = 'GPL v3'
+    __author__ = 'Szing'
     oldest_article = 2
-    max_articles_per_feed = 10
     no_stylesheets = True
-    extra_css = '''
-        h3 {font-size: 22pt; font-weight:bold; margin:0px; padding:0px 0px 8pt 0px;}
-        h4 {margin-top: 0px;}
-        #byline { font-family: monospace; font-weight:bold; }
-        #placeline {font-weight:bold;}
-        #credit {margin-top:0px;}
-        .tag {font-size: 22pt;}'''
-    description = 'Canada\'s national newspaper'
-    keep_only_tags = [dict(name='article')]
-    remove_tags = [dict(name='aside'),
-        dict(name='footer'),
-        dict(name='div', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articlecommentcountholder' in x.split(' '))}),
-        dict(name='ul', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articletoolbar' in x.split(' '))}),
-        ]
-    feeds = [
-        (u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'),
-        (u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
-        (u'National', u'http://www.theglobeandmail.com/news/national/?service=rss'),
-        (u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
-        (u'World', u'http://www.theglobeandmail.com/news/world/?service=rss'),
-        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
-        (u'Opinions', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
-        (u'Columnists', u'http://www.theglobeandmail.com/news/opinions/columnists/?service=rss'),
-        (u'Globe Investor', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
-        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
-        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
-        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
-        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
-        (u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
-        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
-        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss')
-        ]
-
-    def get_article_url(self, article):
-        url = BasicNewsRecipe.get_article_url(self, article)
-        if '/video/' not in url:
-            return url
+    max_articles_per_feed = 100
+    encoding = 'utf8'
+    publisher = 'Globe & Mail'
+    language = 'en_CA'
+    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'
+
+    feeds = [
+        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
+        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
+        (u'Commentary', u'http://www.theglobeandmail.com/report-on-business/commentary/?service=rss'),
+        (u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
+        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
+        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
+        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
+        (u'Top Political Stories', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
+        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
+        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
+        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
+        (u'Auto', u'http://www.theglobeandmail.com/sports/?service=rss'),
+        (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
+    ]
+
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='h2', attrs={'id':'articletitle'}),
+        dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
+        dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
+        dict(name='id', attrs={'class':'article'}),
+        dict(name='table', attrs={'class':'todays-market'}),
+        dict(name='header', attrs={'id':'leadheader'})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
+    ]
+
+    #this has to be here or the text in the article appears twice.
+    remove_tags_after = [dict(id='article')]
+
+    #Use the mobile version rather than the web version
+    def print_version(self, url):
+        return url + '&service=mobile'
47
resources/recipes/globes_co_il.recipe
Normal file
@ -0,0 +1,47 @@
import re

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    title = u'Globes'
    description = 'This is Globes.co.il.'
    __author__ = 'marbs'
    language = 'he'
    cover_url = 'http://www.the7eye.org.il/SiteCollectionImages/BAKTANA/arye_avnery_010709_377.jpg'
    extra_css = 'img {max-width:100%;} body{direction: rtl;max-width:100%;}title{direction: rtl; } article_description{direction: rtl; }, a.article{direction: rtl;max-width:100%;} calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_attributes = ['width','style']

    feeds = [(u'שוק ההון', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=585'),
             (u'נדל"ן', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=607'),
             (u'וול סטריט ושווקי העולם', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=1225'),
             (u'ניתוח טכני', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=1294'),
             (u'היי טק', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=594'),
             (u'נתח שוק וצרכנות', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=821'),
             (u'דין וחשבון', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=829'),
             (u'רכב', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3220'),
             (u'דעות', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=845'),
             (u'קניון המניות - טור שבועי', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3175'),
             (u'סביבה', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3221')]

    def print_version(self, url):
        split1 = url.split("=")
        print_url = 'http://www.globes.co.il/serve/globes/printwindow.asp?did=' + split1[1]
        return print_url
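    # The print view wraps the article in layout tables; drop the black
    # separator rows before conversion.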
    def preprocess_html(self, soup):
        soup.find('tr',attrs={'bgcolor':'black'}).findPrevious('tr').extract()
        soup.find('tr',attrs={'bgcolor':'black'}).extract()
        return soup

    def fixChars(self, string):
        # Replace lsquo (\x91) with a proper left single quotation mark
        fixed = re.sub(u'\x91', u'\u2018', string)
        return fixed
41
resources/recipes/handelsblatt.recipe
Normal file
@ -0,0 +1,41 @@
import re

from calibre.web.feeds.news import BasicNewsRecipe

class Handelsblatt(BasicNewsRecipe):
    title = u'Handelsblatt'
    __author__ = 'malfi'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://www.handelsblatt.com/images/logo/logo_handelsblatt.com.png'
    language = 'de'
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'structOneCol'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'fullText'}))
    remove_tags = [dict(name='img', attrs = {'src': 'http://www.handelsblatt.com/images/icon/loading.gif'})]

    feeds = [
        (u'Handelsblatt Exklusiv',u'http://www.handelsblatt.com/rss/exklusiv'),
        (u'Handelsblatt Top-Themen',u'http://www.handelsblatt.com/rss/top-themen'),
        (u'Handelsblatt Schlagzeilen',u'http://www.handelsblatt.com/rss/ticker/'),
        (u'Handelsblatt Finanzen',u'http://www.handelsblatt.com/rss/finanzen/'),
        (u'Handelsblatt Unternehmen',u'http://www.handelsblatt.com/rss/unternehmen/'),
        (u'Handelsblatt Politik',u'http://www.handelsblatt.com/rss/politik/'),
        (u'Handelsblatt Technologie',u'http://www.handelsblatt.com/rss/technologie/'),
        (u'Handelsblatt Meinung',u'http://www.handelsblatt.com/rss/meinung'),
        (u'Handelsblatt Magazin',u'http://www.handelsblatt.com/rss/magazin/'),
        (u'Handelsblatt Weblogs',u'http://www.handelsblatt.com/rss/blogs')
    ]
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
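    # The numeric id after the semicolon in the URL selects the print layout.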
    def print_version(self, url):
        m = re.search('(?<=;)[0-9]*', url)
        return u'http://www.handelsblatt.com/_b=' + str(m.group(0)) + ',_p=21,_t=ftprint,doc_page=0;printpage'
38
resources/recipes/hannoversche_zeitung.recipe
Normal file
@ -0,0 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1287519083(BasicNewsRecipe):
    title = u'Hannoversche Allgemeine Zeitung'
    __author__ = 'Artemis'
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'de'
    no_stylesheets = True

    feeds = [
        #(u'Schlagzeilen', u'http://www.haz.de/rss/feed/haz_schlagzeilen'),
        (u'Politik', u'http://www.haz.de/rss/feed/haz_politik'),
        (u'Wirtschaft', u'http://www.haz.de/rss/feed/haz_wirtschaft'),
        (u'Panorama', u'http://www.haz.de/rss/feed/haz_panorama'),
        (u'Wissen', u'http://www.haz.de/rss/feed/haz_wissen'),
        (u'Kultur', u'http://www.haz.de/rss/feed/haz_kultur'),
        (u'Sp\xe4tvorstellung', u'http://www.haz.de/rss/feed/haz_spaetvorstellung'),
        (u'Hannover & Region', u'http://www.haz.de/rss/feed/haz_hannoverregion'),
        (u'Netzgefl\xfcster', u'http://www.haz.de/rss/feed/haz_netzgefluester'),
        (u'Meinung', u'http://www.haz.de/rss/feed/haz_meinung'),
        (u'ZiSH', u'http://www.haz.de/rss/feed/haz_zish'),
        (u'Medien', u'http://www.haz.de/rss/feed/haz_medien'),
        #(u'Sport', u'http://www.haz.de/rss/feed/haz_sport'),
        #(u'Hannover 96', u'http://www.haz.de/rss/feed/haz_hannover96')
    ]

    remove_tags_before = dict(id='modul_artikel')
    remove_tags_after  = dict(id='articlecontent')

    remove_tags = [
        dict(id='articlesidebar'),
        dict(name='div', attrs={'class':['articlecomment',
            'articlebookmark', 'teaser_anzeige', 'teaser_umfrage',
            'navigation', 'subnavigation']})
    ]
@ -13,7 +13,6 @@ class IrishTimes(BasicNewsRecipe):
     language = 'en_IE'
     timefmt = ' (%A, %B %d, %Y)'
 
-
     oldest_article = 3
     no_stylesheets = True
     simultaneous_downloads= 1
@ -33,13 +32,13 @@ class IrishTimes(BasicNewsRecipe):
         ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
     ]
 
     def print_version(self, url):
         if url.count('rss.feedsportal.com'):
-            u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
+            u = 'http://www.irishtimes.com' + \
+                (((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
         else:
             u = url.replace('.html','_pf.html')
         return u
 
     def get_article_url(self, article):
         return article.link
26
resources/recipes/jijinews.recipe
Normal file
@ -0,0 +1,26 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.jiji.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class JijiDotCom(BasicNewsRecipe):
    title = u'\u6642\u4e8b\u901a\u4fe1'
    __author__ = 'Hiroshi Miura'
    description = 'World News from Jiji Press'
    publisher = 'Jiji Press Ltd.'
    category = 'news'
    encoding = 'utf-8'
    oldest_article = 6
    max_articles_per_feed = 100
    language = 'ja'
    cover_url = 'http://www.jiji.com/img/top_header_logo2.gif'
    masthead_url = 'http://jen.jiji.com/images/logo_jijipress.gif'

    feeds = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')]
    remove_tags_after = dict(id="ad_google")
48
resources/recipes/la_diaria.recipe
Normal file
@ -0,0 +1,48 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
ladiaria.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'La Diaria'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['article'])]
    remove_tags = [
        dict(name='div', attrs={'class':['byline', 'hr', 'titlebar', 'volver-arriba-right']}),
        dict(name='div', attrs={'id':'discussion'}),
        dict(name=['object','link'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Articulos', u'http://ladiaria.com/feeds/articulos')
    ]

    def get_cover_url(self):
        return 'http://ladiaria.com/edicion/imagenportada/'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@ -54,10 +54,7 @@ class LaJornada_mx(BasicNewsRecipe):
     preprocess_regexps = [
         (re.compile( r'<div class="inicial">(.*)</div><p class="s-s">'
             ,re.DOTALL|re.IGNORECASE)
-            ,lambda match: '<p class="inicial">' + match.group(1) + '</p><p class="s-s">'),
-        (re.compile( r'<q>(.*?)</q>'
-            ,re.DOTALL|re.IGNORECASE)
-            ,lambda match: '"' + match.group(1) + '"')
+            ,lambda match: '<p class="inicial">' + match.group(1) + '</p><p class="s-s">')
     ]
 
     keep_only_tags = [
@ -8,7 +8,7 @@ from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class LaRazon_Bol(BasicNewsRecipe):
-    title = 'La Razón - Bolivia'
+    title = u'La Razón - Bolivia'
     __author__ = 'Darko Miletic'
     description = 'El diario nacional de Bolivia'
     publisher = 'Praxsis S.R.L.'
@ -20,11 +20,14 @@ class Lanacion(BasicNewsRecipe):
     publication_type = 'newspaper'
     remove_empty_feeds = True
     masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif'
     extra_css = """ h1{font-family: Georgia,serif}
+                    h2{color: #626262}
                     body{font-family: Arial,sans-serif}
-                    img{margin-top: 0.5em; margin-bottom: 0.2em}
+                    img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
+                    .notaFecha{color: #808080}
                     .notaEpigrafe{font-size: x-small}
-                    .topNota h1{font-family: Arial,sans-serif} """
+                    .topNota h1{font-family: Arial,sans-serif}
+                """
 
     conversion_options = {
@ -38,12 +41,12 @@ class Lanacion(BasicNewsRecipe):
     remove_tags = [
         dict(name='div' , attrs={'class':'notaComentario floatFix noprint' })
         ,dict(name='ul' , attrs={'class':['cajaHerramientas cajaTop noprint','herramientas noprint']})
-        ,dict(name='div' , attrs={'class':'cajaHerramientas noprint' })
-        ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix']})
-        ,dict(name=['iframe','embed','object','form','base','hr'])
+        ,dict(name='div' , attrs={'class':['cajaHerramientas noprint','cajaHerramientas floatFix'] })
+        ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix','videoCentro']})
+        ,dict(name=['iframe','embed','object','form','base','hr','meta','link','input'])
     ]
     remove_tags_after = dict(attrs={'class':['tags','nota-destacado']})
-    remove_attributes = ['height','width','visible']
+    remove_attributes = ['height','width','visible','onclick','data-count','name']
 
     feeds = [
         (u'Ultimas noticias' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?origen=2' )
26
resources/recipes/mainichi.recipe
Normal file
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.mainichi.jp
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MainichiDailyNews(BasicNewsRecipe):
+    title          = u'\u6bce\u65e5\u65b0\u805e'
+    __author__     = 'Hiroshi Miura'
+    oldest_article = 2
+    max_articles_per_feed = 20
+    description    = 'Japanese traditional newspaper Mainichi Daily News'
+    publisher      = 'Mainichi Daily News'
+    category       = 'news, japan'
+    language       = 'ja'
+
+    feeds = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')]
+
+    remove_tags_before = {'class':"NewsTitle"}
+    remove_tags        = [{'class':"RelatedArticle"}]
+    remove_tags_after  = {'class':"Credit"}
18
resources/recipes/mainichi_it_news.recipe
Normal file
@@ -0,0 +1,18 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MainichiDailyITNews(BasicNewsRecipe):
+    title          = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)'
+    __author__     = 'Hiroshi Miura'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    description    = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics'
+    publisher      = 'Mainichi Daily News'
+    category       = 'news, Japan, IT, Electronics'
+    language       = 'ja'
+
+    feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')]
+
+    remove_tags_before = {'class':"NewsTitle"}
+    remove_tags        = [{'class':"RelatedArticle"}]
+    remove_tags_after  = {'class':"Credit"}
35
resources/recipes/marctv.recipe
Normal file
@@ -0,0 +1,35 @@
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+'''
+Fetch MarcTV.
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MarcTVde(BasicNewsRecipe):
+
+    title = 'Marc Toensings Visionen'
+
+    description = 'Marc Toensings Visionen'
+
+    language = 'de'
+
+    __author__ = 'Marc Toensing'
+
+    max_articles_per_feed = 40
+
+    oldest_article = 665
+
+    use_embedded_content = False
+
+    remove_tags = []
+
+    keep_only_tags = dict(name='div', attrs={'class':["content"]})
+
+    feeds = [(u'Spiele', u'http://feeds.feedburner.com/marctv/spiele'), (u'Leben', u'http://feeds.feedburner.com/marctv/leben'), (u'Medien', u'http://feeds.feedburner.com/marctv/medien')]
+
+    extra_css = '.#wrapper .entry p img{width:620px; height: 270px;}'
+
+    def get_cover_url(self):
+        return 'http://marctv.de/marctv.png'
22
resources/recipes/matichon.recipe
Normal file
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1290412756(BasicNewsRecipe):
+    __author__ = 'Anat R.'
+    title = u'Matichon'
+    oldest_article = 7
+    language = 'th'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+    use_embedded_content = False
+    feeds = [(u'News', u'http://www.matichon.co.th/rss/news_article.xml'),
+             (u'Columns', u'http://www.matichon.co.th/rss/news_columns.xml'),
+             (u'Politics', u'http://www.matichon.co.th/rss/news_politic.xml'),
+             (u'Business', u'http://www.matichon.co.th/rss/news_business.xml'),
+             (u'World', u'http://www.matichon.co.th/rss/news_world.xml'),
+             (u'Sports', u'http://www.matichon.co.th/rss/news_sport.xml'),
+             (u'Entertainment', u'http://www.matichon.co.th/rss/news_entertainment.xml')]
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'h3', attrs = {'class' : 'read-h'}))
+    keep_only_tags.append(dict(name = 'p', attrs = {'class' : 'read-time'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-content'}))
@@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
 '''
 modified from Singtao Toronto calibre recipe by rty
 Change Log:
+2010/11/22: add English section, remove eco-news section which is not updated daily, correct
+            ordering of articles
+2010/11/12: add news image and eco-news section
+2010/11/08: add parsing of finance section
+2010/11/06: temporary work-around for Kindle device having no capability to display unicode
+            in section/article list.
 2010/10/31: skip repeated articles in section pages
 '''

-import datetime
+import os, datetime, re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from contextlib import nested
+from calibre import __appname__, strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.metadata.opf2 import OPFCreator
+from calibre.ebooks.metadata.toc import TOC
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.date import now as nowf

-class AdvancedUserRecipe1278063072(BasicNewsRecipe):
+class MPHKRecipe(BasicNewsRecipe):
     title = 'Ming Pao - Hong Kong'
     oldest_article = 1
     max_articles_per_feed = 100
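The new `from contextlib import nested` import exists to support the `with nested(open(opf_path, 'wb'), open(ncx_path, 'wb'))` call at the end of the `create_opf` override further down; `nested` is the Python 2 idiom for entering two context managers in a single `with` statement.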
@@ -24,27 +39,131 @@ class MPHKRecipe(BasicNewsRecipe):
     encoding = 'Big5-HKSCS'
     recursions = 0
     conversion_options = {'linearize_tables':True}
+    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
+    #extra_css = 'img {float:right; margin:4px;}'
     masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
     keep_only_tags = [dict(name='h1'),
+                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
+                      dict(attrs={'class':['photo']}),
+                      dict(attrs={'id':['newscontent']}),
                       dict(attrs={'id':['newscontent01','newscontent02']})]
+    remove_tags = [dict(name='style'),
+                   dict(attrs={'id':['newscontent135']})] # for the finance page
+    remove_attributes = ['width']
+    preprocess_regexps = [
+        (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
+         lambda match: '<h1>'),
+        (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
+         lambda match: '</h1>'),
+    ]
+
+    def image_url_processor(cls, baseurl, url):
+        # trick: break the url at the first occurrence of a digit, add an
+        # additional '_' at the front
+        # not working, may need to move this to preprocess_html() method
+        #minIdx = 10000
+        #for ch in '0123456789':
+        #    i = url.find(ch)
+        #    if i >= 0 and i < minIdx:
+        #        minIdx = i
+        #return url[0:minIdx] + '_' + url[minIdx+1:]
+        return url
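If the disabled digit search above is ever revived, it can be collapsed to a single regex substitution. A minimal sketch, assuming the intent is to replace the first digit in the image url with '_' exactly as the commented-out slicing would:

    import re

    def image_url_processor(cls, baseurl, url):
        # replace the first digit with '_', i.e. url[:i] + '_' + url[i+1:]
        # where i is the index of the first digit, if any
        return re.sub(r'\d', '_', url, count=1)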
     def get_fetchdate(self):
         dt_utc = datetime.datetime.utcnow()
-        # convert UTC to local hk time - at around HKT 5.30am, all news are available
-        dt_local = dt_utc - datetime.timedelta(-2.5/24)
+        # convert UTC to local hk time - at around HKT 6.00am, all news are available
+        dt_local = dt_utc - datetime.timedelta(-2.0/24)
         return dt_local.strftime("%Y%m%d")
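A note on the odd-looking delta: subtracting `datetime.timedelta(-2.0/24)` adds two hours, so the `%Y%m%d` string only rolls over to a new date at 22:00 UTC, which is 06:00 in Hong Kong (UTC+8), exactly the cut-off the comment describes. For example, 21:00 UTC on Nov 22 still yields '20101122', while 22:30 UTC already yields '20101123'.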
     def parse_index(self):
         feeds = []
         dateStr = self.get_fetchdate()
-        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
-            articles = self.parse_section(url)
-            if articles:
-                feeds.append((title, articles))
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
+                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        # special - finance
+        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
+        if fin_articles:
+            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
+        # special - eco-friendly
+        # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
+        # if eco_articles:
+        #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
+        # special - entertainment
+        #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
+        #if ent_articles:
+        #    feeds.append(('Entertainment', ent_articles))
         return feeds

     def parse_section(self, url):
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
+        current_articles = []
+        included_urls = []
+        divs.reverse()
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' + url
+            if url not in included_urls and url.rfind('Redirect') == -1:
+                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+                included_urls.append(url)
+        current_articles.reverse()
+        return current_articles

+    def parse_fin_section(self, url):
         dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        a = soup.findAll('a', href= True)
+        current_articles = []
+        for i in a:
+            url = i.get('href', False)
+            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
+                title = self.tag_to_string(i)
+                url = 'http://www.mpfinance.com/cfm/' + url
+                current_articles.append({'title': title, 'url': url, 'description':''})
+        return current_articles
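The filter in `parse_fin_section` reads more easily once you recall that `str.rfind` returns -1 when the substring is absent: `not url.rfind(dateStr) == -1 and url.rfind('index') == -1` keeps links whose href contains today's date string while skipping the section index pages themselves.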
+    def parse_eco_section(self, url):
         soup = self.index_to_soup(url)
         divs = soup.findAll(attrs={'class': ['bullet']})
         current_articles = []
@@ -53,9 +172,162 @@ class MPHKRecipe(BasicNewsRecipe):
             a = i.find('a', href = True)
             title = self.tag_to_string(a)
             url = a.get('href', False)
-            url = 'http://news.mingpao.com/' + dateStr + '/' +url
-            if url not in included_urls:
+            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
+            if url not in included_urls and url.rfind('Redirect') == -1:
                 current_articles.append({'title': title, 'url': url, 'description':''})
                 included_urls.append(url)
         return current_articles
+
+    #def parse_ent_section(self, url):
+    #    dateStr = self.get_fetchdate()
+    #    soup = self.index_to_soup(url)
+    #    a = soup.findAll('a', href=True)
+    #    current_articles = []
+    #    included_urls = []
+    #    for i in a:
+    #        title = self.tag_to_string(i)
+    #        url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
+    #        if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
+    #            current_articles.append({'title': title, 'url': url, 'description': ''})
+    #    return current_articles
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(width=True):
+            del item['width']
+        for item in soup.findAll(align=True):
+            del item['align']   # e.g. align="absmiddle"
+        return soup
+    def create_opf(self, feeds, dir=None):
+        #super(MPHKRecipe,self).create_opf(feeds, dir)
+        if dir is None:
+            dir = self.output_dir
+        title = self.short_title()
+        if self.output_profile.periodical_date_in_title:
+            title += strftime(self.timefmt)
+        mi = MetaInformation(title, [__appname__])
+        mi.publisher = __appname__
+        mi.author_sort = __appname__
+        mi.publication_type = self.publication_type+':'+self.short_title()
+        mi.timestamp = nowf()
+        mi.comments = self.description
+        if not isinstance(mi.comments, unicode):
+            mi.comments = mi.comments.decode('utf-8', 'replace')
+        mi.pubdate = nowf()
+        opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+        # Add mastheadImage entry to <guide> section
+        mp = getattr(self, 'masthead_path', None)
+        if mp is not None and os.access(mp, os.R_OK):
+            from calibre.ebooks.metadata.opf2 import Guide
+            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
+            ref.type = 'masthead'
+            ref.title = 'Masthead Image'
+            opf.guide.append(ref)
+
+        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
+        manifest.append(os.path.join(dir, 'index.html'))
+        manifest.append(os.path.join(dir, 'index.ncx'))
+
+        # Get cover
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is None:
+            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
+            if self.default_cover(pf):
+                cpath = pf.name
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+
+        # Get masthead
+        mpath = getattr(self, 'masthead_path', None)
+        if mpath is not None and os.access(mpath, os.R_OK):
+            manifest.append(mpath)
+
+        opf.create_manifest_from_files_in(manifest)
+        for mani in opf.manifest:
+            if mani.path.endswith('.ncx'):
+                mani.id = 'ncx'
+            if mani.path.endswith('mastheadImage.jpg'):
+                mani.id = 'masthead-image'
+        entries = ['index.html']
+        toc = TOC(base_path=dir)
+        self.play_order_counter = 0
+        self.play_order_map = {}
+
+        def feed_index(num, parent):
+            f = feeds[num]
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(num, j)
+                    auth = a.author
+                    if not auth:
+                        auth = None
+                    desc = a.text_summary
+                    if not desc:
+                        desc = None
+                    else:
+                        desc = self.description_limiter(desc)
+                    entries.append('%sindex.html'%adir)
+                    po = self.play_order_map.get(entries[-1], None)
+                    if po is None:
+                        self.play_order_counter += 1
+                        po = self.play_order_counter
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
+                                    play_order=po, author=auth, description=desc)
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    if os.path.exists(last):
+                        with open(last, 'rb') as fi:
+                            src = fi.read().decode('utf-8')
+                        soup = BeautifulSoup(src)
+                        body = soup.find('body')
+                        if body is not None:
+                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                            templ = self.navbar.generate(True, num, j, len(f),
+                                                         not self.has_single_feed,
+                                                         a.orig_url, __appname__, prefix=prefix,
+                                                         center=self.center_navbar)
+                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                            body.insert(len(body.contents), elem)
+                            with open(last, 'wb') as fi:
+                                fi.write(unicode(soup).encode('utf-8'))
+        if len(feeds) == 0:
+            raise Exception('All feeds are empty, aborting.')
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                po = self.play_order_map.get(entries[-1], None)
+                if po is None:
+                    self.play_order_counter += 1
+                    po = self.play_order_counter
+                auth = getattr(f, 'author', None)
+                if not auth:
+                    auth = None
+                desc = getattr(f, 'description', None)
+                if not desc:
+                    desc = None
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
+                                           f.title, play_order=po, description=desc, author=auth))
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
+        for i, p in enumerate(entries):
+            entries[i] = os.path.join(dir, p.replace('/', os.sep))
+        opf.create_spine(entries)
+        opf.set_toc(toc)
+
+        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
+            opf.render(opf_file, ncx_file)
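This `create_opf` override appears to be a near-verbatim copy of the stock `BasicNewsRecipe` implementation, with the `super(...)` call left commented out; carrying a local copy presumably gives the recipe a place to adjust OPF/NCX generation, in line with the 2010/11/06 change-log entry about unicode section and article titles on the Kindle.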
56
resources/recipes/montevideo_com.recipe
Normal file
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
+'''
+http://www.montevideo.com.uy
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Noticias(BasicNewsRecipe):
+    title = 'Montevideo COMM'
+    __author__ = 'Gustavo Azambuja'
+    description = 'Noticias de Uruguay'
+    language = 'es'
+    timefmt = '[%a, %d %b, %Y]'
+    use_embedded_content = False
+    recursion = 5
+    encoding = 'utf-8'
+    remove_javascript = True
+    no_stylesheets = True
+
+    oldest_article = 2
+    max_articles_per_feed = 100
+    keep_only_tags = [dict(id=['txt'])]
+    remove_tags = [
+        dict(name=['object','link'])
+    ]
+
+    remove_attributes = ['width','height', 'style', 'font', 'color']
+
+    extra_css = '''
+        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
+        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
+        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
+        p {font-family:Arial,Helvetica,sans-serif;}
+    '''
+    feeds = [
+        (u'Destacados', u'http://www.montevideo.com.uy/anxml.aspx?58'),
+        (u'Noticias', u'http://www.montevideo.com.uy/anxml.aspx?59'),
+        (u'Tecnologia', u'http://www.montevideo.com.uy/anxml.aspx?133'),
+        (u'Tiempo Libre', u'http://www.montevideo.com.uy/anxml.aspx?60'),
+        # (u'Deportes', u'http://www.montevideo.com.uy/anxml.aspx?968'),
+        # (u'Pantallazo', u'http://www.montevideo.com.uy/anxml.aspx?1022'),
+        (u'Gastronomia', u'http://www.montevideo.com.uy/anxml.aspx?1023')
+    ]
+
+    def get_cover_url(self):
+        return 'http://sphotos.ak.fbcdn.net/hphotos-ak-snc1/hs276.snc1/10319_147339559330_147337559330_2625816_6636564_n.jpg'
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
@@ -1,31 +1,33 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
-moscowtimes.ru
+www.themoscowtimes.com
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

 class Moscowtimes(BasicNewsRecipe):
-    title = u'The Moscow Times'
+    title = 'The Moscow Times'
     __author__ = 'Darko Miletic and Sujata Raman'
-    description = 'News from Russia'
-    language = 'en'
-    lang = 'en'
-    oldest_article = 7
+    description = 'The Moscow Times is a daily English-language newspaper featuring objective, reliable news on business, politics, sports and culture in Moscow, in Russia and the former Soviet Union (CIS).'
+    category = 'Russia, Moscow, Russian news, Moscow news, Russian newspaper, daily news, independent news, reliable news, USSR, Soviet Union, CIS, Russian politics, Russian business, Russian culture, Russian opinion, St Petersburg, Saint Petersburg'
+    publisher = 'The Moscow Times'
+    language = 'en'
+    oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    #encoding = 'utf-8'
-    encoding = 'cp1252'
-    remove_javascript = True
+    remove_empty_feeds = True
+    encoding = 'cp1251'
+    masthead_url = 'http://www.themoscowtimes.com/bitrix/templates/tmt/img/logo.gif'
+    publication_type = 'newspaper'

     conversion_options = {
         'comment' : description
-        , 'language' : lang
-        }
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        }

     extra_css = '''
         h1{ color:#0066B3; font-family: Georgia,serif ; font-size: large}
@@ -35,39 +37,37 @@ class Moscowtimes(BasicNewsRecipe):
         .text{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size:75%; }
         '''
     feeds = [
-        (u'The Moscow Times Top Stories' , u'http://www.themoscowtimes.com/rss/top'),
-        (u'The Moscow Times Current Issue' , u'http://www.themoscowtimes.com/rss/issue'),
-        (u'The Moscow Times News' , u'http://www.themoscowtimes.com/rss/news'),
-        (u'The Moscow Times Business' , u'http://www.themoscowtimes.com/rss/business'),
-        (u'The Moscow Times Art and Ideas' , u'http://www.themoscowtimes.com/rss/art'),
-        (u'The Moscow Times Opinion' , u'http://www.themoscowtimes.com/rss/opinion')
+        (u'Top Stories'   , u'http://www.themoscowtimes.com/rss/top'     )
+        ,(u'Current Issue' , u'http://www.themoscowtimes.com/rss/issue'   )
+        ,(u'News'          , u'http://www.themoscowtimes.com/rss/news'    )
+        ,(u'Business'      , u'http://www.themoscowtimes.com/rss/business')
+        ,(u'Art and Ideas' , u'http://www.themoscowtimes.com/rss/art'     )
+        ,(u'Opinion'       , u'http://www.themoscowtimes.com/rss/opinion' )
     ]

-    keep_only_tags = [
-        dict(name='div', attrs={'class':['newstextblock']})
-    ]
+    keep_only_tags = [dict(name='div', attrs={'id':'content'})]

     remove_tags = [
-        dict(name='div', attrs={'class':['photo_nav']})
-    ]
+        dict(name='div', attrs={'class':['photo_nav','phototext']})
+        ,dict(name=['iframe','meta','base','link','embed','object'])
+    ]

     def preprocess_html(self, soup):
-        soup.html['xml:lang'] = self.lang
-        soup.html['lang'] = self.lang
-        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
-        soup.head.insert(0,mtag)
-        return self.adeify_images(soup)
+        for lnk in soup.findAll('a'):
+            if lnk.string is not None:
+                ind = self.tag_to_string(lnk)
+                lnk.replaceWith(ind)
+        return soup
+
+    def print_version(self, url):
+        return url.replace('.themoscowtimes.com/','.themoscowtimes.com/print/')
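Two things worth noting in the rewritten methods: `preprocess_html` now flattens every `<a>` whose string is set into plain text via `lnk.replaceWith(ind)`, so article bodies are not littered with live links, and the new `print_version` fetches the printer-friendly page simply by splicing `/print/` into the path after the host.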
     def get_cover_url(self):
+        cover_url = None
         href = 'http://www.themoscowtimes.com/pdf/'
         soup = self.index_to_soup(href)
         div = soup.find('div',attrs={'class':'left'})
-        a = div.find('a')
-        print a
+        if div:
+            a = div.find('a')
         if a :
-            cover_url = a.img['src']
+            cover_url = 'http://www.themoscowtimes.com' + a.img['src']
         return cover_url
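The reworked `get_cover_url` drops the stray `print a` debug statement, guards the lookup with `if div:`, and prefixes the site root because the `<img src>` on the pdf page is site-relative. One latent wrinkle remains: if the div is ever missing, `a` is unbound by the time `if a :` runs, so initialising `a = None` alongside `cover_url = None` would make the fallback truly safe.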
24
resources/recipes/msnsankei.recipe
Normal file
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+sankei.jp.msn.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class MSNSankeiNewsProduct(BasicNewsRecipe):
+    title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)'
+    __author__ = 'Hiroshi Miura'
+    description = 'Products release from Japan'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    encoding = 'Shift_JIS'
+    language = 'ja'
+
+    feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]
+
+    remove_tags_before = dict(id="__r_article_title__")
+    remove_tags_after = dict(id="ajax_release_news")
+    remove_tags = [{'class':"parent chromeCustom6G"}]
68
resources/recipes/newsweek_polska.recipe
Normal file
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Newsweek(BasicNewsRecipe):
+    EDITION = 0
+
+    title = u'Newsweek Polska'
+    __author__ = 'Mateusz Kielar'
+    description = 'Weekly magazine'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'en'
+    remove_javascript = True
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'}))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'copy'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'url'}))
+
+    extra_css = '''
+        .body {font-size: small}
+        .author {font-size: x-small}
+        .lead {font-size: x-small}
+        .title{font-size: x-large; font-weight: bold}
+    '''
+
+    def print_version(self, url):
+        return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
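A hypothetical example of what `print_version` does, assuming `find_last_full_issue` below has set `EDITION` to 1234: `http://www.newsweek.pl/artykuly/wydanie/1234/some-article` becomes `http://www.newsweek.pl/artykuly/some-article/print`. Two small oddities: `language = 'en'` looks like an oversight for a Polish weekly, and the `a.name in "div"` test in `find_articles` further down is presumably meant as `a.name == 'div'`, since substring membership would also match an `<i>` tag, for example.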
+    def find_last_full_issue(self):
+        page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
+        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+        page = self.index_to_soup(issue)
+        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+        page = self.index_to_soup(issue)
+        self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+
+    def parse_index(self):
+        self.find_last_full_issue()
+        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+        img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
+        self.cover_url = img['src']
+        feeds = []
+        parent = soup.find(id='content-left-big')
+        for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
+            section = self.tag_to_string(txt).capitalize()
+            articles = list(self.find_articles(txt))
+            feeds.append((section, articles))
+        return feeds
+
+    def find_articles(self, txt):
+        for a in txt.findAllNext( attrs={'class':['strong','hr']}):
+            if a.name in "div":
+                break
+            yield {
+                'title' : self.tag_to_string(a),
+                'url' : 'http://www.newsweek.pl'+a['href'],
+                'date' : '',
+                'description' : ''
+            }
60
resources/recipes/nikkei_free.recipe
Normal file
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.nikkei.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class NikkeiNet(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+            ]
+
+    remove_tags_before = dict(id="CONTENTS")
+    remove_tags = [
+        dict(name="form"),
+        {'class':"cmn-hide"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
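Every feed here, and in the subscriber recipes that follow, uses the same gateway pattern, `http://www.zou3.net/php/rss/nikkei2rss.php?head=<section>`; zou3.net appears to be a third-party RSS bridge for nikkei.com section headlines, with the `head=` parameter naming the section (sangyo, seiji, market, and so on).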
125
resources/recipes/nikkei_sub.recipe
Normal file
@@ -0,0 +1,125 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NikkeiNet_subscription(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+    remove_javascript = False
+    temp_files = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+        {'class':"cmn-article_keyword cmn-clearfix"},
+        {'class':"cmn-print_headline cmn-clearfix"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
+            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email'] = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            raw = response3.get_data()
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
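Two mechanize workarounds in `get_browser` deserve a closer look. The paired `set_data()` calls rewrite the login page so the disabled image input is wrapped in an HTML comment before the form is parsed; schematically, `<input id="j_id48" ... src=".../gm_home_on.gif" />` becomes `<!--  ... -->`. And since the site's `redirectFlag` cookie is normally set from JavaScript, the recipe scrapes the value out of the page with `re.search(r"var checkValue = '(\d+)';", raw, re.M)`, writes it into a temporary LWP-format cookie file, loads that file into the cookie jar, and only then re-submits the redirect form. The same sequence is repeated verbatim in the three section-specific subscriber recipes below.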
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+            ]
109
resources/recipes/nikkei_sub_economy.recipe
Normal file
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.nikkei.com
+'''
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+class NikkeiNet_sub_economy(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+    remove_javascript = False
+    temp_files = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+        {'class':"cmn-article_keyword cmn-clearfix"},
+        {'class':"cmn-print_headline cmn-clearfix"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+    feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+            ]
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
+            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email'] = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            raw = response3.get_data()
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
108
resources/recipes/nikkei_sub_industry.recipe
Normal file
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+'''
+www.nikkei.com
+'''
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NikkeiNet_sub_industory(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+    remove_javascript = False
+    temp_files = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+        {'class':"cmn-article_keyword cmn-clearfix"},
+        {'class':"cmn-print_headline cmn-clearfix"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+            ]
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
+            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email'] = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            raw = response3.get_data()
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
109
resources/recipes/nikkei_sub_life.recipe
Normal file
@@ -0,0 +1,109 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_life(BasicNewsRecipe):
    title                 = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)'
    __author__            = 'Hiroshi Miura'
    description           = 'News and current market affairs from Japan'
    cover_url             = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url          = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription    = True
    oldest_article        = 2
    max_articles_per_feed = 20
    language              = 'ja'
    remove_javascript     = False
    temp_files            = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
       {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
       {'class':"cmn-article_keyword cmn-clearfix"},
       {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [
       (u'\u304f\u3089\u3057',             u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
       (u'\u30b9\u30dd\u30fc\u30c4',       u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
       (u'\u793e\u4f1a',                   u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
       (u'\u30a8\u30b3',                   u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
       (u'\u5065\u5eb7',                   u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
       (u'\u7279\u96c6',                   u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
       (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            # open the login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()

            # comment out the disabled input that causes an error in mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)

            # fill in and send the login form
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email']    = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()

            # open the news site
            br.open('http://www.nikkei.com/')
            br.response()

            # the site force-redirects through an auto-submitted form
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            raw = response3.get_data()

            # grab the cookie that would normally be set by JavaScript and
            # load it into the cookie jar via a hand-written LWP cookies file
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br
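A second mechanize workaround in get_browser() above: the login page contains a disabled input that mechanize's form parser chokes on, so the recipe rewrites the response body to hide the offending markup behind an HTML comment before selecting the form. As a reusable helper it would look roughly like this (hypothetical function name; the marker strings below are the ones the recipe targets):

    def hide_markup_from_mechanize(br, start_marker, end_marker):
        # wrap everything between the two markers in an HTML comment so
        # the form parser never sees it, then re-install the patched page
        response = br.response()
        html = response.get_data()
        html = html.replace(start_marker, '<!-- ')
        html = html.replace(end_marker, ' -->')
        response.set_data(html)
        br.set_response(response)

    # usage matching the recipe:
    # hide_markup_from_mechanize(br, '<input id="j_id48"', 'gm_home_on.gif" />')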
102
resources/recipes/nikkei_sub_main.recipe
Normal file
@@ -0,0 +1,102 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_main(BasicNewsRecipe):
    title                 = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)'
    __author__            = 'Hiroshi Miura'
    description           = 'News and current market affairs from Japan'
    cover_url             = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url          = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription    = True
    oldest_article        = 2
    max_articles_per_feed = 20
    language              = 'ja'
    remove_javascript     = False
    temp_files            = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
       {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
       {'class':"cmn-article_keyword cmn-clearfix"},
       {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            # open the login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()

            # comment out the disabled input that causes an error in mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)

            # fill in and send the login form
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email']    = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()

            # open the news site
            br.open('http://www.nikkei.com/')
            br.response()

            # the site force-redirects through an auto-submitted form
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            raw = response3.get_data()

            # grab the cookie that would normally be set by JavaScript and
            # load it into the cookie jar via a hand-written LWP cookies file
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br
109
resources/recipes/nikkei_sub_sports.recipe
Normal file
@@ -0,0 +1,109 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_sports(BasicNewsRecipe):
    title                 = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)'
    __author__            = 'Hiroshi Miura'
    description           = 'News and current market affairs from Japan'
    cover_url             = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url          = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription    = True
    oldest_article        = 2
    max_articles_per_feed = 20
    language              = 'ja'
    remove_javascript     = False
    temp_files            = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
       {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
       {'class':"cmn-article_keyword cmn-clearfix"},
       {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5',       u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2',             u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
       (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac',             u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            # open the login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()

            # comment out the disabled input that causes an error in mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)

            # fill in and send the login form
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email']    = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()

            # open the news site
            br.open('http://www.nikkei.com/')
            br.response()

            # the site force-redirects through an auto-submitted form
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            raw = response3.get_data()

            # grab the cookie that would normally be set by JavaScript and
            # load it into the cookie jar via a hand-written LWP cookies file
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br
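The three nikkei recipes above differ only in class name, title and feed list; the login flow is repeated verbatim in each. A shared base class would remove the duplication. A sketch of that refactoring (hypothetical; not how the shipped recipes are organized):

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class NikkeiNetBase(BasicNewsRecipe):
        __author__            = 'Hiroshi Miura'
        language              = 'ja'
        needs_subscription    = True
        oldest_article        = 2
        max_articles_per_feed = 20

        def get_browser(self):
            br = BasicNewsRecipe.get_browser()
            # ... the shared login flow from the recipes above ...
            return br

    class NikkeiNetSubSports(NikkeiNetBase):
        title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)'
        feeds = [(u'\u30b9\u30dd\u30fc\u30c4',
                  u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports')]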
36
resources/recipes/now_toronto.recipe
Normal file
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Based on Lars Jacob's Taz Digiabo recipe

__license__   = 'GPL v3'
__copyright__ = '2010, Starson17'

import os, urllib2, zipfile
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

class NowToronto(BasicNewsRecipe):
    title       = u'Now Toronto'
    description = u'Now Toronto'
    __author__  = 'Starson17'
    language    = 'en_CA'
    conversion_options = {
        'no_default_epub_cover' : True
    }

    def build_index(self):
        epub_feed = "http://feeds.feedburner.com/NowEpubEditions"
        soup = self.index_to_soup(epub_feed)
        url = soup.find(name = 'feedburner:origlink').string
        f = urllib2.urlopen(url)
        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0, _('downloading epub'))
        tmp.write(f.read())
        tmp.close()
        zfile = zipfile.ZipFile(tmp.name, 'r')
        self.report_progress(0, _('extracting epub'))
        zfile.extractall(self.output_dir)
        zfile.close()
        index = os.path.join(self.output_dir, 'content.opf')
        self.report_progress(1, _('epub downloaded and extracted'))
        return index
@@ -7,14 +7,22 @@ nytimes.com
 '''
 import re, string, time
 from calibre import entity_to_unicode, strftime
+from datetime import timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


 class NYTimes(BasicNewsRecipe):

-    # set headlinesOnly to True for the headlines-only version
+    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True

+    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
+    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
+    # will be included. Note: oldest_article is ignored if webEdition = False
+    webEdition = False
+    oldest_article = 7
+
     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
     #
@@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
     # from an article (if one exists). If one_picture_per_article = True, the image
     # will be moved to a location between the headline and the byline.
     # If one_picture_per_article = False, all images from the article will be included
     # and shown in their original location.
-    one_picture_per_article = True
+    one_picture_per_article = False

     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100

+    # Whether to omit duplicates of articles (typically arising when articles are indexed in
+    # more than one section). If True, only the first occurrence will be downloaded.
+    filterDuplicates = True
+
+    # Sections to collect for the Web edition.
+    # Delete any you don't want, or use includeSections or excludeSections
+    web_sections = [(u'World',u'world'),
+                    (u'U.S.',u'national'),
+                    (u'Politics',u'politics'),
+                    (u'New York',u'nyregion'),
+                    (u'Business','business'),
+                    (u'Technology',u'technology'),
+                    (u'Sports',u'sports'),
+                    (u'Science',u'science'),
+                    (u'Health',u'health'),
+                    (u'Opinion',u'opinion'),
+                    (u'Arts',u'arts'),
+                    (u'Books',u'books'),
+                    (u'Movies',u'movies'),
+                    (u'Music',u'arts/music'),
+                    (u'Television',u'arts/television'),
+                    (u'Style',u'style'),
+                    (u'Dining & Wine',u'dining'),
+                    (u'Fashion & Style',u'fashion'),
+                    (u'Home & Garden',u'garden'),
+                    (u'Travel',u'travel'),
+                    ('Education',u'education'),
+                    ('Multimedia',u'multimedia'),
+                    (u'Obituaries',u'obituaries'),
+                    (u'Sunday Magazine',u'magazine'),
+                    (u'Week in Review',u'weekinreview')]
+
     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
+        needs_subscription = False
+    elif webEdition:
+        title='New York Times (Web)'
+        description = 'New York Times on the Web'
+        needs_subscription = True
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
+        needs_subscription = True
+
+    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
+
+    def decode_us_date(self,datestr):
+        udate = datestr.strip().lower().split()
+        try:
+            m = self.month_list.index(udate[0])+1
+        except:
+            return date.today()
+        d = int(udate[1])
+        y = int(udate[2])
+        try:
+            d = date(y,m,d)
+        except:
+            d = date.today()
+        return d
+
+    earliest_date = date.today() - timedelta(days=oldest_article)
+
     __author__ = 'GRiker/Kovid Goyal/Nick Redding'
     language = 'en'
@@ -136,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
                 .image {text-align: center;}
                 .source {text-align: left; }'''

+    articles = {}
+    key = None
+    ans = []
+    url_list = []
+
     def filter_ans(self, ans) :
         total_article_count = 0
         idx = 0
@@ -164,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
         self.log( "Queued %d articles" % total_article_count )
         return ans

+    def exclude_url(self,url):
+        if not url.startswith("http"):
+            return True
+        if not url.endswith(".html"):
+            return True
+        if 'nytimes.com' not in url:
+            return True
+        if 'podcast' in url:
+            return True
+        if '/video/' in url:
+            return True
+        if '/slideshow/' in url:
+            return True
+        if '/magazine/index' in url:
+            return True
+        if '/interactive/' in url:
+            return True
+        if '/reference/' in url:
+            return True
+        if '/premium/' in url:
+            return True
+        return False
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -249,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
             return BeautifulSoup(_raw, markupMassage=massage)

         # Entry point
-        print "index_to_soup()"
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
         docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@@ -273,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
         else:
             return description

-    def parse_todays_index(self):
+    def feed_title(self,div):
+        return ''.join(div.findAll(text=True, recursive=True)).strip()

-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=True)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        url_list = []
-
-        def handle_article(div):
-            a = div.find('a', href=True)
-            if not a:
-                return
-            url = re.sub(r'\?.*', '', a['href'])
-            if not url.startswith("http"):
-                return
-            if not url.endswith(".html"):
-                return
-            if 'podcast' in url:
-                return
-            if '/video/' in url:
-                return
-            url += '?pagewanted=all'
-            if url in url_list:
-                return
-            url_list.append(url)
-            title = self.tag_to_string(a, use_alt=True).strip()
-            description = ''
-            pubdate = strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            author = ''
-            authorAttribution = div.find(True, attrs={'class':'byline'})
-            if authorAttribution:
-                author = self.tag_to_string(authorAttribution, use_alt=False)
-            else:
-                authorAttribution = div.find(True, attrs={'class':'byline'})
-                if authorAttribution:
-                    author = self.tag_to_string(authorAttribution, use_alt=False)
-            feed = key if key is not None else 'Uncategorized'
-            if not articles.has_key(feed):
-                ans.append(feed)
-                articles[feed] = []
-            articles[feed].append(
-                            dict(title=title, url=url, date=pubdate,
-                                description=description, author=author,
-                                content=''))
+    def handle_article(self,div):
+        thumbnail = div.find('div','thumbnail')
+        if thumbnail:
+            thumbnail.extract()
+        a = div.find('a', href=True)
+        if not a:
+            return
+        url = re.sub(r'\?.*', '', a['href'])
+        if self.exclude_url(url):
+            return
+        url += '?pagewanted=all'
+        if self.filterDuplicates:
+            if url in self.url_list:
+                return
+        self.url_list.append(url)
+        title = self.tag_to_string(a, use_alt=True).strip()
+        description = ''
+        pubdate = strftime('%a, %d %b')
+        summary = div.find(True, attrs={'class':'summary'})
+        if summary:
+            description = self.tag_to_string(summary, use_alt=False)
+        author = ''
+        authorAttribution = div.find(True, attrs={'class':'byline'})
+        if authorAttribution:
+            author = self.tag_to_string(authorAttribution, use_alt=False)
+        else:
+            authorAttribution = div.find(True, attrs={'class':'byline'})
+            if authorAttribution:
+                author = self.tag_to_string(authorAttribution, use_alt=False)
+        feed = self.key if self.key is not None else 'Uncategorized'
+        if not self.articles.has_key(feed):
+            self.ans.append(feed)
+            self.articles[feed] = []
+        self.articles[feed].append(
+                        dict(title=title, url=url, date=pubdate,
+                            description=description, author=author,
+                            content=''))
+
+    def parse_web_edition(self):
+
+        for (sec_title,index_url) in self.web_sections:
+            if self.includeSections != []:
+                if sec_title not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",sec_title
+                    continue
+            if sec_title in self.excludeSections:
+                print "SECTION EXCLUDED: ",sec_title
+                continue
+            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
+            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            self.key = sec_title
+            # Find each article
+            for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+                if div['class'] in ['story', 'story headline'] :
+                    self.handle_article(div)
+                elif div['class'] == 'headlinesOnly multiline flush':
+                    for lidiv in div.findAll('li'):
+                        self.handle_article(lidiv)
+
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
+
+    def parse_todays_index(self):

         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+        skipping = False
         # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

             if div['class'] in ['section-headline','sectionHeader']:
-                key = string.capwords(feed_title(div))
-                key = key.replace('Op-ed','Op-Ed')
-                key = key.replace('U.s.','U.S.')
+                self.key = string.capwords(self.feed_title(div))
+                self.key = self.key.replace('Op-ed','Op-Ed')
+                self.key = self.key.replace('U.s.','U.S.')
+                self.key = self.key.replace('N.y.','N.Y.')
+                skipping = False
+                if self.includeSections != []:
+                    if self.key not in self.includeSections:
+                        print "SECTION NOT INCLUDED: ",self.key
+                        skipping = True
+                if self.key in self.excludeSections:
+                    print "SECTION EXCLUDED: ",self.key
+                    skipping = True

             elif div['class'] in ['story', 'story headline'] :
-                handle_article(div)
+                if not skipping:
+                    self.handle_article(div)
             elif div['class'] == 'headlinesOnly multiline flush':
                 for lidiv in div.findAll('li'):
-                    handle_article(lidiv)
+                    if not skipping:
+                        self.handle_article(lidiv)

-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)

     def parse_headline_index(self):

-        articles = {}
-        ans = []
-        url_list = []
-
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

         # Fetch the content table
@@ -363,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
         for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
             for div_sec in td_col.findAll('div',recursive=False):
                 for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                     section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                     section_name = re.sub(r'^ *$','',section_name)

                     if section_name == '':
                         continue
+                    if self.includeSections != []:
+                        if section_name not in self.includeSections:
+                            print "SECTION NOT INCLUDED: ",section_name
+                            continue
+                    if section_name in self.excludeSections:
+                        print "SECTION EXCLUDED: ",section_name
+                        continue
+
                     section_name=string.capwords(section_name)
-                    if section_name == 'U.s.':
-                        section_name = 'U.S.'
-                    elif section_name == 'Op-ed':
-                        section_name = 'Op-Ed'
+                    section_name = section_name.replace('Op-ed','Op-Ed')
+                    section_name = section_name.replace('U.s.','U.S.')
+                    section_name = section_name.replace('N.y.','N.Y.')
                     pubdate = strftime('%a, %d %b')

                     search_div = div_sec
@@ -392,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
                         if not a:
                             continue
                         url = re.sub(r'\?.*', '', a['href'])
-                        if not url.startswith("http"):
-                            continue
-                        if not url.endswith(".html"):
-                            continue
-                        if 'podcast' in url:
-                            continue
-                        if 'video' in url:
+                        if self.exclude_url(url):
                             continue
                         url += '?pagewanted=all'
-                        if url in url_list:
-                            continue
-                        url_list.append(url)
-                        self.log("URL %s" % url)
+                        if self.filterDuplicates:
+                            if url in self.url_list:
+                                continue
+                        self.url_list.append(url)
                         title = self.tag_to_string(a, use_alt=True).strip()
                         desc = h3_item.find('p')
                         if desc is not None:
                             description = self.tag_to_string(desc,use_alt=False)
                         else:
                             description = ''
-                        if not articles.has_key(section_name):
-                            ans.append(section_name)
-                            articles[section_name] = []
-                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                        if not self.articles.has_key(section_name):
+                            self.ans.append(section_name)
+                            self.articles[section_name] = []
+                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)

     def parse_index(self):
         if self.headlinesOnly:
             return self.parse_headline_index()
+        elif self.webEdition:
+            return self.parse_web_edition()
         else:
             return self.parse_todays_index()
@@ -438,6 +561,21 @@ class NYTimes(BasicNewsRecipe):

     def preprocess_html(self, soup):

+        if self.webEdition & (self.oldest_article>0):
+            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+            if date_tag:
+                date_str = self.tag_to_string(date_tag,use_alt=False)
+                date_str = date_str.replace('Published:','')
+                date_items = date_str.split(',')
+                try:
+                    datestring = date_items[0]+' '+date_items[1]
+                    article_date = self.decode_us_date(datestring)
+                except:
+                    article_date = date.today()
+                if article_date < self.earliest_date:
+                    self.log("Skipping article dated %s" % date_str)
+                    return None
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
         if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
@@ -462,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
                 for inlineImg in inlineImgs[1:]:
                     inlineImg.extract()
                 # Move firstImg before article body
-                #article_body = soup.find(True, {'id':'articleBody'})
                 cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                 if cgFirst:
                     # Strip all sibling NavigableStrings: noise
@@ -548,4 +685,3 @@ class NYTimes(BasicNewsRecipe):
             divTag.replaceWith(tag)

         return soup
-
@ -7,14 +7,22 @@ nytimes.com
|
|||||||
'''
|
'''
|
||||||
import re, string, time
|
import re, string, time
|
||||||
from calibre import entity_to_unicode, strftime
|
from calibre import entity_to_unicode, strftime
|
||||||
|
from datetime import timedelta, date
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
||||||
|
|
||||||
|
|
||||||
class NYTimes(BasicNewsRecipe):
|
class NYTimes(BasicNewsRecipe):
|
||||||
|
|
||||||
# set headlinesOnly to True for the headlines-only version
|
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
||||||
headlinesOnly = False
|
headlinesOnly = False
|
||||||
|
|
||||||
|
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
|
||||||
|
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
|
||||||
|
# will be included. Note: oldest_article is ignored if webEdition = False
|
||||||
|
webEdition = False
|
||||||
|
oldest_article = 7
|
||||||
|
|
||||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||||
# Otherwise, only the sections named will be included. For example,
|
# Otherwise, only the sections named will be included. For example,
|
||||||
#
|
#
|
||||||
@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
# from an article (if one exists). If one_picture_per_article = True, the image
|
# from an article (if one exists). If one_picture_per_article = True, the image
|
||||||
# will be moved to a location between the headline and the byline.
|
# will be moved to a location between the headline and the byline.
|
||||||
# If one_picture_per_article = False, all images from the article will be included
|
# If one_picture_per_article = False, all images from the article will be included
|
||||||
|
|
||||||
# and shown in their original location.
|
# and shown in their original location.
|
||||||
one_picture_per_article = True
|
one_picture_per_article = False
|
||||||
|
|
||||||
# The maximum number of articles that will be downloaded
|
# The maximum number of articles that will be downloaded
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
|
|
||||||
|
# Whether to omit duplicates of articles (typically arsing when articles are indexed in
|
||||||
|
# more than one section). If True, only the first occurance will be downloaded.
|
||||||
|
filterDuplicates = True
|
||||||
|
|
||||||
|
# Sections to collect for the Web edition.
|
||||||
|
# Delete any you don't want, or use includeSections or excludeSections
|
||||||
|
web_sections = [(u'World',u'world'),
|
||||||
|
(u'U.S.',u'national'),
|
||||||
|
(u'Politics',u'politics'),
|
||||||
|
(u'New York',u'nyregion'),
|
||||||
|
(u'Business','business'),
|
||||||
|
(u'Technology',u'technology'),
|
||||||
|
(u'Sports',u'sports'),
|
||||||
|
(u'Science',u'science'),
|
||||||
|
(u'Health',u'health'),
|
||||||
|
(u'Opinion',u'opinion'),
|
||||||
|
(u'Arts',u'arts'),
|
||||||
|
(u'Books',u'books'),
|
||||||
|
(u'Movies',u'movies'),
|
||||||
|
(u'Music',u'arts/music'),
|
||||||
|
(u'Television',u'arts/television'),
|
||||||
|
(u'Style',u'style'),
|
||||||
|
(u'Dining & Wine',u'dining'),
|
||||||
|
(u'Fashion & Style',u'fashion'),
|
||||||
|
(u'Home & Garden',u'garden'),
|
||||||
|
(u'Travel',u'travel'),
|
||||||
|
('Education',u'education'),
|
||||||
|
('Multimedia',u'multimedia'),
|
||||||
|
(u'Obituaries',u'obituaries'),
|
||||||
|
(u'Sunday Magazine',u'magazine'),
|
||||||
|
(u'Week in Review',u'weekinreview')]
|
||||||
|
|
||||||
|
|
||||||
if headlinesOnly:
|
if headlinesOnly:
|
||||||
title='New York Times Headlines'
|
title='New York Times Headlines'
|
||||||
description = 'Headlines from the New York Times'
|
description = 'Headlines from the New York Times'
|
||||||
|
needs_subscription = False
|
||||||
|
elif webEdition:
|
||||||
|
title='New York Times (Web)'
|
||||||
|
description = 'New York Times on the Web'
|
||||||
|
needs_subscription = True
|
||||||
else:
|
else:
|
||||||
title='New York Times'
|
title='New York Times'
|
||||||
description = 'Today\'s New York Times'
|
description = 'Today\'s New York Times'
|
||||||
|
needs_subscription = True
|
||||||
|
|
||||||
|
|
||||||
|
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
||||||
|
|
||||||
|
def decode_us_date(self,datestr):
|
||||||
|
udate = datestr.strip().lower().split()
|
||||||
|
try:
|
||||||
|
m = self.month_list.index(udate[0])+1
|
||||||
|
except:
|
||||||
|
return date.today()
|
||||||
|
d = int(udate[1])
|
||||||
|
y = int(udate[2])
|
||||||
|
try:
|
||||||
|
d = date(y,m,d)
|
||||||
|
except:
|
||||||
|
d = date.today
|
||||||
|
return d
|
||||||
|
|
||||||
|
earliest_date = date.today() - timedelta(days=oldest_article)
|
||||||
|
|
||||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
@ -60,7 +124,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
|
|
||||||
timefmt = ''
|
timefmt = ''
|
||||||
needs_subscription = True
|
|
||||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||||
cover_margins = (18,18,'grey99')
|
cover_margins = (18,18,'grey99')
|
||||||
|
|
||||||
@ -137,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
.image {text-align: center;}
|
.image {text-align: center;}
|
||||||
.source {text-align: left; }'''
|
.source {text-align: left; }'''
|
||||||
|
|
||||||
|
|
||||||
|
articles = {}
|
||||||
|
key = None
|
||||||
|
ans = []
|
||||||
|
url_list = []
|
||||||
|
|
||||||
def filter_ans(self, ans) :
|
def filter_ans(self, ans) :
|
||||||
total_article_count = 0
|
total_article_count = 0
|
||||||
idx = 0
|
idx = 0
|
||||||
@ -165,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
self.log( "Queued %d articles" % total_article_count )
|
self.log( "Queued %d articles" % total_article_count )
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
def exclude_url(self,url):
|
||||||
|
if not url.startswith("http"):
|
||||||
|
return True
|
||||||
|
if not url.endswith(".html"):
|
||||||
|
return True
|
||||||
|
if 'nytimes.com' not in url:
|
||||||
|
return True
|
||||||
|
if 'podcast' in url:
|
||||||
|
return True
|
||||||
|
if '/video/' in url:
|
||||||
|
return True
|
||||||
|
if '/slideshow/' in url:
|
||||||
|
return True
|
||||||
|
if '/magazine/index' in url:
|
||||||
|
return True
|
||||||
|
if '/interactive/' in url:
|
||||||
|
return True
|
||||||
|
if '/reference/' in url:
|
||||||
|
return True
|
||||||
|
if '/premium/' in url:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def fixChars(self,string):
|
def fixChars(self,string):
|
||||||
# Replace lsquo (\x91)
|
# Replace lsquo (\x91)
|
||||||
fixed = re.sub("\x91","‘",string)
|
fixed = re.sub("\x91","‘",string)
|
||||||
@ -250,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
return BeautifulSoup(_raw, markupMassage=massage)
|
return BeautifulSoup(_raw, markupMassage=massage)
|
||||||
|
|
||||||
# Entry point
|
# Entry point
|
||||||
print "index_to_soup()"
|
|
||||||
soup = get_the_soup( self.encoding, url_or_raw )
|
soup = get_the_soup( self.encoding, url_or_raw )
|
||||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
||||||
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
||||||
@ -274,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
else:
|
else:
|
||||||
return description
|
return description
|
||||||
|
|
||||||
def parse_todays_index(self):
|
def feed_title(self,div):
|
||||||
|
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||||
|
|
||||||
def feed_title(div):
|
def handle_article(self,div):
|
||||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
thumbnail = div.find('div','thumbnail')
|
||||||
|
if thumbnail:
|
||||||
articles = {}
|
thumbnail.extract()
|
||||||
key = None
|
a = div.find('a', href=True)
|
||||||
ans = []
|
if not a:
|
||||||
url_list = []
|
return
|
||||||
|
url = re.sub(r'\?.*', '', a['href'])
|
||||||
def handle_article(div):
|
if self.exclude_url(url):
|
||||||
a = div.find('a', href=True)
|
return
|
||||||
if not a:
|
url += '?pagewanted=all'
|
||||||
|
if self.filterDuplicates:
|
||||||
|
if url in self.url_list:
|
||||||
return
|
return
|
||||||
url = re.sub(r'\?.*', '', a['href'])
|
self.url_list.append(url)
|
||||||
if not url.startswith("http"):
|
title = self.tag_to_string(a, use_alt=True).strip()
|
||||||
return
|
description = ''
|
||||||
if not url.endswith(".html"):
|
pubdate = strftime('%a, %d %b')
|
||||||
return
|
summary = div.find(True, attrs={'class':'summary'})
|
||||||
if 'podcast' in url:
|
if summary:
|
||||||
return
|
description = self.tag_to_string(summary, use_alt=False)
|
||||||
if '/video/' in url:
|
author = ''
|
||||||
return
|
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||||
url += '?pagewanted=all'
|
if authorAttribution:
|
||||||
if url in url_list:
|
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||||
return
|
else:
|
||||||
url_list.append(url)
|
|
||||||
title = self.tag_to_string(a, use_alt=True).strip()
|
|
||||||
description = ''
|
|
||||||
pubdate = strftime('%a, %d %b')
|
|
||||||
summary = div.find(True, attrs={'class':'summary'})
|
|
||||||
if summary:
|
|
||||||
description = self.tag_to_string(summary, use_alt=False)
|
|
||||||
author = ''
|
|
||||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||||
if authorAttribution:
|
if authorAttribution:
|
||||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||||
else:
|
feed = self.key if self.key is not None else 'Uncategorized'
|
||||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
if not self.articles.has_key(feed):
|
||||||
if authorAttribution:
|
self.ans.append(feed)
|
||||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
self.articles[feed] = []
|
||||||
feed = key if key is not None else 'Uncategorized'
|
self.articles[feed].append(
|
||||||
if not articles.has_key(feed):
|
dict(title=title, url=url, date=pubdate,
|
||||||
ans.append(feed)
|
description=description, author=author,
|
||||||
articles[feed] = []
|
content=''))
|
||||||
articles[feed].append(
|
|
||||||
dict(title=title, url=url, date=pubdate,
|
|
||||||
description=description, author=author,
|
|
||||||
content=''))
|
|
||||||
|
|
||||||
|
|
||||||
|
def parse_web_edition(self):
|
||||||
|
|
||||||
|
for (sec_title,index_url) in self.web_sections:
|
||||||
|
if self.includeSections != []:
|
||||||
|
if sec_title not in self.includeSections:
|
||||||
|
print "SECTION NOT INCLUDED: ",sec_title
|
||||||
|
continue
|
||||||
|
if sec_title in self.excludeSections:
|
||||||
|
print "SECTION EXCLUDED: ",sec_title
|
||||||
|
continue
|
||||||
|
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
|
||||||
|
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
|
||||||
|
self.key = sec_title
|
||||||
|
# Find each article
|
||||||
|
for div in soup.findAll(True,
|
||||||
|
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||||
|
if div['class'] in ['story', 'story headline'] :
|
||||||
|
self.handle_article(div)
|
||||||
|
elif div['class'] == 'headlinesOnly multiline flush':
|
||||||
|
for lidiv in div.findAll('li'):
|
||||||
|
self.handle_article(lidiv)
|
||||||
|
|
||||||
|
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||||
|
return self.filter_ans(self.ans)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_todays_index(self):
|
||||||
|
|
||||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
||||||
|
|
||||||
|
skipping = False
|
||||||
# Find each article
|
# Find each article
|
||||||
for div in soup.findAll(True,
|
for div in soup.findAll(True,
|
||||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||||
|
|
||||||
if div['class'] in ['section-headline','sectionHeader']:
|
if div['class'] in ['section-headline','sectionHeader']:
|
||||||
key = string.capwords(feed_title(div))
|
self.key = string.capwords(self.feed_title(div))
|
||||||
key = key.replace('Op-ed','Op-Ed')
|
self.key = self.key.replace('Op-ed','Op-Ed')
|
||||||
key = key.replace('U.s.','U.S.')
|
self.key = self.key.replace('U.s.','U.S.')
|
||||||
|
self.key = self.key.replace('N.y.','N.Y.')
|
||||||
|
skipping = False
|
||||||
|
if self.includeSections != []:
|
||||||
|
if self.key not in self.includeSections:
|
||||||
|
print "SECTION NOT INCLUDED: ",self.key
|
||||||
|
skipping = True
|
||||||
|
if self.key in self.excludeSections:
|
||||||
|
print "SECTION EXCLUDED: ",self.key
|
||||||
|
skipping = True
|
||||||
|
|
||||||
elif div['class'] in ['story', 'story headline'] :
|
elif div['class'] in ['story', 'story headline'] :
|
||||||
handle_article(div)
|
if not skipping:
|
||||||
|
self.handle_article(div)
|
||||||
elif div['class'] == 'headlinesOnly multiline flush':
|
elif div['class'] == 'headlinesOnly multiline flush':
|
||||||
for lidiv in div.findAll('li'):
|
for lidiv in div.findAll('li'):
|
||||||
handle_article(lidiv)
|
if not skipping:
|
||||||
|
self.handle_article(lidiv)
|
||||||
|
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||||
return self.filter_ans(ans)
|
return self.filter_ans(self.ans)
|
||||||
|
|
||||||
def parse_headline_index(self):
|
def parse_headline_index(self):
|
||||||
|
|
||||||
articles = {}
|
|
||||||
ans = []
|
|
||||||
url_list = []
|
|
||||||
|
|
||||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||||
|
|
||||||
# Fetch the content table
|
# Fetch the content table
|
||||||
@ -364,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
|
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
|
||||||
for div_sec in td_col.findAll('div',recursive=False):
|
for div_sec in td_col.findAll('div',recursive=False):
|
||||||
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
|
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
|
||||||
|
|
||||||
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
|
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
|
||||||
section_name = re.sub(r'^ *$','',section_name)
|
section_name = re.sub(r'^ *$','',section_name)
|
||||||
|
|
||||||
if section_name == '':
|
if section_name == '':
|
||||||
continue
|
continue
|
||||||
|
if self.includeSections != []:
|
||||||
|
if section_name not in self.includeSections:
|
||||||
|
print "SECTION NOT INCLUDED: ",section_name
|
||||||
|
continue
|
||||||
|
if section_name in self.excludeSections:
|
||||||
|
print "SECTION EXCLUDED: ",section_name
|
||||||
|
continue
|
||||||
|
|
||||||
section_name=string.capwords(section_name)
|
section_name=string.capwords(section_name)
|
||||||
if section_name == 'U.s.':
|
section_name = section_name.replace('Op-ed','Op-Ed')
|
||||||
section_name = 'U.S.'
|
section_name = section_name.replace('U.s.','U.S.')
|
||||||
elif section_name == 'Op-ed':
|
section_name = section_name.replace('N.y.','N.Y.')
|
||||||
section_name = 'Op-Ed'
|
|
||||||
pubdate = strftime('%a, %d %b')
|
pubdate = strftime('%a, %d %b')
|
||||||
|
|
||||||
search_div = div_sec
|
search_div = div_sec
|
||||||
@ -393,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
if not a:
|
if not a:
|
||||||
continue
|
continue
|
||||||
url = re.sub(r'\?.*', '', a['href'])
|
url = re.sub(r'\?.*', '', a['href'])
|
||||||
if not url.startswith("http"):
|
if self.exclude_url(url):
|
||||||
continue
|
|
||||||
if not url.endswith(".html"):
|
|
||||||
continue
|
|
||||||
if 'podcast' in url:
|
|
||||||
continue
|
|
||||||
if 'video' in url:
|
|
||||||
continue
|
continue
|
||||||
url += '?pagewanted=all'
|
url += '?pagewanted=all'
|
||||||
if url in url_list:
|
if self.filterDuplicates:
|
||||||
continue
|
if url in self.url_list:
|
||||||
url_list.append(url)
|
continue
|
||||||
self.log("URL %s" % url)
|
self.url_list.append(url)
|
||||||
title = self.tag_to_string(a, use_alt=True).strip()
|
title = self.tag_to_string(a, use_alt=True).strip()
|
||||||
desc = h3_item.find('p')
|
desc = h3_item.find('p')
|
||||||
if desc is not None:
|
if desc is not None:
|
||||||
description = self.tag_to_string(desc,use_alt=False)
|
description = self.tag_to_string(desc,use_alt=False)
|
||||||
else:
|
else:
|
||||||
description = ''
|
description = ''
|
||||||
if not articles.has_key(section_name):
|
if not self.articles.has_key(section_name):
|
||||||
ans.append(section_name)
|
self.ans.append(section_name)
|
||||||
articles[section_name] = []
|
self.articles[section_name] = []
|
||||||
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
||||||
|
|
||||||
|
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
return self.filter_ans(self.ans)
|
||||||
return self.filter_ans(ans)
|
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
if self.headlinesOnly:
|
if self.headlinesOnly:
|
||||||
return self.parse_headline_index()
|
return self.parse_headline_index()
|
||||||
|
elif self.webEdition:
|
||||||
|
return self.parse_web_edition()
|
||||||
else:
|
else:
|
||||||
return self.parse_todays_index()
|
return self.parse_todays_index()
|
||||||
|
|
||||||
@@ -439,6 +561,21 @@ class NYTimes(BasicNewsRecipe):

     def preprocess_html(self, soup):

+        if self.webEdition & (self.oldest_article>0):
+            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+            if date_tag:
+                date_str = self.tag_to_string(date_tag,use_alt=False)
+                date_str = date_str.replace('Published:','')
+                date_items = date_str.split(',')
+                try:
+                    datestring = date_items[0]+' '+date_items[1]
+                    article_date = self.decode_us_date(datestring)
+                except:
+                    article_date = date.today()
+                if article_date < self.earliest_date:
+                    self.log("Skipping article dated %s" % date_str)
+                    return None
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
         if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)

@@ -463,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
                 for inlineImg in inlineImgs[1:]:
                     inlineImg.extract()
             # Move firstImg before article body
-            #article_body = soup.find(True, {'id':'articleBody'})
             cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
             if cgFirst:
                 # Strip all sibling NavigableStrings: noise
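The new date gate calls decode_us_date(), which is defined elsewhere in the recipe. A minimal sketch of what the call above assumes it does (hypothetical body, for a "Published: November 19, 2010" style dateline):

    from datetime import datetime

    def decode_us_date(self, datestring):
        # 'November 19  2010' -> datetime.date(2010, 11, 19)
        return datetime.strptime(datestring.strip(), '%B %d %Y').date()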
resources/recipes/observa_digital.recipe (new file, 63 lines)
@@ -0,0 +1,63 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
observa.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = 'Observa Digital'
    __author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
    description = 'Noticias desde Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['contenido'])]
    remove_tags = [
        dict(name='div', attrs={'id':'contenedorVinculadas'}),
        dict(name='p', attrs={'id':'nota_firma'}),
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Actualidad', u'http://www.observa.com.uy/RSS/actualidad.xml'),
        (u'Deportes', u'http://www.observa.com.uy/RSS/deportes.xml'),
        (u'Vida', u'http://www.observa.com.uy/RSS/vida.xml'),
        (u'Ciencia y Tecnologia', u'http://www.observa.com.uy/RSS/ciencia.xml')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.elobservador.com.uy/elobservador/nav_portada.asp?suplemento=dia'
        soup = self.index_to_soup(index)
        link_item = soup.find('img',attrs={'usemap':'#mapeo_imagenes'})
        if link_item:
            cover_url = 'http://www.elobservador.com.uy'+link_item['src'].strip()

        print cover_url

        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@@ -21,8 +21,16 @@ class Pagina12(BasicNewsRecipe):
     use_embedded_content = False
     language = 'es'
     remove_empty_feeds = True
+    publication_type = 'newspaper'
     masthead_url = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif'
-    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} #autor{font-weight: bold} #fecha,#epigrafe{font-size: 0.9em; margin: 5px} #imagen{border: 1px solid black; margin: 0 0 1.25em 1.25em; width: 232px } '
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif }
+        img{margin-bottom: 0.4em; display:block}
+        #autor{font-weight: bold}
+        #fecha,#epigrafe{font-size: 0.9em; margin: 5px}
+        #imagen{border: 1px solid black; margin: 0 0 1.25em 1.25em; width: 232px }
+        .fgprincipal{font-size: large; font-weight: bold}
+    """

     conversion_options = {
         'comment' : description

@@ -31,7 +39,11 @@ class Pagina12(BasicNewsRecipe):
         , 'language' : language
     }

-    remove_tags = [dict(name='div', attrs={'id':['volver','logo','logo_suple','fin','permalink']})]
+    remove_tags = [
+        dict(name=['meta','link'])
+        ,dict(name='div', attrs={'id':['volver','logo','logo_suple','fin','permalink']})
+    ]
+    remove_attributes=['lang']

     feeds = [

@@ -65,4 +77,13 @@ class Pagina12(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
+        for item in soup.findAll('span', attrs={'id':'seccion'}):
+            it = item.a
+            it.name='span'
+            del it['href']
+            del it['title']
+        for item in soup.findAll('p'):
+            it = item.find('h3')
+            if it:
+                it.name='span'
         return soup
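A standalone check of what the new preprocess_html() loop does to a section label (toy markup, using calibre's bundled BeautifulSoup):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<span id="seccion"><a href="/sec" title="x">EL PAIS</a></span>')
    for item in soup.findAll('span', attrs={'id':'seccion'}):
        it = item.a
        it.name = 'span'   # demote the link to plain inline text
        del it['href']
        del it['title']
    print soup   # roughly: <span id="seccion"><span>EL PAIS</span></span>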
resources/recipes/pc_lab.recipe (new file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python

from calibre.web.feeds.recipes import BasicNewsRecipe

class PCLab(BasicNewsRecipe):
    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'
    oldest_article = 30.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class':['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['chapters']})
        ,dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':['navigation']})
    ]

    # links to RSS feeds
    feeds = [ ('PCLab', u'http://pclab.pl/xml/artykuly.xml') ]

    # load second and subsequent page content
    # in:  soup - full page with 'next' button
    # out: appendtag - tag to which new page is to be added
    def append_page(self, soup, appendtag):
        # find the 'Next' button
        pager = soup.find('div', attrs={'class':'next'})

        if pager:
            # search for 'a' element with link to next page (exit if not found)
            a = pager.find('a')
            if a:
                nexturl = a['href']

                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)

                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
                pagetext.extract()

                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
                pos = len(appendtag.contents)

                self.append_page(soup2, appendtag)

    def preprocess_html(self, soup):
        # soup.body contains no title and no navigator, they are in soup
        self.append_page(soup, soup.body)

        # finally remove some tags
        tags = soup.findAll('div',attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
        [tag.extract() for tag in tags]

        return soup
@@ -1,13 +1,10 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 politika.rs
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 class Politika(BasicNewsRecipe):
     title = 'Politika Online'

@@ -19,53 +16,51 @@ class Politika(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    remove_javascript = True
     encoding = 'utf8'
-    language = 'sr'
-    lang = 'sr-Latn-RS'
-    direction = 'ltr'
-    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
+    delay = 1
+    language = 'sr'
+    publication_type = 'newspaper'
+    masthead_url = 'http://static.politika.co.rs/images_new/politika.gif'
+    extra_css = """
+        @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Arial,Helvetica,sans1,sans-serif}
+        h1{font-family: "Times New Roman",Times,serif1,serif}
+        .articledescription{font-family: sans1, sans-serif}
+    """

     conversion_options = {
         'comment' : description
         , 'tags' : category
         , 'publisher' : publisher
-        , 'language' : lang
-        , 'pretty_print' : True
+        , 'language' : language
     }

     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

-    keep_only_tags = [dict(name='div', attrs={'class':'content_center_border'})]
-    remove_tags = [
-        dict(name='div', attrs={'class':['send_print','txt-komentar']})
-        ,dict(name=['object','link','a'])
-        ,dict(name='h1', attrs={'class':'box_header-tags'})
-    ]
+    keep_only_tags = [dict(name='div', attrs={'class':'big_article_home item_details'})]
+    remove_tags_after = dict(attrs={'class':'online_date'})
+    remove_tags = [dict(name=['link','meta','iframe','embed','object'])]

     feeds = [
         (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' )
         ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' )
-        ,(u'Redakcijski komentari', u'http://www.politika.rs/rubrike/redakcijski-komentari/index.1.lt.xml')
+        ,(u'Ostali komentari' , u'http://www.politika.rs/rubrike/ostali-komentari/index.1.lt.xml' )
         ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' )
-        ,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' )
+        ,(u'Pogledi sa strane', u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml')
         ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' )
         ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' )
-        ,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' )
+        ,(u'Spektar' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' )
     ]

     def preprocess_html(self, soup):
-        soup.html['lang'] = self.lang
-        soup.html['dir' ] = self.direction
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        soup.head.insert(0,mlang)
         for item in soup.findAll(style=True):
             del item['style']
-        ftag = soup.find('div',attrs={'class':'content_center_border'})
-        if ftag.has_key('align'):
-            del ftag['align']
-        return self.adeify_images(soup)
+        for item in soup.findAll('a', attrs={'class':'category'}):
+            item.name='span'
+            if item.has_key('href'):
+                del item['href']
+            if item.has_key('title'):
+                del item['title']
+        return soup
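The preprocess_regexps entry kept in this hunk swaps every U+0110 (LATIN CAPITAL LETTER D WITH STROKE) for the visually near-identical U+00D0 (ETH), which more reader fonts of the period included. Worked example:

    import re

    pattern, repl = re.compile(u'\u0110'), lambda match: u'\u00D0'
    print pattern.sub(repl, u'\u0110akovica')   # -> u'\u00D0akovica'; lowercase u'\u0111' is untouched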
68
resources/recipes/polityka.recipe
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Polityka(BasicNewsRecipe):
|
||||||
|
|
||||||
|
title = u'Polityka'
|
||||||
|
__author__ = 'Mateusz Kielar'
|
||||||
|
description = 'Weekly magazine. Last archive issue'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
no_stylesheets = True
|
||||||
|
language = 'en'
|
||||||
|
remove_javascript = True
|
||||||
|
|
||||||
|
remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
|
||||||
|
remove_tags_after = dict(dict(name = 'div', attrs = {'class' : 'box_footer'}))
|
||||||
|
|
||||||
|
remove_tags =[]
|
||||||
|
remove_tags.append(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
|
||||||
|
remove_tags.append(dict(name = 'div', attrs = {'class' : 'box_footer'}))
|
||||||
|
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
h1 {font-size: x-large; font-weight: bold}
|
||||||
|
'''
|
||||||
|
|
||||||
|
def parse_index(self):
|
||||||
|
soup = self.index_to_soup('http://archiwum.polityka.pl/')
|
||||||
|
box_img3 = soup.findAll(attrs={'class' : 'box_img3'})
|
||||||
|
feeds = []
|
||||||
|
last = 0
|
||||||
|
self.cover_url = 'http://archiwum.polityka.pl' + box_img3[-1].find('img')['src']
|
||||||
|
last_edition = 'http://archiwum.polityka.pl' + box_img3[-1].find('a')['href']
|
||||||
|
|
||||||
|
while True:
|
||||||
|
index = self.index_to_soup(last_edition)
|
||||||
|
|
||||||
|
|
||||||
|
box_list = index.findAll('div', attrs={'class' : 'box_list'})
|
||||||
|
if len(box_list) == 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
articles = {}
|
||||||
|
for box in box_list:
|
||||||
|
for div in box.findAll('div', attrs={'class': 'list_tresc'}):
|
||||||
|
article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
|
||||||
|
section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
|
||||||
|
print section
|
||||||
|
if not articles.has_key(section):
|
||||||
|
articles[section] = []
|
||||||
|
articles[section].append( {
|
||||||
|
'title' : self.tag_to_string(div.a),
|
||||||
|
'url' : 'http://archiwum.polityka.pl' + div.a['href'],
|
||||||
|
'date' : '',
|
||||||
|
'description' : ''
|
||||||
|
})
|
||||||
|
|
||||||
|
for section in articles:
|
||||||
|
feeds.append((section, articles[section]))
|
||||||
|
|
||||||
|
last_edition = last_edition.replace('http://archiwum.polityka.pl/wydanie/' + str(last), 'http://archiwum.polityka.pl/wydanie/' + str(last + 1))
|
||||||
|
last = last + 1
|
||||||
|
|
||||||
|
return feeds
|
||||||
|
|
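parse_index() above steps through archive editions by rewriting the edition number in the URL until a page yields no div.box_list. The URL arithmetic in isolation (edition URLs of the shape .../wydanie/N are an assumption read off the replace() call):

    last = 0
    last_edition = 'http://archiwum.polityka.pl/wydanie/0'   # assumed shape
    last_edition = last_edition.replace(
        'http://archiwum.polityka.pl/wydanie/' + str(last),
        'http://archiwum.polityka.pl/wydanie/' + str(last + 1))
    last = last + 1
    # last_edition is now 'http://archiwum.polityka.pl/wydanie/1'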
resources/recipes/rds.recipe (new file, 18 lines)
@@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290013720(BasicNewsRecipe):
    title = u'RDS'
    __author__ = 'Nexus'
    language = 'en_CA'
    description = 'Hockey News'
    oldest_article = 7
    max_articles_per_feed = 25
    no_stylesheets = True
    remove_tags = [dict(name='div', attrs={'id':'rdsWrap'}),
                   dict(name='table', attrs={'id':'aVoir'}),
                   dict(name='div', attrs={'id':'imageChronique'})]
    keep_only_tags = [dict(name='div', attrs={'id':['enteteChronique']}),
                      dict(name='div', attrs={'id':['contenuChronique']})]

    feeds = [(u'RDS', u'http://www.rds.ca/hockey/fildepresse_rds.xml')]
resources/recipes/reuters_ja.recipe (new file, 37 lines)
@@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class ReutersJa(BasicNewsRecipe):

    title = 'Reuters(Japan)'
    description = 'Global news in Japanese'
    __author__ = 'Hiroshi Miura'
    use_embedded_content = False
    language = 'ja'
    max_articles_per_feed = 10
    remove_javascript = True

    feeds = [ ('Top Stories', 'http://feeds.reuters.com/reuters/JPTopNews?format=xml'),
              ('World News', 'http://feeds.reuters.com/reuters/JPWorldNews?format=xml'),
              ('Business News', 'http://feeds.reuters.com/reuters/JPBusinessNews?format=xml'),
              ('Technology News', 'http://feeds.reuters.com/reuters/JPTechnologyNews?format=xml'),
              ('Oddly Enough News', 'http://feeds.reuters.com/reuters/JPOddlyEnoughNews?format=xml')
            ]

    remove_tags_before = {'class':"article primaryContent"}
    remove_tags = [ dict(id="banner"),
                    dict(id="autilities"),
                    dict(id="textSizer"),
                    dict(id="shareFooter"),
                    dict(id="relatedNews"),
                    dict(id="editorsChoice"),
                    dict(id="ecArticles"),
                    {'class':"secondaryContent"},
                    {'class':"module"},
                  ]
    remove_tags_after = {'class':"assetBuddy"}

    def print_version(self, url):
        m = re.search('(.*idJPJAPAN-[0-9]+)', url)
        return m.group(0)+'?sp=true'
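print_version() keeps everything up to the numeric article id and asks for the single-page view. With an illustrative URL (assumed shape, not taken from the feed):

    import re

    url = 'http://jp.reuters.com/article/topNews/idJPJAPAN-18361020101119?feedType=RSS'
    m = re.search('(.*idJPJAPAN-[0-9]+)', url)
    print m.group(0) + '?sp=true'
    # http://jp.reuters.com/article/topNews/idJPJAPAN-18361020101119?sp=true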
resources/recipes/revista_bla.recipe (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.revistabla.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = 'Revista Bla'
    __author__ = 'Gustavo Azambuja'
    description = 'Moda | Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 20
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['body_container'])]
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object','link']) ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Articulos', u'http://www.revistabla.com/feed/')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.revistabla.com'
        soup = self.index_to_soup(index)
        link_item = soup.find('div',attrs={'class':'header_right'})
        if link_item:
            cover_url = link_item.img['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@@ -108,3 +108,10 @@ class RevistaMuyInteresante(BasicNewsRecipe):
             feeds.append((title, articles))
         return feeds
+
+    def get_cover_url(self):
+        index = 'http://www.muyinteresante.es/revista'
+        soup = self.index_to_soup(index)
+        link_item = soup.find('img',attrs={'class':'img_portada'})
+        if link_item:
+            cover_url = "http://www.muyinteresante.es"+link_item['src']
+        return cover_url
resources/recipes/rollingstone.recipe (new file, 69 lines)
@@ -0,0 +1,69 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
rollingstone.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class RollingStone(BasicNewsRecipe):
    title = 'Rolling Stone Magazine - free content'
    __author__ = 'Darko Miletic'
    description = 'Rolling Stone Magazine features music, album and artist news, movie reviews, political, economic and pop culture commentary, videos, photos, and more.'
    publisher = 'Werner Media inc.'
    category = 'news, music, USA, world'
    oldest_article = 15
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'magazine'
    masthead_url = 'http://www.rollingstone.com/templates/rolling-stone-templates/theme/rstheme/images/rsLogo.png'
    extra_css = """
        body{font-family: Georgia,Times,serif }
        img{margin-bottom: 0.4em; display:block}
    """

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }

    preprocess_regexps = [
        (re.compile(r'xml:lang="en">.*?<head>', re.DOTALL|re.IGNORECASE),lambda match: 'xml:lang="en">\n<head>\n')
        ,(re.compile(r'</title>.*?</head>' , re.DOTALL|re.IGNORECASE),lambda match: '</title>\n</head>\n' )
    ]

    keep_only_tags=[
        dict(attrs={'class':['headerImgHolder','headerContent']})
        ,dict(name='div',attrs={'id':['teaser','storyTextContainer']})
        ,dict(name='div',attrs={'class':'blogDetailModule clearfix'})
    ]

    remove_tags = [
        dict(name=['meta','iframe','object','embed'])
        ,dict(attrs={'id':'mpStoryHeader'})
        ,dict(attrs={'class':'relatedTopics'})
    ]
    remove_attributes=['lang','onclick','width','height','name']
    remove_tags_before=dict(attrs={'class':'bloggerInfo'})
    remove_tags_after=dict(attrs={'class':'relatedTopics'})

    feeds = [
        (u'All News' , u'http://www.rollingstone.com/siteServices/rss/allNews' )
        ,(u'All Blogs' , u'http://www.rollingstone.com/siteServices/rss/allBlogs' )
        ,(u'Movie Reviews' , u'http://www.rollingstone.com/siteServices/rss/movieReviews' )
        ,(u'Album Reviews' , u'http://www.rollingstone.com/siteServices/rss/albumReviews' )
        ,(u'Song Reviews' , u'http://www.rollingstone.com/siteServices/rss/songReviews' )
    ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
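The two preprocess_regexps entries cut everything between the opening tag and <head>, and between </title> and </head>, before the page is parsed. A toy before/after:

    import re

    html = '<html xml:lang="en"><!--ads--><head><title>T</title><script>x()</script></head>'
    for pat, fn in [
        (re.compile(r'xml:lang="en">.*?<head>', re.DOTALL|re.IGNORECASE), lambda m: 'xml:lang="en">\n<head>\n'),
        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda m: '</title>\n</head>\n'),
    ]:
        html = pat.sub(fn, html)
    print html   # <html xml:lang="en">\n<head>\n<title>T</title>\n</head>\n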
@@ -6,6 +6,7 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 spiegel.de
 '''

+from time import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class Spiegel_ger(BasicNewsRecipe):

@@ -44,3 +45,6 @@ class Spiegel_ger(BasicNewsRecipe):
         rmain, rsep, rrest = main.rpartition(',')
         purl = rmain + ',druck-' + rrest + ',' + rest
         return purl
+
+    def get_cover_url(self):
+        return 'http://wissen.spiegel.de/wissen/titel/SP/' + strftime("%Y/%W/%j/titel.jpg")
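The added get_cover_url() derives the cover path from the build date: %Y/%W/%j is year, Monday-based week number and day of year. For 19 November 2010 that gives 2010/46/323:

    from time import strftime

    print 'http://wissen.spiegel.de/wissen/titel/SP/' + strftime("%Y/%W/%j/titel.jpg")
    # on 2010-11-19: http://wissen.spiegel.de/wissen/titel/SP/2010/46/323/titel.jpg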
@@ -7,7 +7,7 @@ class AdvancedUserRecipe1284927619(BasicNewsRecipe):
     __author__ = 'noxxx'
     max_articles_per_feed = 100
     description = 'tagesanzeiger.ch: Nichts verpassen'
-    category = 'News, Politik, Nachrichten, Schweiz, Zürich'
+    category = 'News, Politik, Nachrichten, Schweiz, Zuerich'
     language = 'de'

     conversion_options = {
@@ -3,12 +3,12 @@
 __license__ = 'GPL v3'
 __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

-''' http://www.derstandard.at - Austrian Newspaper '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class TelepolisNews(BasicNewsRecipe):
-    title = u'Telepolis (News)'
+    title = u'Telepolis (News+Artikel)'
     __author__ = 'Gerhard Aigner'
     publisher = 'Heise Zeitschriften Verlag GmbH & Co KG'
     description = 'News from telepolis'

@@ -20,16 +20,16 @@ class TelepolisNews(BasicNewsRecipe):
     encoding = "utf-8"
     language = 'de_AT'

-    use_embedded_content = False
+    use_embedded_content =False
     remove_empty_feeds = True

     preprocess_regexps = [(re.compile(r'<a[^>]*>', re.DOTALL|re.IGNORECASE), lambda match: ''),
         (re.compile(r'</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),]

-    keep_only_tags = [dict(name = 'table',attrs={'class':'blogtable'})]
-    remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'})]
+    keep_only_tags = [dict(name = 'td',attrs={'class':'bloghead'}),dict(name = 'td',attrs={'class':'blogfliess'})]
+    remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'}), dict(name='td',attrs={'class':'forum'})]

-    feeds = [(u'News', u'http://www.heise.de/tp/news.rdf')]
+    feeds = [(u'News', u'http://www.heise.de/tp/news-atom.xml')]

     html2lrf_options = [
         '--comment' , description

@@ -41,7 +41,7 @@ class TelepolisNews(BasicNewsRecipe):

     def get_article_url(self, article):
         '''if the linked article is of kind artikel don't take it'''
-        if (article.link.count('artikel') > 0) :
+        if (article.link.count('artikel') > 1) :
             return None
         return article.link

@@ -49,3 +49,5 @@ class TelepolisNews(BasicNewsRecipe):
         mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
         soup.head.insert(0,mtag)
         return soup
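Effect of relaxing the get_article_url() test from '> 0' to '> 1': with the new Atom feed, a normal article link containing 'artikel' once is now kept, and only links with two occurrences are dropped. Illustrative URL (assumed shape, not taken from the feed):

    link = 'http://www.heise.de/tp/artikel/33/33801/1.html'
    print link.count('artikel')   # 1: dropped under '> 0', kept under '> 1'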