[Sync] Sync with trunk r7149

Changelog.yaml (+287 lines)
@@ -4,6 +4,293 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.

- version: 0.7.32
  date: 2010-12-03

  new features:
    - title: "All new linux binary build, with updated libraries and replacing cx_Freeze with my own C python launcher code."

    - title: "Edit metadata dialog: Add Next and Previous buttons and show cover size in tooltip"
      tickets: [7706, 7711]

    - title: "A new custom column type: Enumeration. This column can take one of a user-defined set of values."

    - title: "PML Output: Add option to reduce image sizes/bit depth to allow PML Output to be used with DropBook"

    - title: "TXT Output: Add option to generate Markdown output. Turn <br> tags into spaces."

    - title: "Add a count function to the template language. Make author_sort searchable."

    - title: "Various consistency and usability enhancements to the search box."
      tickets: [7726]
      description: >
          "Always select the first book in the result set of a search. Similar-books searches are added to the search history. Search history order is no longer randomized. When focusing the search box with a keyboard shortcut, select all text. If you press Enter in the search box, the search is executed and the book list is automatically focused."

    - title: "Driver for the Samsung Fascinate and PocketBook 902"

    - title: "FB2 Output: Add option to create FB2 sections based on the internal file structure of the input file (useful for EPUB files that have been split on chapter boundaries). Also add options to mark h1/h2/h3 tags as section titles in the FB2 file."
      tickets: [7738]

    - title: "Metadata jacket: Add publisher information to jacket."

    - title: "Catalog generation: Allow use of custom columns as well as tags to indicate read books. Note that your previously saved read books setting will be lost."

    - title: "Bulk metadata edit dialog: Add an Apply button to allow you to perform multiple operations in sequence"

    - title: "Allow drag and drop of books onto user categories. If you drag a book from a particular column (say authors) and drop it onto a user category, the column value will be added to the user category. So for authors, the authors will be added to the user category."

    - title: "Check Library can now check and repair the has_cover cache"

    - title: "Allow GUI plugins to be distributed in ZIP files. See http://www.mobileread.com/forums/showthread.php?t=108774"

    - title: "Allow searching by the number of tags/authors/formats/etc. See the User Manual for details."

    - title: "Tiny speedup when loading large libraries; also make various metadata editing tasks a little faster by reducing the number of times the Tag Browser is refreshed"

  bug fixes:
    - title: "E-book viewer: Fix broken backwards searching"

    - title: "Fix custom ratings column values being displayed incorrectly in the book details area"
      tickets: [7740]

    - title: "Fix book details dialog not using the internal viewer to view ebooks"
      tickets: [7424]

    - title: "MOBI Output: When the input document does not explicitly specify a size for images, set the size to be the natural size of the image. This works around Amazon's *truly wonderful* MOBI renderer's tendency to expand images that do not have a width and height specified."

    - title: "Conversion pipeline: Fix bug that caused height/width specified in %/em of screen size to be incorrectly scaled by a factor of 72/DPI"

    - title: "Conversion pipeline: Respect max-width and max-height when calculating the effective size of an element"

    - title: "Conversion pipeline: Do not override CSS for images with the value of the img width/height attributes, unless no CSS is specified for the image"

    - title: "E-book viewer: Resize automatically to fit on smaller screens"

    - title: "Use the same MIME database on all platforms that calibre runs on; works around python 2.7's crazy insistence on reading MIME data from the registry"

    - title: "Kobo driver: Allow html, txt and rtf documents to be deleted"

    - title: "Always overwrite title/author metadata when downloading metadata for books added by ISBN"

    - title: "Nook Color profile: Reduce screen height to 900px"

    - title: "Fix regression that broke RTF conversion on some linux systems"

    - title: "Fix bug that could break searching after copying and deleting a book from the current library"
      tickets: [7459]

  improved recipes:
    - NZZ
    - Frankfurter Rundschau
    - JiJi Press
    - Revista Muy Interesante

  new recipes:
    - title: "Global Times"
      author: "malfi"

    - title: "The Philosopher's Magazine"
      author: "Darko Miletic"

    - title: "Poughkeepsie Journal"
      author: "weebl"

    - title: "Business Spectator and ABC Australia"
      author: "Dean Cording"

    - title: "La Rioja and NacionRed"
      author: "Arturo Martinez Nieves"

    - title: "Animal Politico"
      author: "leamsi"


- version: 0.7.31
  date: 2010-11-27

  bug fixes:
    - title: "Fix various regressions in the calibre windows build caused by the switch to python 2.7. If you are on windows and upgraded to 0.7.30, it is highly recommended that you upgrade to 0.7.31. If you are not on windows, you can ignore 0.7.31"
      tickets: [7685, 7694, 7691]


- version: 0.7.30
  date: 2010-11-26

  new features:
    - title: "Support for the Acer Lumiread and PocketBook Pro 602"

    - title: "When importing by ISBN, also allow the specification of a file to be imported."
      tickets: [7400]

    - title: "E-mail sending: Email sends are now regular jobs that can be accessed from the jobs list. Also, when sending via gmail/hotmail, send at most one email every five minutes to avoid triggering their spam controls. Failed sends are now retried one more time, automatically."

    - title: "Content server: When a category contains only one item, go directly to the book list instead of forcing the user to click on that one item"

    - title: "E-mail sending: Allow unencrypted connections to SMTP relay"

    - title: "Improve startup times for large libraries by caching the has_cover check"

    - title: "Update windows binary build to use python 2.7"

    - title: "Metadata and cover download plugins from Nicebooks (disabled by default)"

  bug fixes:
    - title: "MOBI Input: Fix bug in cleanup regex that broke parsing of escaped XML declarations."
      tickets: [7585]

    - title: "Content server: Fix bug when the user has custom categories/columns with non-ascii names"
      tickets: [7590]

    - title: "RTF Output: Handle non-breaking spaces correctly"
      tickets: [7668]

    - title: "Conversion pipeline: When rasterizing SVG images, work around incorrect handling of percentage height specifications in QSvgRenderer."
      tickets: [7598]

    - title: "News download: Update version of feedparser used to parse RSS feeds."
      tickets: [7674]

    - title: "Tag Browser: Allow the user to restore hidden categories with a right click, even if all categories have been hidden"

    - title: "TXT/RTF Output: Handle XML processing instructions embedded in content correctly."
      tickets: [7644]

    - title: "MOBI Input: Workarounds for lack of nesting rules between block and inline tags"
      tickets: [7618]

    - title: "E-book viewer: Load all hyphenation patterns to support multi-lingual books"

    - title: "E-book viewer: Fix incorrect language names being used in hyphenation"

    - title: "Check that the result file from a conversion is not empty before adding it; protects against the case where the conversion process crashes and the GUI adds a zero byte file to the book record"

    - title: "E-book viewer: More sophisticated algorithm to resize images to fit the viewer window. Should preserve aspect ratio in more cases"

    - title: "Remove unnecessary calls to set_path when creating book records. Speeds up record creation by about 30% on my system"

    - title: "Speedup for bibtex catalog generation."

    - title: "Kobo driver: Fix missing table in the book deletion process for Kobo WiFi and Kobo-O 1.8 Beta"

    - title: "RTF Input: Preserve scene breaks in the form of empty paragraphs. Preprocessing: Improvements to chapter detection"

    - title: "Fix custom recipes not being sorted by title"
      tickets: [7486]

    - title: "Kobo driver: Fix bug in managing the Im_Reading category on windows"

  improved recipes:
    - "El Pais - Uruguay"
    - Argentinian La Nacion
    - comics.com
    - Mingpao
    - Revista Muy Interesante
    - Telepolis
    - New York Times

  new recipes:
    - title: "Bangkok Biz News and Matichon"
      author: "Anat Ruangrassamee"

    - title: "The Workingham Times and Deutsche Welle"
      author: "Darko Miletic"

    - title: "Biz Portal"
      author: "marbs"

    - title: "Various Japanese news sources"
      author: "Hiroshi Miura"

    - title: "Arcamax"
      author: "Starson17"

    - title: "Various Spanish news sources"
      author: "Gustavo Azambuja"

    - title: "TSN"
      author: Nexus

    - title: "Zeit Online Premium"
      author: Steffen Siebert


- version: 0.7.29
  date: 2010-11-19

  new features:
    - title: "OSX binary build is now based on Qt 4.7. Also, the build is now Intel only and requires at least OS X 10.5.2. If you are on a PowerPC machine or an older OS X version, do not upgrade"

    - title: "Content server: Allow direct navigation to a set of books in the book list."
      tickets: [7453]

    - title: "OS X: When deleting books, put the files into the recycle bin instead of deleting them permanently"

    - title: "Add button to easily configure Hotmail as email relay. Also improve usability of the easy config buttons"

    - title: "Kobo driver: Support Currently_Reading category"

    - title: "Catalog generation: Thumbnail caching, wishlist, improved description layout."
      tickets: [7376]

    - title: "Support for the Cybook Orizon"

  bug fixes:
    - title: "Fix restore-to-defaults in preferences incorrectly setting the PDF unwrap factor to 0.0"

    - title: "PDF Input: Fix unwrapping of accented characters"

    - title: "Do not display dialogs asking for confirmation or showing conversion errors when calibre is minimized to the system tray"
      tickets: [7549]

    - title: "calibre server: Fix regression that broke digest authentication when the calibre interface language was set to non-English"

    - title: "EPUB Output: Do not raise an error for invalid embedded fonts in the input document."
      tickets: [7567]

    - title: "RTF Input: Improved conversion of tables, with support for border styles on table cells"

    - title: "E-book viewer: Fix regression that broke hyphenation. Also add more language patterns for hyphenation"

    - title: "SONY driver: Fix cover thumbnails being uploaded to the wrong directory on windows"

    - title: "Fix UnicodeDecodeError when displaying a failed metadata fetch message"
      tickets: [7560]

    - title: "Bulk metadata edit: Speed up the remove-all-tags operation"

    - title: "MOBI Output: Specify image sizes in pixels instead of em to accommodate Amazon's @#$%#@! MOBI renderer"

    - title: "Fix bug preventing customizing of builtin recipes if they are not ascii encoded"

    - title: "SONY XML cache: Gracefully handle the case where the XML db contains a reference to a file that does not exist"

  improved recipes:
    - Al Jazeera
    - The Moscow Times
    - Globe and Mail
    - Washington Post

  new recipes:
    - title: "Hannoversche Allgemeine Zeitung"
      author: "Artemis"

    - title: "globes.co.il"
      author: "marbs"

    - title: "THN and RDS"
      author: "Nexus"

    - title: "pclab.pl"
      author: "ravcio"

    - title: "Now Toronto"
      author: "Starson17"

    - title: "Press releases of the German government and EU Commission"
      author: "malfi"


- version: 0.7.28
  date: 2010-11-12
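(Aside: for the "count function in the template language" item in the 0.7.32 entry above, a minimal illustration — assuming the count(value, separator) syntax from the calibre template-language documentation, applied to a comma-separated field — is a column template such as {tags:count(,)}, which would display the number of tags on a book rather than the tags themselves.)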
@@ -12,8 +12,8 @@ p.title {
p.author {
    margin-top:0em;
    margin-bottom:0em;
    text-align: left;
    text-indent: 1em;
    text-align: center;
    text-indent: 0em;
    font-size:large;
}

@@ -27,17 +27,28 @@ p.author_index {
}

p.tags {
    margin-top:0em;
    margin-top:0.5em;
    margin-bottom:0em;
    text-align: left;
    text-indent: 1em;
    font-size:small;
    text-indent: 0.0in;
}

p.description {
    text-align:left;
    font-style:normal;
    margin-top: 0em;
p.formats {
    font-size:90%;
    margin-top:0em;
    margin-bottom:0.5em;
    text-align: left;
    text-indent: 0.0in;
}

div.description > p:first-child {
    margin: 0 0 0 0;
    text-indent: 0em;
}

div.description {
    margin: 0 0 0 0;
    text-indent: 1em;
}

p.date_index {

@@ -121,5 +132,5 @@ td.rating {
    text-align: center;
}
td.thumbnail img {
    -webkit-box-shadow: 6px 6px 6px #888;
    -webkit-box-shadow: 4px 4px 12px #999;
}
@@ -355,6 +355,25 @@ h2.library_name {
    color: red;
}

#booklist > #pagelist { display: none; }

#goto_page_dialog ul {
    list-style-type: none;
    font-size: medium;
}

#goto_page_dialog li {
    margin-bottom: 1.5ex;
}

#goto_page_dialog a {
    text-decoration: none;
    color: blue;
}

#goto_page_dialog a:hover {
    color: red;
}

#booklist .left .ui-button-text {
    font-size: medium;

@@ -96,5 +96,6 @@
      </div>
    </div>
    <div id="book_details_dialog"></div>
    <div id="goto_page_dialog"></div>
  </body>
</html>
@@ -202,6 +202,23 @@ function previous_page() {
    else last_page();
}

function gp_internal(id) {
    var gp = $('#goto_page_dialog');
    gp.dialog('close');
    var elem = $("#booklist #" + id);
    load_page(elem);
}

function goto_page() {
    var gp = $('#goto_page_dialog');
    var pl = $('#booklist > #pagelist');
    gp.html(pl.html());
    gp.dialog('option', 'title', pl.attr('title'));
    gp.dialog('option', 'height', $(window).height() - 100);
    gp.dialog('open');
}

function load_page(elem) {
    if (elem.is(":visible")) return;
    var ld = elem.find('.load_data');

@@ -251,6 +268,12 @@ function booklist(hide_sort) {
        modal: true,
        show: 'slide'
    });
    $("#goto_page_dialog").dialog({
        autoOpen: false,
        modal: true,
        show: 'slide'
    });

    first_page();
}
@@ -38,6 +38,7 @@ Monocle.Browser.on = {
  iPad: navigator.userAgent.indexOf("iPad") != -1,
  BlackBerry: navigator.userAgent.indexOf("BlackBerry") != -1,
  Android: navigator.userAgent.indexOf('Android') != -1,
  MacOSX: navigator.userAgent.indexOf('Mac OS X') != -1,
  Kindle3: navigator.userAgent.match(/Kindle\/3/)
}

@@ -162,12 +163,23 @@ Monocle.Browser.has.transform3d = Monocle.Browser.CSSProps.isSupported([
  'OPerspective',
  'msPerspective'
]) && Monocle.Browser.CSSProps.supportsMediaQueryProperty('transform-3d');
Monocle.Browser.has.embedded = (top != self);

Monocle.Browser.has.iframeTouchBug = Monocle.Browser.iOSVersionBelow("4.2");

Monocle.Browser.has.selectThruBug = Monocle.Browser.iOSVersionBelow("4.2");

Monocle.Browser.has.mustScrollSheaf = Monocle.Browser.is.MobileSafari;
Monocle.Browser.has.iframeDoubleWidthBug = Monocle.Browser.has.mustScrollSheaf;

Monocle.Browser.has.floatColumnBug = Monocle.Browser.is.WebKit;

Monocle.Browser.has.relativeIframeWidthBug = Monocle.Browser.on.Android;

Monocle.Browser.has.jumpFlickerBug =
  Monocle.Browser.on.MacOSX && Monocle.Browser.is.WebKit;

if (typeof window.console == "undefined") {
  window.console = {

@@ -1091,11 +1103,29 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
      cmpt.dom.setStyles(Monocle.Styles.component);
      Monocle.Styles.applyRules(cmpt.contentDocument.body, Monocle.Styles.body);
    }
    lockFrameWidths();
    dom.find('overlay').dom.setStyles(Monocle.Styles.overlay);
    dispatchEvent('monocle:styles');
  }

  function lockingFrameWidths() {
    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
      cmpt.style.display = "none";
    }
  }

  function lockFrameWidths() {
    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
      cmpt.style.width = cmpt.parentNode.offsetWidth+"px";
      cmpt.style.display = "block";
    }
  }

  function setBook(bk, place, callback) {
    p.book = bk;
    var pageCount = 0;

@@ -1121,12 +1151,14 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
    if (!p.initialized) {
      console.warn('Attempt to resize book before initialization.');
    }
    lockingFrameWidths();
    if (!dispatchEvent("monocle:resizing", {}, true)) {
      return;
    }
    clearTimeout(p.resizeTimer);
    p.resizeTimer = setTimeout(
      function () {
        lockFrameWidths();
        p.flipper.moveTo({ page: pageNumber() });
        dispatchEvent("monocle:resize");
      },

@@ -1765,12 +1797,7 @@ Monocle.Book = function (dataSource) {
  function componentIdMatching(str) {
    for (var i = 0; i < p.componentIds.length; ++i) {
      if (str.indexOf(p.componentIds[i]) > -1) {
        return p.componentIds[i];
      }
    }
    return null;
    return p.componentIds.indexOf(str) >= 0 ? str : null;
  }

@@ -2018,6 +2045,12 @@ Monocle.Component = function (book, id, index, chapters, source) {
  function loadFrameFromURL(url, frame, callback) {
    if (!url.match(/^\//)) {
      var link = document.createElement('a');
      link.setAttribute('href', url);
      url = link.href;
      delete(link);
    }
    frame.onload = function () {
      frame.onload = null;
      Monocle.defer(callback);

@@ -2460,7 +2493,7 @@ Monocle.Flippers.Legacy = function (reader) {
  function moveTo(locus, callback) {
    var fn = frameToLocus;
    if (typeof callback == "function") {
      fn = function () { frameToLocus(); callback(); }
      fn = function (locus) { frameToLocus(locus); callback(locus); }
    }
    p.reader.getBook().setOrLoadPageAt(page(), locus, fn);
  }

@@ -2794,7 +2827,9 @@ Monocle.Dimensions.Columns = function (pageDiv) {
  function scrollerWidth() {
    var bdy = p.page.m.activeFrame.contentDocument.body;
    if (Monocle.Browser.has.iframeDoubleWidthBug) {
      if (Monocle.Browser.iOSVersion < "4.1") {
      if (Monocle.Browser.on.Android) {
        return bdy.scrollWidth * 1.5; // I actually have no idea why 1.5.
      } else if (Monocle.Browser.iOSVersion < "4.1") {
        var hbw = bdy.scrollWidth / 2;
        var sew = scrollerElement().scrollWidth;
        return Math.max(sew, hbw);

@@ -2969,6 +3004,7 @@ Monocle.Flippers.Slider = function (reader) {
  function setPage(pageDiv, locus, callback) {
    ensureWaitControl();
    p.reader.getBook().setOrLoadPageAt(
      pageDiv,
      locus,

@@ -3048,6 +3084,7 @@ Monocle.Flippers.Slider = function (reader) {
    checkPoint(boxPointX);

    p.turnData.releasing = true;
    showWaitControl(lowerPage());

    if (dir == k.FORWARDS) {
      if (

@@ -3088,14 +3125,18 @@ Monocle.Flippers.Slider = function (reader) {
  function onGoingBackward(x) {
    var lp = lowerPage();
    var lp = lowerPage(), up = upperPage();
    showWaitControl(up);
    jumpOut(lp, // move lower page off-screen
      function () {
        flipPages(); // flip lower to upper
        setPage( // set upper page to previous
          lp,
          getPlace(lowerPage()).getLocus({ direction: k.BACKWARDS }),
          function () { lifted(x); }
          function () {
            lifted(x);
            hideWaitControl(up);
          }
        );
      }
    );

@@ -3103,8 +3144,10 @@ Monocle.Flippers.Slider = function (reader) {
  function afterGoingForward() {
    var up = upperPage();
    var up = upperPage(), lp = lowerPage();
    if (p.interactive) {
      showWaitControl(up);
      showWaitControl(lp);
      setPage( // set upper (off screen) to current
        up,
        getPlace().getLocus({ direction: k.FORWARDS }),

@@ -3113,6 +3156,7 @@ Monocle.Flippers.Slider = function (reader) {
        }
      );
    } else {
      showWaitControl(lp);
      flipPages();
      jumpIn(up, function () { prepareNextPage(announceTurn); });
    }

@@ -3171,6 +3215,8 @@ Monocle.Flippers.Slider = function (reader) {
  function announceTurn() {
    hideWaitControl(upperPage());
    hideWaitControl(lowerPage());
    p.reader.dispatchEvent('monocle:turn');
    resetTurnData();
  }

@@ -3319,12 +3365,14 @@ Monocle.Flippers.Slider = function (reader) {
  function jumpIn(pageDiv, callback) {
    setX(pageDiv, 0, { duration: 1 }, callback);
    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
    setX(pageDiv, 0, { duration: dur }, callback);
  }

  function jumpOut(pageDiv, callback) {
    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: 1 }, callback);
    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: dur }, callback);
  }

@@ -3357,6 +3405,28 @@ Monocle.Flippers.Slider = function (reader) {
  }

  function ensureWaitControl() {
    if (p.waitControl) { return; }
    p.waitControl = {
      createControlElements: function (holder) {
        return holder.dom.make('div', 'flippers_slider_wait');
      }
    }
    p.reader.addControl(p.waitControl, 'page');
  }

  function showWaitControl(page) {
    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
    ctrl.style.opacity = 0.5;
  }

  function hideWaitControl(page) {
    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
    ctrl.style.opacity = 0;
  }

  API.pageCount = p.pageCount;
  API.addPage = addPage;
  API.getPlace = getPlace;
@@ -217,3 +217,15 @@ generate_cover_foot_font = None
# open_viewer, do_nothing, edit_cell. Default: open_viewer.
# Example: doubleclick_on_library_view = 'do_nothing'
doubleclick_on_library_view = 'open_viewer'

# Language to use when sorting. Setting this tweak will force sorting to use the
# collating order for the specified language. This might be useful if you run
# calibre in English but want sorting to work in the language where you live.
# Set the tweak to the desired ISO 639-1 language code, in lower case.
# You can find the list of supported locales at
# http://publib.boulder.ibm.com/infocenter/iseries/v5r3/topic/nls/rbagsicusortsequencetables.htm
# Default: locale_for_sorting = '' -- use the language calibre displays in
# Example: locale_for_sorting = 'fr' -- sort using French rules.
# Example: locale_for_sorting = 'nb' -- sort using Norwegian rules.
locale_for_sorting = ''
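To make the effect of locale_for_sorting concrete, here is a minimal standalone sketch of collation-aware sorting. This is not calibre code: it uses only Python 3's standard locale module, and 'fr_FR.UTF-8' is an assumed locale name that must be installed on the system.

    import locale

    # Naive codepoint order puts all uppercase letters before lowercase.
    words = ['cote', 'Cote', 'dome', 'dame']
    print(sorted(words))

    # Keys produced by strxfrm follow the locale's collating order, which is
    # what a tweak like locale_for_sorting = 'fr' asks for.
    locale.setlocale(locale.LC_COLLATE, 'fr_FR.UTF-8')  # assumed installed
    print(sorted(words, key=locale.strxfrm))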
BIN resources/images/news/cnetjapan.png (new file, 892 B)
BIN resources/images/news/cnetjapan_digital.png (new file, 892 B)
BIN resources/images/news/cnetjapan_release.png (new file, 892 B)
BIN resources/images/news/deutsche_welle_bs.png (new file, 445 B)
BIN resources/images/news/deutsche_welle_en.png (new file, 445 B)
BIN resources/images/news/deutsche_welle_es.png (new file, 445 B)
BIN resources/images/news/deutsche_welle_hr.png (new file, 445 B)
BIN resources/images/news/deutsche_welle_pt.png (new file, 445 B)
BIN resources/images/news/deutsche_welle_sr.png (new file, 445 B)
BIN resources/images/news/endgadget_ja.png (new file, 698 B)
BIN resources/images/news/jijinews.png (new file, 919 B)
BIN resources/images/news/mainichi.png (new file, 953 B)
BIN resources/images/news/mainichi_it_news.png (new file, 953 B)
BIN resources/images/news/moscow_times.png (new file, 1.0 KiB)
BIN resources/images/news/msnsankei.png (new file, 543 B)
BIN resources/images/news/nikkei_free.png (new file, 948 B)
BIN resources/images/news/nikkei_sub.png (new file, 948 B)
BIN resources/images/news/nikkei_sub_economy.png (new file, 948 B)
BIN resources/images/news/nikkei_sub_industry.png (new file, 948 B)
BIN resources/images/news/nikkei_sub_life.png (new file, 948 B)
BIN resources/images/news/nikkei_sub_main.png (new file, 948 B)
BIN resources/images/news/nikkei_sub_sports.png (new file, 948 B)
BIN resources/images/news/novaya_gazeta.png (new file, 610 B)
BIN resources/images/news/reuters.png (new file, 693 B)
BIN resources/images/news/reuters_ja.png (new file, 693 B)
BIN resources/images/news/the_workingham_times.png (new file, 1011 B)
BIN resources/images/news/tpm_uk.png (new file, 873 B)
BIN resources/images/news/vedomosti.png (new file, 693 B)
BIN resources/images/news/yomiuri.png (new file, 660 B)
@@ -36,22 +36,37 @@
/*
** Title
*/
.cbj_title {
table.cbj_header td.cbj_title {
    font-size: x-large;
    font-style: italic;
    text-align: center;
}

/*
** Series
*/
table.cbj_header td.cbj_series {
    font-size: medium;
    text-align: center;
}

/*
** Author
*/
.cbj_author {
table.cbj_header td.cbj_author {
    font-size: medium;
    text-align: center;
    margin-bottom: 1ex;
}

/*
** Table containing Series, Publication Year, Rating and Tags
** Publisher/published
*/
table.cbj_header td.cbj_pubdata {
    text-align: center;
}

/*
** Table containing Rating and Tags
*/
table.cbj_header {
    width: 100%;

@@ -62,9 +77,8 @@ table.cbj_header {
*/
table.cbj_header td.cbj_label {
    font-family: sans-serif;
    font-weight: bold;
    text-align: right;
    width: 40%;
    width: 33%;
}

/*

@@ -73,9 +87,23 @@ table.cbj_header td.cbj_label {
table.cbj_header td.cbj_content {
    font-family: sans-serif;
    text-align: left;
    width:60%;
    width:67%;
}

/*
** Metadata divider
*/
hr.metadata_divider {
    width:90%;
    margin-left:5%;
    border-top: solid white 0px;
    border-right: solid white 0px;
    border-bottom: solid black 1px;
    border-left: solid white 0px;
}

/*
** To skip a banner item (Series|Published|Rating|Tags),
** edit the appropriate CSS rule below.

@@ -6,17 +6,24 @@
  </head>
  <body>
    <div class="cbj_banner">
      <div class="cbj_title">{title}</div>
      <div class="cbj_author">{author}</div>
      <table class="cbj_header">
        <tr class="cbj_series">
          <td class="cbj_label">{series_label}:</td>
          <td class="cbj_content">{series}</td>
        <tr>
          <td class="cbj_title" colspan="2">{title}</td>
        </tr>
        <tr>
          <td class="cbj_series" colspan="2">{series}</td>
        </tr>
        <tr class="cbj_pubdate">
          <td class="cbj_label">{pubdate_label}:</td>
          <td class="cbj_content">{pubdate}</td>
        <tr>
          <td class="cbj_author" colspan="2">{author}</td>
        </tr>
        <tr>
          <td class="cbj_pubdata" colspan="2">{publisher} ({pubdate})</td>
        </tr>

        <tr>
          <td class="cbj_author" colspan="2"><hr class="metadata_divider" /></td>
        </tr>

        <tr class="cbj_rating">
          <td class="cbj_label">{rating_label}:</td>
          <td class="cbj_content">{rating}</td>
resources/mime.types (new file, 1381 lines)
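The bundled mime.types above backs the 0.7.32 changelog item about using the same MIME database on all platforms. As a hedged sketch of the general technique (standard library only; the path and the .epub mapping are assumptions about this particular file, not calibre's actual loading code):

    import mimetypes

    # Build a MIME database purely from a bundled Apache-style mime.types,
    # bypassing the platform defaults (and, on python 2.7/windows, the registry).
    db = mimetypes.MimeTypes(filenames=['resources/mime.types'])
    print(db.guess_type('book.epub'))  # e.g. ('application/epub+zip', None)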
resources/recipes/180.recipe (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
180.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = '180.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(name='div', attrs={'class':'tef-md tef-md-seccion-sociedad'})]
    remove_tags = [
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Titulares', u'http://www.180.com.uy/feed.php')
    ]

    def get_cover_url(self):
        return 'http://www.180.com.uy/tplef/img/logo.gif'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
resources/recipes/abc_au.recipe (new file, 54 lines)
@@ -0,0 +1,54 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Dean Cording'
'''
abc.net.au/news
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class ABCNews(BasicNewsRecipe):
    title = 'ABC News'
    __author__ = 'Dean Cording'
    description = 'News from Australia'
    masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
    cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'

    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = False
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'ABC News'
    category = 'News, Australia, World'
    language = 'en_AU'
    publication_type = 'newsportal'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
        ,'linearize_tables': False
    }

    keep_only_tags = dict(id='article')

    remove_tags = [dict(attrs={'class':['related', 'tags']}),
                   dict(id='statepromo')
                  ]

    remove_attributes = ['width','height']

    feeds = [
        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
    ]
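Each preprocess_regexps entry in the recipe above is a (compiled pattern, replacement function) pair applied to the raw HTML before parsing. A quick standalone check of the comment-stripping expression, on a made-up sample:

    import re

    # DOTALL lets the pattern span multi-line comments; the non-greedy .*?
    # stops at the first closing marker.
    pat = re.compile(r'<!--.*?-->', re.DOTALL)
    html = '<p>keep this<!-- but drop\nthis --> and this</p>'
    print(pat.sub('', html))  # -> <p>keep this and this</p>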
@@ -1,10 +1,8 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

'''
aljazeera.net
english.aljazeera.net
'''
from calibre.web.feeds.news import BasicNewsRecipe

@@ -12,41 +10,59 @@ class AlJazeera(BasicNewsRecipe):
    title = 'Al Jazeera in English'
    __author__ = 'Darko Miletic'
    description = 'News from Middle East'
    language = 'en'

    language = 'en'
    publisher = 'Al Jazeera'
    category = 'news, politics, middle east'
    simultaneous_downloads = 1
    delay = 4
    oldest_article = 1
    delay = 1
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'iso-8859-1'
    remove_javascript = True
    use_embedded_content = False
    extra_css = """
        body{font-family: Arial,sans-serif}
        #ctl00_cphBody_dvSummary{font-weight: bold}
        #dvArticleDate{font-size: small; color: #999999}
    """
    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    html2lrf_options = [
        '--comment', description
        , '--category', category
        , '--publisher', publisher
        , '--ignore-tables'
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True'

    keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})]
    keep_only_tags = [
        dict(attrs={'id':['DetailedTitle','ctl00_cphBody_dvSummary','dvArticleDate']})
        ,dict(name='td',attrs={'class':'DetailedSummary'})
    ]

    remove_tags = [
        dict(name=['object','link'])
        dict(name=['object','link','table','meta','base','iframe','embed'])
        ,dict(name='td', attrs={'class':['MostActiveDescHeader','MostActiveDescBody']})
    ]

    feeds = [(u'AL JAZEERA ENGLISH (AJE)', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989' )]

    def get_article_url(self, article):
        artlurl = article.get('link', None)
        return artlurl.replace('http://english.aljazeera.net//','http://english.aljazeera.net/')

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(face=True):
            del item['face']
        td = soup.find('td',attrs={'class':'DetailedSummary'})
        if td:
            td.name = 'div'
        spn = soup.find('span',attrs={'id':'DetailedTitle'})
        if spn:
            spn.name='h1'
        for itm in soup.findAll('span', attrs={'id':['dvArticleDate','ctl00_cphBody_lblDate']}):
            itm.name = 'div'
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
resources/recipes/animal_politico.recipe (new file, 111 lines)
@@ -0,0 +1,111 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290663986(BasicNewsRecipe):
    title = u'Animal Pol\u00EDtico'
    publisher = u'Animal Pol\u00EDtico'
    category = u'News, Mexico'
    description = u'Noticias Pol\u00EDticas'
    __author__ = 'leamsi'
    masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png'
    oldest_article = 1
    max_articles_per_feed = 100
    language = 'es'

    #feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')]

    remove_tags_before = dict(name='div', id='main')
    remove_tags = [dict(name='div', attrs={'class':'fb-like-button'})]
    keep_only_tags = [dict(name='h1', attrs={'class':'entry-title'}),
                      dict(name='div', attrs={'class':'entry-content'})]
    remove_javascript = True
    INDEX = 'http://www.animalpolitico.com/'

    def generic_parse(self, soup):
        articles = []
        for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'):
            article_url  = entry.a['href'] + '?print=yes'
            article_title= entry.find('h3', 'entry-title')
            article_title= self.tag_to_string(article_title)
            article_date = entry.find('span', 'the-time')
            article_date = self.tag_to_string(article_date)
            article_desc = self.tag_to_string(entry.find('p'))

            #print 'Article:',article_title, article_date,article_url
            #print entry['class']

            articles.append({'title' : article_title,
                             'date' : article_date,
                             'description' : article_desc,
                             'url' : article_url})
            # Avoid including the multimedia stuff.
            if entry['class'].find('last') != -1:
                break

        return articles

    def plumaje_parse(self, soup):
        articles = []
        blogs_soup = soup.find(lambda tag: tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1)
        for entry in blogs_soup.findAll('li'):
            article_title = entry.p
            article_url   = article_title.a['href'] + '?print=yes'
            article_date  = article_title.nextSibling
            article_title = self.tag_to_string(article_title)
            article_date  = self.tag_to_string(article_date).replace(u'Last Updated: ', '')
            article_desc  = self.tag_to_string(entry.find('h4'))

            #print 'Article:',article_title, article_date,article_url
            articles.append({'title' : article_title,
                             'date' : article_date,
                             'description' : article_desc,
                             'url' : article_url})

        return articles

    def boca_parse(self, soup):
        articles = []
        for entry in soup.findAll(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'):
            article_title= entry.find('h2', 'entry-title')
            article_url  = article_title.a['href'] + '?print=yes'
            article_title= self.tag_to_string(article_title)
            article_date = entry.find('span', 'entry-date')
            article_date = self.tag_to_string(article_date)
            article_desc = self.tag_to_string(entry.find('div', 'entry-content'))

            #print 'Article:',article_title, article_date,article_url
            #print entry['class']

            articles.append({'title' : article_title,
                             'date' : article_date,
                             'description' : article_desc,
                             'url' : article_url})
            # Avoid including the multimedia stuff.
            if entry['class'].find('last') != -1:
                break

        return articles

    def parse_index(self):
        gobierno_soup = self.index_to_soup(self.INDEX+'gobierno/')
        congreso_soup = self.index_to_soup(self.INDEX+'congreso/')
        seguridad_soup = self.index_to_soup(self.INDEX+'seguridad/')
        comunidad_soup = self.index_to_soup(self.INDEX+'comunidad/')
        plumaje_soup = self.index_to_soup(self.INDEX+'plumaje/')
        la_boca_del_lobo_soup = self.index_to_soup(self.INDEX+'category/la-boca-del-lobo/')

        gobierno_articles = self.generic_parse(gobierno_soup)
        congreso_articles = self.generic_parse(congreso_soup)
        seguridad_articles = self.generic_parse(seguridad_soup)
        comunidad_articles = self.generic_parse(comunidad_soup)
        plumaje_articles = self.plumaje_parse(plumaje_soup)
        la_boca_del_lobo_articles = self.boca_parse(la_boca_del_lobo_soup)

        return [ (u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles),
                 (u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ]
resources/recipes/arcamax.recipe (new file, 110 lines)
@@ -0,0 +1,110 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Arcamax(BasicNewsRecipe):
    title = 'Arcamax'
    __author__ = 'Starson17'
    __version__ = '1.03'
    __date__ = '25 November 2010'
    description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category = 'news, comics'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    num_comics_to_get = 7
    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS

    conversion_options = {'linearize_tables' : True
                          , 'comment' : description
                          , 'tags' : category
                          , 'language' : language
                         }

    keep_only_tags = [dict(name='div', attrs={'class':['toon']}),
                     ]

    def parse_index(self):
        feeds = []
        for title, url in [
            ######## COMICS - GENERAL ########
            #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
            #(u"Agnes", u"http://www.arcamax.com/agnes"),
            #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
            (u"BC", u"http://www.arcamax.com/bc"),
            #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
            #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
            (u"Blondie", u"http://www.arcamax.com/blondie"),
            #(u"Boondocks", u"http://www.arcamax.com/boondocks"),
            #(u"Cathy", u"http://www.arcamax.com/cathy"),
            #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
            (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
            (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
            #(u"Dustin", u"http://www.arcamax.com/dustin"),
            (u"Family Circus", u"http://www.arcamax.com/familycircus"),
            (u"Garfield", u"http://www.arcamax.com/garfield"),
            #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
            #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
            #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
            #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
            #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
            #(u"Luann", u"http://www.arcamax.com/luann"),
            #(u"Momma", u"http://www.arcamax.com/momma"),
            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
            (u"Mutts", u"http://www.arcamax.com/mutts"),
            #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
            #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
            #(u"Pickles", u"http://www.arcamax.com/pickles"),
            #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
            #(u"Rubes", u"http://www.arcamax.com/rubes"),
            #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
            (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
            (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            (u"Zits", u"http://www.arcamax.com/zits"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        pages = range(1, self.num_comics_to_get+1)
        for page in pages:
            page_soup = self.index_to_soup(url)
            if page_soup:
                title = page_soup.find(name='div', attrs={'class':'toon'}).p.img['alt']
                page_url = url
                prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'next'}, text='Previous').parent['href']
                current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
                url = prev_page_url
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        main_comic = soup.find('p',attrs={'class':'m0'})
        if main_comic.a['target'] == '_blank':
            main_comic.a.img['id'] = 'main_comic'
        return soup

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        img#main_comic {max-width:100%; min-width:100%;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
@@ -13,6 +13,7 @@ class Dnevnik(BasicNewsRecipe):
    labguage = 'sl'
    no_stylesheets = True
    use_embedded_content = False
    language = 'sl'

    conversion_options = {'linearize_tables' : True}
resources/recipes/bangkok_biz.recipe (new file, 25 lines)
@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290689337(BasicNewsRecipe):
    __author__ = 'Anat R.'
    language = 'th'
    title = u'Bangkok Biz News'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    feeds = [(u'Headlines',
              u'http://www.bangkokbiznews.com/home/services/rss/home.xml'),
             (u'Politics', u'http://www.bangkokbiznews.com/home/services/rss/politics.xml'),
             (u'Business', u'http://www.bangkokbiznews.com/home/services/rss/business.xml'),
             (u'Finance', u' http://www.bangkokbiznews.com/home/services/rss/finance.xml'),
             (u'Technology', u' http://www.bangkokbiznews.com/home/services/rss/it.xml')]
    remove_tags_before = dict(name='div', attrs={'class':'box-Detailcontent'})
    remove_tags_after = dict(name='p', attrs={'class':'allTags'})
    remove_tags = []
    remove_tags.append(dict(name = 'div', attrs = {'id': 'content-tools'}))
    remove_tags.append(dict(name = 'p', attrs = {'class':'allTags'}))
    remove_tags.append(dict(name = 'div', attrs = {'id':'morePic'}))
    remove_tags.append(dict(name = 'ul', attrs = {'class':'tabs-nav'}))
resources/recipes/bitacora.recipe (new file, 58 lines)
@@ -0,0 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
bitacora.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'bitacora.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'iso-8859-1'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['txt'])]
    remove_tags = [
        dict(name='div', attrs={'class':'tablafoot'}),
        dict(name=['object','h4']),
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Titulares', u'http://www.bitacora.com.uy/anxml.cgi?15')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.bitacora.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('img',attrs={'class':'imgtapa'})
        if link_item:
            cover_url = "http://www.bitacora.com.uy/"+link_item['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
resources/recipes/biz_portal.recipe (new file, 40 lines)
@@ -0,0 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'This is a recipe of BizPortal.co.il.'
    cover_url = 'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'
    title = u'BizPortal'
    language = 'he'
    __author__ = 'marbs'
    extra_css = 'img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100
    remove_attributes = ['width']
    simultaneous_downloads = 5
    # keep_only_tags = dict(name='div', attrs={'id':'articleContainer'})
    remove_tags = [dict(name='img', attrs={'scr':['images/bizlogo_nl.gif']})]
    max_articles_per_feed = 100
    #preprocess_regexps = [
    #    (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: '')
    #    ]

    feeds = [(u'חדשות שוק ההון', u'http://www.bizportal.co.il/shukhahon/messRssUTF2.xml'),
             (u'חדשות וול סטריט בעברית', u'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'),
             (u'שיווק ופרסום', u'http://www.bizportal.co.il/shukhahon/messRssUTF145.xml'),
             (u'משפט', u'http://www.bizportal.co.il/shukhahon/messRssUTF3.xml'),
             (u'ניתוח טכני', u'http://www.bizportal.co.il/shukhahon/messRssUTF5.xml'),
             (u'דיני עבודה ושכר', u'http://www.bizportal.co.il/shukhahon/messRssUTF6.xml'),
             (u'מיסוי', u'http://www.bizportal.co.il/shukhahon/messRssUTF7.xml'),
             (u'טאבו', u'http://www.bizportal.co.il/shukhahon/messRssUTF8.xml'),
             (u'נדל"ן', u'http://www.bizportal.co.il/shukhahon/messRssUTF160.xml'),
            ]

    def print_version(self, url):
        split1 = url.split("=")
        print_url = 'http://www.bizportal.co.il/web/webnew/shukhahon/biznews02print.shtml?mid=' + split1[1]
        return print_url
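print_version above maps an article URL to its printable variant by reusing everything after the first '=' (the mid parameter). A standalone check with a hypothetical article URL of the expected shape:

    # Hypothetical URL; only its ?mid=<id> shape matters here.
    url = 'http://www.bizportal.co.il/web/webnew/shukhahon/biznews02.shtml?mid=249246'
    split1 = url.split("=")
    print('http://www.bizportal.co.il/web/webnew/shukhahon/biznews02print.shtml?mid=' + split1[1])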
@@ -1,18 +1,22 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 mode: python -*-

# Find the newest version of this recipe here:
# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe

__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
__version__ = '0.95'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__ = '0.96'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe


class BrandEins(BasicNewsRecipe):

    title = u'Brand Eins'
    title = u'brand eins'
    __author__ = 'Constantin Hofstetter'
    description = u'Wirtschaftsmagazin'
    publisher = 'brandeins.de'

@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de'
    publication_type = 'magazine'
    needs_subscription = 'optional'

    # 2 is the last full magazine (default)
    # 1 is the newest (but not full)
    # 3 is one before 2 etc.
    which_ausgabe = 2
    # This value can be set via the username field.
    default_issue = 2

    keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):

        return soup

    def get_cover(self, soup):
        cover_url = None
        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
        if cover_item:
            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
        return cover_url

    def parse_index(self):
        feeds = []

        archive = "http://www.brandeins.de/archiv.html"

        issue = self.default_issue
        if self.username:
            try:
                issue = int(self.username)
            except:
                pass

        soup = self.index_to_soup(archive)
        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
        url = pre_latest_issue.get('href', False)
        # Get the title for the magazine - build it out of the title of the cover - take the issue and year;
        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
        # Get month and year of the magazine issue - build it out of the title of the cover
        self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
        url = 'http://brandeins.de/'+url

        # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):

    def brand_eins_parse_latest_issue(self, url):
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

        titles_and_articles = []

@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
            current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
        titles_and_articles.append([chapter_title, current_articles])
        return titles_and_articles

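The diff above repurposes the recipe's username field as an issue selector: a plain integer typed into the login box overrides default_issue, and the archive link list is indexed from the end. A minimal sketch of that selection logic, assuming the issue links are ordered oldest to newest as on the archive page:

def pick_issue(issue_links, username, default_issue=2):
    # username, if it parses as an integer, says how far back to go:
    # 1 = newest (possibly incomplete), 2 = last full issue, 3 = one earlier.
    issue = default_issue
    if username:
        try:
            issue = int(username)
        except ValueError:
            pass
    return issue_links[len(issue_links) - issue]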
48
resources/recipes/business_spectator.recipe
Normal file
@@ -0,0 +1,48 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Dean Cording'
'''
www.businessspectator.com.au
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class BusinessSpectator(BasicNewsRecipe):
    title = 'Business Spectator'
    __author__ = 'Dean Cording'
    description = 'Australian business news & commentary delivered the way you want it.'
    masthead_url = 'http://www.businessspectator.com.au/bs.nsf/logo-business-spectator.gif'
    cover_url = masthead_url

    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'Business Spectator'
    category = 'News, Australia, Business'
    language = 'en_AU'
    publication_type = 'newsportal'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
        ,'linearize_tables': False
    }

    keep_only_tags = [dict(id='storyHeader'), dict(id='body-html')]

    remove_tags = [dict(attrs={'class':'hql'})]

    remove_attributes = ['width','height','style']

    feeds = [
        ('Top Stories', 'http://www.businessspectator.com.au/top-stories.rss'),
        ('Alan Kohler', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=spectators&cat=Alan%20Kohler'),
        ('Robert Gottliebsen', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=spectators&cat=Robert%20Gottliebsen'),
        ('Stephen Bartholomeusz', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=spectators&cat=Stephen%20Bartholomeusz'),
        ('Daily Dossier', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=kgb&cat=dossier'),
        ('Australia', 'http://www.businessspectator.com.au/bs.nsf/RSS?readform&type=region&cat=australia'),
    ]
@@ -11,7 +11,6 @@ class AdvancedUserRecipe1275798572(BasicNewsRecipe):
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en'
    masthead_url = 'http://www.cbc.ca/includes/gfx/cbcnews_logo_09.gif'
    cover_url = 'http://img692.imageshack.us/img692/2814/cbc.png'
    keep_only_tags = [dict(name='div', attrs={'id':['storyhead','storybody']})]
51
resources/recipes/cnetjapan.recipe
Normal file
@@ -0,0 +1,51 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class CNetJapan(BasicNewsRecipe):
    title = u'CNET Japan'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [(u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf'),
             (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
            ]
    language = 'ja'
    encoding = 'Shift_JIS'
    remove_javascript = True

    preprocess_regexps = [
        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
         lambda match: '</body>'),
        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
         lambda match: '<!-- removed -->'),
    ]

    remove_tags_before = dict(id="contents_l")
    remove_tags = [
        {'class':"social_bkm_share"},
        {'class':"social_bkm_print"},
        {'class':"block20 clearfix"},
        dict(name="div",attrs={'id':'bookreview'}),
        {'class':"tag_left_ttl"},
        {'class':"tag_right"}
    ]
    remove_tags_after = {'class':"block20"}

    def parse_feeds(self):

        feeds = BasicNewsRecipe.parse_feeds(self)

        for curfeed in feeds:
            delList = []
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds
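The parse_feeds override above (shared by all three CNET Japan recipes) is the standard way to drop unwanted entries after the feeds are fetched: collect the matching articles first, then remove them by index. The same filtering logic in isolation, as a sketch over a plain list of article dicts (the data shape here is illustrative):

import re

def drop_matching(articles, pattern=r'pheedo\.jp'):
    # Keep every article whose url does not match the ad/tracker pattern.
    return [a for a in articles if not re.search(pattern, a['url'])]

# Hypothetical usage:
# articles = [{'url': 'http://japan.cnet.com/news/1'},
#             {'url': 'http://pheedo.jp/click?x=1'}]
# drop_matching(articles)  -> only the cnet entry survives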
49
resources/recipes/cnetjapan_digital.recipe
Normal file
@@ -0,0 +1,49 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class CNetJapanDigital(BasicNewsRecipe):
    title = u'CNET Japan Digital'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [(u'CNet digital',u'http://feed.japan.cnet.com/rss/digital/index.rdf') ]
    language = 'ja'
    encoding = 'Shift_JIS'
    remove_javascript = True

    preprocess_regexps = [
        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
         lambda match: '</body>'),
        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
         lambda match: '<!-- removed -->'),
    ]

    remove_tags_before = dict(id="contents_l")
    remove_tags = [
        {'class':"social_bkm_share"},
        {'class':"social_bkm_print"},
        {'class':"block20 clearfix"},
        dict(name="div",attrs={'id':'bookreview'}),
        {'class':"tag_left_ttl"},
        {'class':"tag_right"}
    ]
    remove_tags_after = {'class':"block20"}

    def parse_feeds(self):

        feeds = BasicNewsRecipe.parse_feeds(self)

        for curfeed in feeds:
            delList = []
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds
48
resources/recipes/cnetjapan_release.recipe
Normal file
@@ -0,0 +1,48 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class CNetJapanRelease(BasicNewsRecipe):
    title = u'CNET Japan release'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [(u'CNet Release', u'http://feed.japan.cnet.com/rss/release/index.rdf') ]
    language = 'ja'
    encoding = 'Shift_JIS'
    remove_javascript = True

    preprocess_regexps = [
        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
         lambda match: '</body>'),
        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
         lambda match: '<!-- removed -->'),
    ]

    remove_tags_before = dict(id="contents_l")
    remove_tags = [
        {'class':"social_bkm_share"},
        {'class':"social_bkm_print"},
        {'class':"block20 clearfix"},
        dict(name="div",attrs={'id':'bookreview'}),
        {'class':"tag_left_ttl"}
    ]
    remove_tags_after = {'class':"block20"}

    def parse_feeds(self):

        feeds = BasicNewsRecipe.parse_feeds(self)

        for curfeed in feeds:
            delList = []
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds
@@ -347,6 +347,7 @@ class Comics(BasicNewsRecipe):
                title = strip_tag['title']
                print 'title: ', title
                current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
            current_articles.reverse()
            return current_articles

    extra_css = '''
69
resources/recipes/cosmopolitan.recipe
Normal file
@@ -0,0 +1,69 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.cosmohispano.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'Cosmopolitan'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Cosmopolitan, Edicion Espanola'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='td', attrs={'class':['contentheading', 'txt_articulo']})
    ]
    remove_tags = [
        dict(name='div', attrs={'class':['breadcrumb', 'bloque1', 'article', 'bajo_title', 'tags_articles', 'otrosenlaces_title', 'otrosenlaces_parent', 'compartir']}),
        dict(name='div', attrs={'id':'comment'}),
        dict(name='table', attrs={'class':'pagenav'}),
        dict(name=['object','link'])
    ]
    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Articulos', u'http://feeds.feedburner.com/cosmohispano/FSSt')
    ]

    def preprocess_html(self, soup):
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.cosmohispano.com/revista'
        soup = self.index_to_soup(index)
        link_item = soup.find('img',attrs={'class':'img_portada'})
        if link_item:
            cover_url = "http://www.cosmohispano.com"+link_item['src']
        return cover_url
76
resources/recipes/deutsche_welle_bs.recipe
Normal file
@@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_bs(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vijesti iz Njemacke i svijeta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'bs'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'           , u'http://rss.dw-world.de/rdf/rss-bos-pol')
        ,(u'Evropa'             , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
        ,(u'Kiosk'              , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
        ,(u'Ekonomija i Nauka'  , u'http://rss.dw-world.de/rdf/rss-bos-eco')
        ,(u'Kultura'            , u'http://rss.dw-world.de/rdf/rss-bos-cul')
        ,(u'Sport'              , u'http://rss.dw-world.de/rdf/rss-bos-sp' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
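All six Deutsche Welle variants in this sync share the same print_version trick: the article identifier is the last path segment of the article URL, so rpartition('/') is enough to rebuild the print URL. A standalone sketch of that transform (the sample URL shape is illustrative):

def dw_print_version(url):
    # rpartition returns (head, sep, tail); the tail is the last path segment.
    artl = url.rpartition('/')[2]
    return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

# Hypothetical usage:
# dw_print_version('http://www.dw-world.de/dw/article/0,,6291883,00.html')
# -> 'http://www.dw-world.de/popups/popup_printcontent/0,,6291883,00.html'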
66
resources/recipes/deutsche_welle_en.recipe
Normal file
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_en(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'News from Germany and World'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'en'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'All news', u'http://rss.dw-world.de/rdf/rss-en-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
66
resources/recipes/deutsche_welle_es.recipe
Normal file
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_es(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Alemania y mundo'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'es'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-sp-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
74
resources/recipes/deutsche_welle_hr.recipe
Normal file
@@ -0,0 +1,74 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_hr(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vesti iz Njemacke i svijeta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'hr'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Svijet'    , u'http://rss.dw-world.de/rdf/rss-cro-svijet')
        ,(u'Europa'    , u'http://rss.dw-world.de/rdf/rss-cro-eu' )
        ,(u'Njemacka'  , u'http://rss.dw-world.de/rdf/rss-cro-ger' )
        ,(u'Vijesti'   , u'http://rss.dw-world.de/rdf/rss-cro-all' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
66
resources/recipes/deutsche_welle_pt.recipe
Normal file
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_pt(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Alemania y mundo'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'pt'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-br-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
79
resources/recipes/deutsche_welle_sr.recipe
Normal file
@@ -0,0 +1,79 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_sr(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vesti iz Nemacke i sveta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'sr'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
    """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'              , u'http://rss.dw-world.de/rdf/rss-ser-pol' )
        ,(u'Srbija'                , u'http://rss.dw-world.de/rdf/rss-ser-pol-ser' )
        ,(u'Region'                , u'http://rss.dw-world.de/rdf/rss-ser-pol-region' )
        ,(u'Evropa'                , u'http://rss.dw-world.de/rdf/rss-ser-pol-eu' )
        ,(u'Nemacka'               , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger' )
        ,(u'Svet'                  , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger' )
        ,(u'Pregled stampe'        , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger')
        ,(u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science')
        ,(u'Kultura'               , u'http://rss.dw-world.de/rdf/rss-ser-cul' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup
80
resources/recipes/el_pais_uy.recipe
Normal file
@@ -0,0 +1,80 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.elpais.com.uy/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'El Pais - Uruguay'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay y el resto del mundo'
    publisher = 'EL PAIS S.A.'
    category = 'news, politics, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 2
    encoding = 'iso-8859-1'
    masthead_url = 'http://www.elpais.com.uy/Images/09/cabezal/logo_PDEP.png'
    publication_type = 'newspaper'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 200
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'id':'Contenido'})
    ]

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : language
    }
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object','form']),
        dict(name=['object','table']) ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        body{font-family: Verdana,Arial,Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block;}
    '''
    feeds = [
        (u'Ultimo Momento', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=umomento'),
        (u'Editorial', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=editorial'),
        (u'Nacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=nacional'),
        (u'Internacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=internacional'),
        (u'Espectaculos', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=espectaculos'),
        (u'Deportes', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=deportes'),
        (u'Ciudades', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=ciudades'),
        (u'Economia', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=economia')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.elpais.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('div',attrs={'class':'boxmedio box257'})
        print link_item
        if link_item:
            cover_url = 'http://www.elpais.com.uy'+link_item.img['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
20
resources/recipes/endgadget_ja.recipe
Normal file
@@ -0,0 +1,20 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
japan.engadget.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class EndgadgetJapan(BasicNewsRecipe):
    title = u'Endgadget\u65e5\u672c\u7248'
    language = 'ja'
    __author__ = 'Hiroshi Miura'
    cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg'
    masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
87
resources/recipes/esenja.recipe
Normal file
@@ -0,0 +1,87 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re

class Esensja(BasicNewsRecipe):

    title = u'Esensja'
    __author__ = 'matek09'
    description = 'Monthly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    HREF = '0'

    #keep_only_tags =[]
    #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
    remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
    remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))

    remove_tags =[]
    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
    remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))

    extra_css = '''
        .t-title {font-size: x-large; font-weight: bold; text-align: left}
        .t-author {font-size: x-small; text-align: left}
        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
        .text {font-size: small; text-align: left}
        .annot-ref {font-style: italic; text-align: left}
    '''

    preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
                           lambda match: '')]

    def parse_index(self):
        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
        a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
        year = a['href'].split('/')[0]
        month = a['href'].split('/')[1]
        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
        soup = self.index_to_soup(self.HREF + '01.html')
        self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
        feeds = []
        intro = soup.find('div', attrs={'class' : 'n-title'})
        introduction = {'title' : self.tag_to_string(intro.a),
                        'url' : self.HREF + intro.a['href'],
                        'date' : '',
                        'description' : ''}
        chapter = 'Wprowadzenie'
        subchapter = ''
        articles = []
        articles.append(introduction)
        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
            if tag.name in 'td':
                if len(articles) > 0:
                    section = chapter
                    if len(subchapter) > 0:
                        section += ' - ' + subchapter
                    feeds.append((section, articles))
                    articles = []
                if tag['class'] == 'chapter':
                    chapter = self.tag_to_string(tag).capitalize()
                    subchapter = ''
                else:
                    subchapter = self.tag_to_string(tag)
                continue
            articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})

            a = self.index_to_soup(self.HREF + tag.a['href'])
            i = 1
            while True:
                div = a.find('div', attrs={'class' : 't-title2 nextpage'})
                if div is not None:
                    a = self.index_to_soup(self.HREF + div.a['href'])
                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
                    i = i + 1
                else:
                    break

        return feeds
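The while True loop at the end of parse_index above follows "next page" links until none remain, appending each continuation page as its own article entry. The same follow-the-link pattern in isolation, as a sketch over an abstract page fetcher (next_url_of is a placeholder for "parse the page and find the next-page link, or None"):

def collect_pages(first_url, next_url_of):
    # Walk a chain of pages starting at first_url, stopping when there is
    # no further "next page" link.
    pages = [first_url]
    url = first_url
    while True:
        url = next_url_of(url)
        if url is None:
            break
        pages.append(url)
    return pages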
@@ -1,67 +1,61 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Justus Bisser <justus.bisser at gmail.com>'
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Christian Schmitt'

'''
fr-online.de
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe

class Spiegel_ger(BasicNewsRecipe):
    title = 'Frankfurter Rundschau'
    __author__ = 'Justus Bisser'
    description = "Dies ist die Online-Ausgabe der Frankfurter Rundschau. Um die abgerufenen individuell einzustellen bearbeiten sie die Liste im erweiterten Modus. Die Feeds findet man auf http://www.fr-online.de/verlagsservice/fr_newsreader/?em_cnt=574255"
    publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
    category = 'FR Online, Frankfurter Rundschau, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'de'
    lang = 'de-DE'
    no_stylesheets = True
    use_embedded_content = False
    #encoding = 'cp1252'
class FROnlineRecipe(BasicNewsRecipe):
    title = 'Frankfurter Rundschau'
    __author__ = 'maccs'
    description = 'Nachrichten aus D und aller Welt'
    encoding = 'utf-8'
    masthead_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
    publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
    category = 'news, germany, world'
    language = 'de'
    publication_type = 'newspaper'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 1  # Increase this number if you're interested in older articles
    max_articles_per_feed = 50  # Seems a reasonable number to me
    extra_css = '''
        body { font-family: "arial", "verdana", "geneva", sans-serif; font-size: 12px; margin: 0px; background-color: #ffffff;}
        .imgSubline{background-color: #f4f4f4; font-size: 0.8em;}
        .p--heading-1 {font-weight: bold;}
        .calibre_navbar {font-size: 0.8em; font-family: "arial", "verdana", "geneva", sans-serif;}
    '''
    remove_tags = [dict(name='div', attrs={'id':'Logo'})]
    cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
    cover_margins = (100, 150, '#ffffff')

    conversion_options = {
        'comment' : description
        , 'tags' : category
        , 'publisher' : publisher
        , 'language' : lang
    }

    recursions = 0
    max_articles_per_feed = 100
    #keep_only_tags = [dict(name='div', attrs={'class':'text'})]
    #tags_remove = [dict(name='div', attrs={'style':'text-align: left; margin: 4px 0px 0px 4px; width: 200px; float: right;'})]
    remove_attributes = ['style']
    feeds = []
    #remove_tags_before = [dict(name='div', attrs={'style':'padding-left: 0px;'})]
    #remove_tags_after = [dict(name='div', attrs={'class':'box_head_text'})]
    feeds = []
    feeds.append(('Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'))
    feeds.append(('Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'))
    feeds.append(('Meinung', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'))
    feeds.append(('Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'))
    feeds.append(('Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'))
    feeds.append(('Eintracht Frankfurt', u'http://www.fr-online.de/sport/eintracht-frankfurt/-/1473446/1473446/-/view/asFeed/-/index.xml'))
    feeds.append(('Kultur und Medien', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'))
    feeds.append(('Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'))
    feeds.append(('Frankfurt', u'http://www.fr-online.de/frankfurt/-/1472798/1472798/-/view/asFeed/-/index.xml'))
    feeds.append(('Rhein-Main', u'http://www.fr-online.de/rhein-main/-/1472796/1472796/-/view/asFeed/-/index.xml'))
    feeds.append(('Hanau', u'http://www.fr-online.de/rhein-main/hanau/-/1472866/1472866/-/view/asFeed/-/index.xml'))
    feeds.append(('Darmstadt', u'http://www.fr-online.de/rhein-main/darmstadt/-/1472858/1472858/-/view/asFeed/-/index.xml'))
    feeds.append(('Wiesbaden', u'http://www.fr-online.de/rhein-main/wiesbaden/-/1472860/1472860/-/view/asFeed/-/index.xml'))
    feeds.append(('Offenbach', u'http://www.fr-online.de/rhein-main/offenbach/-/1472856/1472856/-/view/asFeed/-/index.xml'))
    feeds.append(('Bad Homburg', u'http://www.fr-online.de/rhein-main/bad-homburg/-/1472864/1472864/-/view/asFeed/-/index.xml'))
    feeds.append(('Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'))
    feeds.append(('Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml'))

    # enable for all news
    allNews = 0
    if allNews:
        feeds = [(u'Frankfurter Rundschau', u'http://www.fr-online.de/rss/sport/index.xml')]
    else:
        #select the feeds you like
        feeds = [(u'Nachrichten', u'http://www.fr-online.de/rss/politik/index.xml')]
        feeds.append((u'Kommentare und Analysen', u'http://www.fr-online.de/rss/meinung/index.xml'))
        feeds.append((u'Dokumentationen', u'http://www.fr-online.de/rss/dokumentation/index.xml'))
        feeds.append((u'Deutschlandtrend', u'http://www.fr-online.de/rss/deutschlandtrend/index.xml'))
        feeds.append((u'Wirtschaft', u'http://www.fr-online.de/rss/wirtschaft/index.xml'))
        feeds.append((u'Sport', u'http://www.fr-online.de/rss/sport/index.xml'))
        feeds.append((u'Feuilleton', u'http://www.fr-online.de/rss/feuilleton/index.xml'))
        feeds.append((u'Panorama', u'http://www.fr-online.de/rss/panorama/index.xml'))
        feeds.append((u'Rhein Main und Hessen', u'http://www.fr-online.de/rss/hessen/index.xml'))
        feeds.append((u'Fitness und Gesundheit', u'http://www.fr-online.de/rss/fit/index.xml'))
        feeds.append((u'Multimedia', u'http://www.fr-online.de/rss/multimedia/index.xml'))
        feeds.append((u'Wissen und Bildung', u'http://www.fr-online.de/rss/wissen/index.xml'))

    def get_article_url(self, article):
        url = article.link
        regex = re.compile("0C[0-9]{6,8}0A?")
    def print_version(self, url):
        return url.replace('index.html', 'view/printVersion/-/index.html')

        liste = regex.findall(url)
        string = liste.pop(0)
        string = string[2:len(string)-1]
        return "http://www.fr-online.de/_em_cms/_globals/print.php?em_cnt=" + string

100
resources/recipes/freeway.recipe
Normal file
@@ -0,0 +1,100 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://freeway.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class':'titulo_art_ppal'}),
        dict(name='img', attrs={'class':'recuadro'}),
        dict(name='td', attrs={'class':'txt_art_ppal'})
    ]
    remove_tags = [
        dict(name=['object','link'])
    ]
    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    def parse_index(self):
        feeds = []
        for title, url in [('Articulos', 'http://freeway.com.uy/revista/')]:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def art_parse_section(self, url):
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'id': 'tbl_1'})

        current_articles = []
        for tag in div.findAllNext(attrs = {'class': 'ancho_articulos'}):
            if tag.get('class') == 'link-list-heading':
                break
            for td in tag.findAll('td'):
                a = td.find('a', attrs= {'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://freeway.com.uy'+url
                p = td.find('p', attrs= {'class': 'txt_articulos'})
                description = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', description)
                current_articles.append({'title': title, 'url': url, 'description':description, 'date':''})

        return current_articles

    def preprocess_html(self, soup):
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        #index = 'http://www.cosmohispano.com/revista'
        #soup = self.index_to_soup(index)
        #link_item = soup.find('img',attrs={'class':'img_portada'})
        #if link_item:
        #    cover_url = "http://www.cosmohispano.com"+link_item['src']
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'
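The preprocess_html used by the cosmopolitan and freeway recipes above flattens table markup into divs and strips the listed layout attributes, which linearizes old table-based pages for e-readers. The core of that transformation, sketched against the BeautifulSoup 3 API these recipes already use (has_key is the BS3 spelling; the constant names are mine):

TABLE_TAGS = ['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']
LAYOUT_ATTRS = ['style','font','valign','colspan','width','height',
                'rowspan','summary','align','cellspacing','cellpadding',
                'frames','rules','border']

def flatten_tables(soup):
    # Rename every table-related tag to div and drop pure-layout attributes,
    # leaving the textual content in document order.
    for item in soup.body.findAll(name=TABLE_TAGS):
        item.name = 'div'
        for attrib in LAYOUT_ATTRS:
            if item.has_key(attrib):
                del item[attrib]
    return soup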
46
resources/recipes/globaltimes.recipe
Normal file
@@ -0,0 +1,46 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class globaltimes(BasicNewsRecipe):
    title = u'Global Times'
    __author__ = 'malfi'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    cover_url = 'http://enhimg2.huanqiu.com/images/logo.png'
    language = 'en'
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'content'}))
    remove_tags = []
    remove_tags.append(dict(name = 'div', attrs = {'class': 'location'}))
    remove_tags.append(dict(name = 'div', attrs = {'class': 'contentpage'}))
    remove_tags.append(dict(name = 'li', attrs = {'id': 'pl'}))

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''
    def parse_index(self):
        catnames = {}
        catnames["http://china.globaltimes.cn/chinanews/"] = "China Politics"
        catnames["http://china.globaltimes.cn/diplomacy/"] = "China Diplomacy"
        catnames["http://military.globaltimes.cn/china/"] = "China Military"
        catnames["http://business.globaltimes.cn/china-economy/"] = "China Economy"
        catnames["http://world.globaltimes.cn/asia-pacific/"] = "Asia Pacific"
        feeds = []

        for cat in catnames.keys():
            articles = []
            soup = self.index_to_soup(cat)
            for a in soup.findAll('a',attrs={'href' : re.compile(cat+"201[0-9]-[0-1][0-9]/[0-9][0-9][0-9][0-9][0-9][0-9].html")}):
                url = a['href'].strip()
                myarticle=({'title':self.tag_to_string(a), 'url':url, 'description':'', 'date':''})
                self.log("found %s" % url)
                articles.append(myarticle)
                self.log("Adding URL %s\n" %url)
            if articles:
                feeds.append((catnames[cat], articles))
        return feeds
47
resources/recipes/globes_co_il.recipe
Normal file
@@ -0,0 +1,47 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import re

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'This is Globes.co.il.'
    cover_url = 'http://www.the7eye.org.il/SiteCollectionImages/BAKTANA/arye_avnery_010709_377.jpg'
    title = u'Globes'
    language = 'he'
    __author__ = 'marbs'
    extra_css='img {max-width:100%;} body{direction: rtl;max-width:100%;}title{direction: rtl; } article_description{direction: rtl; }, a.article{direction: rtl;max-width:100%;} calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_attributes = ['width','style']

    feeds = [(u'שוק ההון', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=585'),
             (u'נדל"ן', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=607'),
             (u'וול סטריט ושווקי העולם', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=1225'),
             (u'ניתוח טכני', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=1294'),
             (u'היי טק', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=594'),
             (u'נתח שוק וצרכנות', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=821'),
             (u'דין וחשבון', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=829'),
             (u'רכב', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3220'),
             (u'דעות', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=845'),
             (u'קניון המניות - טור שבועי', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3175'),
             (u'סביבה', u'http://www.globes.co.il/webservice/rss/rssfeeder.asmx/FeederNode?iID=3221')]

    def print_version(self, url):
        split1 = url.split("=")
        print_url = 'http://www.globes.co.il/serve/globes/printwindow.asp?did=' + split1[1]
        return print_url

    def preprocess_html(self, soup):
        soup.find('tr',attrs={'bgcolor':'black'}).findPrevious('tr').extract()
        soup.find('tr',attrs={'bgcolor':'black'}).extract()
        return soup

    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub(u"\x91", u"\u2018", string)
        return fixed
38
resources/recipes/hannoversche_zeitung.recipe
Normal file
@@ -0,0 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1287519083(BasicNewsRecipe):
    title = u'Hannoversche Allgemeine Zeitung'
    oldest_article = 1
    __author__ = 'Artemis'
    max_articles_per_feed = 30
    language = 'de'
    no_stylesheets = True

    feeds = [
        #(u'Schlagzeilen', u'http://www.haz.de/rss/feed/haz_schlagzeilen'),
        (u'Politik', u'http://www.haz.de/rss/feed/haz_politik'),
        (u'Wirtschaft', u'http://www.haz.de/rss/feed/haz_wirtschaft'),
        (u'Panorama', u'http://www.haz.de/rss/feed/haz_panorama'),
        (u'Wissen', u'http://www.haz.de/rss/feed/haz_wissen'),
        (u'Kultur', u'http://www.haz.de/rss/feed/haz_kultur'),
        (u'Sp\xe4tvorstellung', u'http://www.haz.de/rss/feed/haz_spaetvorstellung'),
        (u'Hannover & Region', u'http://www.haz.de/rss/feed/haz_hannoverregion'),
        (u'Netzgefl\xfcster', u'http://www.haz.de/rss/feed/haz_netzgefluester'),
        (u'Meinung', u'http://www.haz.de/rss/feed/haz_meinung'),
        (u'ZiSH', u'http://www.haz.de/rss/feed/haz_zish'),
        (u'Medien', u'http://www.haz.de/rss/feed/haz_medien'),
        #(u'Sport', u'http://www.haz.de/rss/feed/haz_sport'),
        #(u'Hannover 96', u'http://www.haz.de/rss/feed/haz_hannover96')
    ]

    remove_tags_before = dict(id='modul_artikel')
    remove_tags_after = dict(id='articlecontent')

    remove_tags = [
        dict(id='articlesidebar'),
        dict(name='div', attrs={'class':['articlecomment',
            'articlebookmark', 'teaser_anzeige', 'teaser_umfrage',
            'navigation', 'subnavigation']})
    ]
59
resources/recipes/histmag.recipe
Normal file
@@ -0,0 +1,59 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re

class Histmag(BasicNewsRecipe):

    title = u'Histmag'
    __author__ = 'matek09'
    description = u"Artykuly historyczne i publicystyczne"
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    #max_articles_per_feed = 1
    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
    remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
    #keep_only_tags =[]
    #keep_only_tags.append(dict(name = 'h2'))
    #keep_only_tags.append(dict(name = 'p'))

    remove_tags =[]
    remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
    remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))

    preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),
                          (re.compile(r'<span>'), lambda match: '<br><br><span>')]
    extra_css = '''
        .left {font-size: x-small}
        .right {font-size: x-small}
    '''

    def find_articles(self, soup):
        articles = []
        for div in soup.findAll('div', attrs={'class' : 'text'}):
            articles.append({
                'title' : self.tag_to_string(div.h3.a),
                'url' : 'http://www.histmag.org/' + div.h3.a['href'],
                'date' : self.tag_to_string(div.next('p')).split('|')[0],
                'description' : self.tag_to_string(div.next('p', podpis=False)),
            })
        return articles

    def parse_index(self):
        soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
        feeds = []
        feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
        soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
        feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
        soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
        feeds.append((u"Wydarzenia", self.find_articles(soup)))

        return feeds
@@ -13,7 +13,6 @@ class IrishTimes(BasicNewsRecipe):
    language = 'en_IE'
    timefmt = ' (%A, %B %d, %Y)'

    oldest_article = 3
    no_stylesheets = True
    simultaneous_downloads= 1
@@ -35,12 +34,11 @@ class IrishTimes(BasicNewsRecipe):

    def print_version(self, url):
        if url.count('rss.feedsportal.com'):
            u = 'http://www.irishtimes.com' + \
                (((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01..htm','_pf.html')
            u = 'http://www.irishtimes.com' + \
                (((url[69:].replace('0C','/')).replace('0A','0'))).replace('0Bhtml/story01.htm','_pf.html')
        else:
            u = url.replace('.html','_pf.html')
        return u

    def get_article_url(self, article):
        return article.link
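The Irish Times hunk above fixes a one-character bug in the feedsportal decoding ('story01..htm' versus 'story01.htm'): feedsportal links encode the real article path, with '0C' standing in for '/' and '0A' for '0'. A standalone sketch of the decode step, where the tail argument is the encoded remainder of the feed URL (the slicing offset used by the recipe is site-specific and omitted here):

def decode_feedsportal(tail):
    # '0C' encodes '/', '0A' encodes '0'; the trailing marker is swapped
    # for the printable-version suffix.
    return ('http://www.irishtimes.com' +
            tail.replace('0C', '/')
                .replace('0A', '0')
                .replace('0Bhtml/story01.htm', '_pf.html'))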
31
resources/recipes/jijinews.recipe
Normal file
@@ -0,0 +1,31 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.jiji.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class JijiDotCom(BasicNewsRecipe):
    title = u'\u6642\u4e8b\u901a\u4fe1'
    __author__ = 'Hiroshi Miura'
    description = 'World News from Jiji Press'
    publisher = 'Jiji Press Ltd.'
    category = 'news'
    oldest_article = 6
    max_articles_per_feed = 100
    encoding = 'euc_jisx0213'
    language = 'ja'
    masthead_url = 'http://jen.jiji.com/images/logo_jijipress.gif'
    top_url = 'http://www.jiji.com/'

    feeds = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')]
    remove_tags_after = dict(id="ad_google")

    def get_cover_url(self):
        cover_url = 'http://www.jiji.com/img/top_header_logo2.gif'
        soup = self.index_to_soup(self.top_url)
        cover_item = soup.find('div', attrs={'class':'top-pad-photos'})
        if cover_item:
            cover_url = self.top_url + cover_item.img['src']
        return cover_url
48
resources/recipes/la_diaria.recipe
Normal file
@@ -0,0 +1,48 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
ladiaria.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'La Diaria'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['article'])]
    remove_tags = [
        dict(name='div', attrs={'class':['byline', 'hr', 'titlebar', 'volver-arriba-right']}),
        dict(name='div', attrs={'id':'discussion'}),
        dict(name=['object','link'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Articulos', u'http://ladiaria.com/feeds/articulos')
    ]

    def get_cover_url(self):
        return 'http://ladiaria.com/edicion/imagenportada/'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@ -8,7 +8,7 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class LaRazon_Bol(BasicNewsRecipe):
    title = 'La Razón - Bolivia'
    title = u'La Razón - Bolivia'
    __author__ = 'Darko Miletic'
    description = 'El diario nacional de Bolivia'
    publisher = 'Praxsis S.R.L.'

54
resources/recipes/la_rioja.recipe
Normal file
@ -0,0 +1,54 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.larioja.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class LaRioja(BasicNewsRecipe):
    title = 'La Rioja'
    __author__ = 'Arturo Martinez Nieves'
    description = 'Noticias de La Rioja y el resto del mundo'
    publisher = 'La Rioja'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True
    masthead_url = 'http://www.larioja.com/includes/manuales/larioja/include-lariojapapeldigital-zonac-fondocabecera01.jpg'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    conversion_options = {
          'comment'   : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
        }

    keep_only_tags = [
         dict(attrs={'id':'title'})
        ,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
        ]
    remove_tags = [dict(name='ul')]
    remove_attributes = ['width','height']


    feeds = [
          (u'Ultimas Noticias' , u'http://www.larioja.com/rss/feeds/ultima.xml'  )
         ,(u'Portada'          , u'http://www.larioja.com/rss/feeds/portada.xml' )
         ,(u'Mundo'            , u'http://www.larioja.com/rss/feeds/mundo.xml'   )
         ,(u'Espana'           , u'http://www.larioja.com/rss/feeds/espana.xml'  )
         ,(u'Region'           , u'http://www.larioja.com/rss/feeds/region.xml'  )
         ,(u'Comarcas'         , u'http://www.larioja.com/rss/feeds/comarcas.xml')
         ,(u'Deportes'         , u'http://www.larioja.com/rss/feeds/deportes.xml')
         ,(u'Economia'         , u'http://www.larioja.com/rss/feeds/economia.xml')
         ,(u'Cultura'          , u'http://www.larioja.com/rss/feeds/cultura.xml' )
         ,(u'Opinion'          , u'http://www.larioja.com/rss/feeds/opinion.xml' )
         ,(u'Sociedad'         , u'http://www.larioja.com/rss/feeds/sociedad.xml')

        ]

@ -21,10 +21,13 @@ class Lanacion(BasicNewsRecipe):
    remove_empty_feeds = True
    masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif'
    extra_css = """ h1{font-family: Georgia,serif}
                    h2{color: #626262}
                    body{font-family: Arial,sans-serif}
                    img{margin-top: 0.5em; margin-bottom: 0.2em}
                    img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                    .notaFecha{color: #808080}
                    .notaEpigrafe{font-size: x-small}
                    .topNota h1{font-family: Arial,sans-serif} """
                    .topNota h1{font-family: Arial,sans-serif}
                """


    conversion_options = {
@ -38,12 +41,12 @@ class Lanacion(BasicNewsRecipe):
    remove_tags = [
         dict(name='div' , attrs={'class':'notaComentario floatFix noprint' })
        ,dict(name='ul'  , attrs={'class':['cajaHerramientas cajaTop noprint','herramientas noprint']})
        ,dict(name='div' , attrs={'class':'cajaHerramientas noprint' })
        ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix']})
        ,dict(name=['iframe','embed','object','form','base','hr'])
        ,dict(name='div' , attrs={'class':['cajaHerramientas noprint','cajaHerramientas floatFix'] })
        ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix','videoCentro']})
        ,dict(name=['iframe','embed','object','form','base','hr','meta','link','input'])
        ]
    remove_tags_after = dict(attrs={'class':['tags','nota-destacado']})
    remove_attributes = ['height','width','visible']
    remove_attributes = ['height','width','visible','onclick','data-count','name']

    feeds = [
        (u'Ultimas noticias' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?origen=2' )

40
resources/recipes/mainichi.recipe
Normal file
@ -0,0 +1,40 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.mainichi.jp
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class MainichiDailyNews(BasicNewsRecipe):
    title = u'\u6bce\u65e5\u65b0\u805e'
    __author__ = 'Hiroshi Miura'
    oldest_article = 2
    max_articles_per_feed = 20
    description = 'Japanese traditional newspaper Mainichi Daily News'
    publisher = 'Mainichi Daily News'
    category = 'news, japan'
    language = 'ja'

    feeds = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')]

    remove_tags_before = {'class':"NewsTitle"}
    remove_tags = [{'class':"RelatedArticle"}]
    remove_tags_after = {'class':"Credit"}

    def parse_feeds(self):

        feeds = BasicNewsRecipe.parse_feeds(self)

        for curfeed in feeds:
            delList = []
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds
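The parse_feeds() override above drops pheedo.jp ad entries by collecting them in delList and then splicing them out by index. An equivalent, more compact formulation (a sketch, not part of the recipe; it assumes the same parse_feeds context and the re import above) keeps only the wanted articles in one pass:

# sketch: the same pheedo.jp filtering expressed as a list comprehension
for curfeed in feeds:
    curfeed.articles = [a for a in curfeed.articles
                        if not re.search(r'pheedo\.jp', a.url)]
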
32
resources/recipes/mainichi_it_news.recipe
Normal file
@ -0,0 +1,32 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class MainichiDailyITNews(BasicNewsRecipe):
    title = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)'
    __author__ = 'Hiroshi Miura'
    oldest_article = 2
    max_articles_per_feed = 100
    description = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics'
    publisher = 'Mainichi Daily News'
    category = 'news, Japan, IT, Electronics'
    language = 'ja'

    feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')]

    remove_tags_before = {'class':"NewsTitle"}
    remove_tags = [{'class':"RelatedArticle"}]
    remove_tags_after = {'class':"Credit"}

    def parse_feeds(self):

        feeds = BasicNewsRecipe.parse_feeds(self)

        for curfeed in feeds:
            delList = []
            for a,curarticle in enumerate(curfeed.articles):
                if re.search(r'pheedo.jp', curarticle.url):
                    delList.append(curarticle)
            if len(delList)>0:
                for d in delList:
                    index = curfeed.articles.index(d)
                    curfeed.articles[index:index+1] = []

        return feeds

22
resources/recipes/matichon.recipe
Normal file
@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290412756(BasicNewsRecipe):
    __author__ = 'Anat R.'
    title = u'Matichon'
    oldest_article = 7
    language = 'th'
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    feeds = [(u'News', u'http://www.matichon.co.th/rss/news_article.xml'),
             (u'Columns', u'http://www.matichon.co.th/rss/news_columns.xml'),
             (u'Politics', u'http://www.matichon.co.th/rss/news_politic.xml'),
             (u'Business', u'http://www.matichon.co.th/rss/news_business.xml'),
             (u'World', u'http://www.matichon.co.th/rss/news_world.xml'),
             (u'Sports', u'http://www.matichon.co.th/rss/news_sport.xml'),
             (u'Entertainment', u'http://www.matichon.co.th/rss/news_entertainment.xml')]
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'h3', attrs = {'class' : 'read-h'}))
    keep_only_tags.append(dict(name = 'p', attrs = {'class' : 'read-time'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-content'}))
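The three append() calls above build the keep_only_tags whitelist incrementally; the same whitelist can be declared in one literal (equivalent sketch):

keep_only_tags = [
    dict(name='h3', attrs={'class': 'read-h'}),
    dict(name='p', attrs={'class': 'read-time'}),
    dict(name='div', attrs={'class': 'news-content'}),
]
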
@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
'''
modified from Singtao Toronto calibre recipe by rty
Change Log:
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
            ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of finance section
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
            in section/article list.
2010/10/31: skip repeated articles in section pages
'''

import datetime
import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested

class AdvancedUserRecipe1278063072(BasicNewsRecipe):

from calibre import __appname__, strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.date import now as nowf

class MPHKRecipe(BasicNewsRecipe):
    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
    #extra_css = 'img {float:right; margin:4px;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
                      dict(attrs={'class':['photo']}),
                      dict(attrs={'id':['newscontent']}),
                      dict(attrs={'id':['newscontent01','newscontent02']})]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']})]  # for the finance page
    remove_attributes = ['width']
    preprocess_regexps = [
                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                           lambda match: '<h1>'),
                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                           lambda match: '</h1>'),
                         ]

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit, add an additional
        # '_' at the front
        # not working, may need to move this to preprocess_html() method
        # (the original attempt scanned for each digit '0'-'9' by hand; the
        # commented sketch below is an equivalent, shorter formulation)
        #m = re.search(r'\d', url)
        #if m:
        #    minIdx = m.start()
        #    return url[0:minIdx] + '_' + url[minIdx+1:]
        return url

    def get_fetchdate(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time - at around HKT 5.30am, all news are available
        dt_local = dt_utc - datetime.timedelta(-2.5/24)
        # convert UTC to local hk time - at around HKT 6.00am, all news are available
        dt_local = dt_utc - datetime.timedelta(-2.0/24)
        return dt_local.strftime("%Y%m%d")

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        # special - finance
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
        if fin_articles:
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
        # special - eco-friendly
        # eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
        # if eco_articles:
        #     feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
        # special - entertainment
        #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
        #if ent_articles:
        #    feeds.append(('Entertainment', ent_articles))
        return feeds

    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
        current_articles = []
        included_urls = []
        # the section page repeats articles; walking the list in reverse and
        # skipping urls already seen keeps one copy of each, and the second
        # reverse below restores the original ordering
        divs.reverse()
        for i in divs:
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' +url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href= True)
        current_articles = []
        for i in a:
            url = i.get('href', False)
            if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
                title = self.tag_to_string(i)
                url = 'http://www.mpfinance.com/cfm/' +url
                current_articles.append({'title': title, 'url': url, 'description':''})
        return current_articles

    def parse_eco_section(self, url):
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet']})
        current_articles = []
@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
            a = i.find('a', href = True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' +url
            if url not in included_urls:
            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    #def parse_ent_section(self, url):
    #    dateStr = self.get_fetchdate()
    #    soup = self.index_to_soup(url)
    #    a = soup.findAll('a', href=True)
    #    current_articles = []
    #    included_urls = []
    #    for i in a:
    #        title = self.tag_to_string(i)
    #        url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
    #        if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
    #            current_articles.append({'title': title, 'url': url, 'description': ''})
    #    return current_articles

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        # match on the attribute actually being deleted; width=True and
        # align='absmiddle' (a legacy align value) are the likely intent here
        for item in soup.findAll(width=True):
            del item['width']
        for item in soup.findAll(align='absmiddle'):
            del item['align']
        return soup

    def create_opf(self, feeds, dir=None):
        #super(MPHKRecipe,self).create_opf(feeds, dir)
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        mi.publication_type = self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.pubdate = nowf()
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth, description=desc)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))

        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)

56
resources/recipes/montevideo_com.recipe
Normal file
@ -0,0 +1,56 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.montevideo.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = 'Montevideo COMM'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['txt'])]
    remove_tags = [
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''
    feeds = [
        (u'Destacados', u'http://www.montevideo.com.uy/anxml.aspx?58'),
        (u'Noticias', u'http://www.montevideo.com.uy/anxml.aspx?59'),
        (u'Tecnologia', u'http://www.montevideo.com.uy/anxml.aspx?133'),
        (u'Tiempo Libre', u'http://www.montevideo.com.uy/anxml.aspx?60'),
        # (u'Deportes', u'http://www.montevideo.com.uy/anxml.aspx?968'),
        # (u'Pantallazo', u'http://www.montevideo.com.uy/anxml.aspx?1022'),
        (u'Gastronomia', u'http://www.montevideo.com.uy/anxml.aspx?1023')
    ]

    def get_cover_url(self):
        return 'http://sphotos.ak.fbcdn.net/hphotos-ak-snc1/hs276.snc1/10319_147339559330_147337559330_2625816_6636564_n.jpg'


    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

@ -1,31 +1,33 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
moscowtimes.ru
www.themoscowtimes.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Moscowtimes(BasicNewsRecipe):
    title = u'The Moscow Times'
    title = 'The Moscow Times'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'News from Russia'
    language = 'en'
    lang = 'en'
    oldest_article = 7
    description = 'The Moscow Times is a daily English-language newspaper featuring objective, reliable news on business, politics, sports and culture in Moscow, in Russia and the former Soviet Union (CIS).'
    category = 'Russia, Moscow, Russian news, Moscow news, Russian newspaper, daily news, independent news, reliable news, USSR, Soviet Union, CIS, Russian politics, Russian business, Russian culture, Russian opinion, St Petersburg, Saint Petersburg'
    publisher = 'The Moscow Times'
    language = 'en'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    #encoding = 'utf-8'
    encoding = 'cp1252'
    remove_javascript = True
    remove_empty_feeds = True
    encoding = 'cp1251'
    masthead_url = 'http://www.themoscowtimes.com/bitrix/templates/tmt/img/logo.gif'
    publication_type = 'newspaper'

    conversion_options = {
        'comment' : description
        , 'language' : lang
        }
          'comment'   : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
        }

    extra_css = '''
        h1{ color:#0066B3; font-family: Georgia,serif ; font-size: large}
@ -35,39 +37,37 @@ class Moscowtimes(BasicNewsRecipe):
        .text{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size:75%; }
        '''
    feeds = [
        (u'The Moscow Times Top Stories' , u'http://www.themoscowtimes.com/rss/top'),
        (u'The Moscow Times Current Issue' , u'http://www.themoscowtimes.com/rss/issue'),
        (u'The Moscow Times News' , u'http://www.themoscowtimes.com/rss/news'),
        (u'The Moscow Times Business' , u'http://www.themoscowtimes.com/rss/business'),
        (u'The Moscow Times Art and Ideas' , u'http://www.themoscowtimes.com/rss/art'),
        (u'The Moscow Times Opinion' , u'http://www.themoscowtimes.com/rss/opinion')
         (u'Top Stories'   , u'http://www.themoscowtimes.com/rss/top'     )
        ,(u'Current Issue' , u'http://www.themoscowtimes.com/rss/issue'   )
        ,(u'News'          , u'http://www.themoscowtimes.com/rss/news'    )
        ,(u'Business'      , u'http://www.themoscowtimes.com/rss/business')
        ,(u'Art and Ideas' , u'http://www.themoscowtimes.com/rss/art'     )
        ,(u'Opinion'       , u'http://www.themoscowtimes.com/rss/opinion' )
        ]

    keep_only_tags = [
        dict(name='div', attrs={'class':['newstextblock']})
        ]

    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    remove_tags = [
        dict(name='div', attrs={'class':['photo_nav']})
        ]
         dict(name='div', attrs={'class':['photo_nav','phototext']})
        ,dict(name=['iframe','meta','base','link','embed','object'])
        ]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
        soup.head.insert(0,mtag)

        return self.adeify_images(soup)
        for lnk in soup.findAll('a'):
            if lnk.string is not None:
                ind = self.tag_to_string(lnk)
                lnk.replaceWith(ind)
        return soup

    def print_version(self, url):
        return url.replace('.themoscowtimes.com/','.themoscowtimes.com/print/')
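The print_version() rewrite above simply splices 'print/' into the host path; for a hypothetical article URL:

# http://www.themoscowtimes.com/news/article/some-story/423001.html
# becomes
# http://www.themoscowtimes.com/print/news/article/some-story/423001.html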

    def get_cover_url(self):

        cover_url = None
        href = 'http://www.themoscowtimes.com/pdf/'

        soup = self.index_to_soup(href)
        div = soup.find('div',attrs={'class':'left'})
        a = div.find('a')
        print a
        if a :
            cover_url = a.img['src']
        if div:
            a = div.find('a')
            if a :
                cover_url = 'http://www.themoscowtimes.com' + a.img['src']
        return cover_url

27
resources/recipes/msnsankei.recipe
Normal file
@ -0,0 +1,27 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
sankei.jp.msn.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class MSNSankeiNewsProduct(BasicNewsRecipe):
    title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)'
    __author__ = 'Hiroshi Miura'
    description = 'Products release from Japan'
    oldest_article = 7
    max_articles_per_feed = 100
    encoding = 'Shift_JIS'
    language = 'ja'
    cover_url = 'http://sankei.jp.msn.com/images/common/sankeShinbunLogo.jpg'
    masthead_url = 'http://sankei.jp.msn.com/images/common/sankeiNewsLogo.gif'

    feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]

    remove_tags_before = dict(id="__r_article_title__")
    remove_tags_after = dict(id="ajax_release_news")
    remove_tags = [{'class':"parent chromeCustom6G"},
                   dict(id="RelatedImg")
                  ]

11
resources/recipes/nacionred.recipe
Normal file
@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1291022049(BasicNewsRecipe):
    title = u'NacionRed.com'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'es'
    __author__ = 'Arturo Martinez Nieves'

    feeds = [(u'NacionRed.com', u'http://feeds.weblogssl.com/nacionred?format=xml')]

@ -22,8 +22,19 @@ class NewYorker(BasicNewsRecipe):
    masthead_url = 'http://www.newyorker.com/css/i/hed/logo.gif'
    extra_css = """
                body {font-family: "Times New Roman",Times,serif}
                .articleauthor{color: #9F9F9F; font-family: Arial, sans-serif; font-size: small; text-transform: uppercase}
                .rubric{color: #CD0021; font-family: Arial, sans-serif; font-size: small; text-transform: uppercase}
                .articleauthor{color: #9F9F9F;
                               font-family: Arial, sans-serif;
                               font-size: small;
                               text-transform: uppercase}
                .rubric,.dd,h6#credit{color: #CD0021;
                                      font-family: Arial, sans-serif;
                                      font-size: small;
                                      text-transform: uppercase}
                .descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
                .dd,h6#credit{color: gray}
                .c{display: block}
                .caption,h2#articleintro{font-style: italic}
                .caption{font-size: small}
                """

    conversion_options = {
@ -39,7 +50,7 @@ class NewYorker(BasicNewsRecipe):
        ]
    remove_tags = [
         dict(name=['meta','iframe','base','link','embed','object'])
        ,dict(attrs={'class':['utils','articleRailLinks','icons'] })
        ,dict(attrs={'class':['utils','socialUtils','articleRailLinks','icons'] })
        ,dict(attrs={'id':['show-header','show-footer'] })
        ]
    remove_attributes = ['lang']
@ -59,3 +70,13 @@ class NewYorker(BasicNewsRecipe):
            cover_url = 'http://www.newyorker.com' + cover_item['src'].strip()
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        auth = soup.find(attrs={'id':'articleauthor'})
        if auth:
            alink = auth.find('a')
            if alink and alink.string is not None:
                txt = alink.string
                alink.replaceWith(txt)
        return soup

@ -1,19 +1,22 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe

class Newsweek(BasicNewsRecipe):
    EDITION = 0
    FIND_LAST_FULL_ISSUE = True
    EDITION = '0'
    EXCLUDE_LOCKED = True
    LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'

    title = u'Newsweek Polska'
    __author__ = 'Mateusz Kielar'
    __author__ = 'matek09'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'en'
    language = 'pl'
    remove_javascript = True

    keep_only_tags = []
@ -33,34 +36,54 @@ class Newsweek(BasicNewsRecipe):
    def print_version(self, url):
        return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'

    def is_locked(self, a):
        if a.findNext('img')['src'] == 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif':
            return True
        else:
            return False

    def is_full(self, issue_soup):
        if len(issue_soup.findAll('img', attrs={'src' : 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'})) > 1:
            return False
        else:
            return True

    def find_last_full_issue(self):
        page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
        page = self.index_to_soup(issue)
        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
        page = self.index_to_soup(issue)
        self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
        frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
        while True:
            frame_soup = self.index_to_soup(frame_url)
            self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
            if self.is_full(issue_soup):
                break
            frame_url = 'http://www.newsweek.pl/Frames/' + frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']



    def parse_index(self):
        self.find_last_full_issue()
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
        if self.FIND_LAST_FULL_ISSUE:
            self.find_last_full_issue()
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
        img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
        self.cover_url = img['src']
        feeds = []
        parent = soup.find(id='content-left-big')
        for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
            section = self.tag_to_string(txt).capitalize()
            articles = list(self.find_articles(txt))
            feeds.append((section, articles))
            if len(articles) > 0:
                section = self.tag_to_string(txt).capitalize()
                feeds.append((section, articles))
        return feeds

    def find_articles(self, txt):
        for a in txt.findAllNext( attrs={'class':['strong','hr']}):
            if a.name == 'div':
                break
            if (not self.FIND_LAST_FULL_ISSUE) and self.EXCLUDE_LOCKED and self.is_locked(a):
                continue
            yield {
                'title' : self.tag_to_string(a),
                'url' : 'http://www.newsweek.pl'+a['href'],
                'url' : 'http://www.newsweek.pl' + a['href'],
                'date' : '',
                'description' : ''
            }

58
resources/recipes/nikkei_free.recipe
Normal file
@ -0,0 +1,58 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NikkeiNet(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free, MAX)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan, no subscription and getting max feed.'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'

    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
            ]

    remove_tags_before = dict(id="CONTENTS")
    remove_tags = [
                   dict(name="form"),
                   {'class':"cmn-hide"},
                  ]
    remove_tags_after = {'class':"cmn-pr_list"}

125
resources/recipes/nikkei_sub.recipe
Normal file
@ -0,0 +1,125 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_subscription(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(MAX)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan, gather MAX articles'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 10
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
                   {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
                   {'class':"cmn-article_keyword cmn-clearfix"},
                   {'class':"cmn-print_headline cmn-clearfix"},
                  ]
    remove_tags_after = {'class':"cmn-pr_list"}


    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")

            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br



    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
            ]

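The temporary-file dance in get_browser() above exists because the redirectFlag cookie is normally set by JavaScript, which mechanize does not execute; writing a Set-Cookie3 line to disk and load()-ing it into the LWPCookieJar plants the cookie by hand. A standalone sketch of the same idea (file suffix and cookie value are illustrative, not taken from the recipe):

import mechanize
from calibre.ptempfile import PersistentTemporaryFile

cj = mechanize.LWPCookieJar()
tmp = PersistentTemporaryFile('_cookies.lwp')
tmp.write("#LWP-Cookies-2.0\n")
tmp.write('Set-Cookie3: redirectFlag=1; domain=".nikkei.com"; path="/"; '
          'path_spec; secure; expires="2029-12-21 05:07:59Z"; version=0\n')
tmp.close()
cj.load(tmp.name)  # the jar now carries the cookie for subsequent requests
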
107
resources/recipes/nikkei_sub_economy.recipe
Normal file
@ -0,0 +1,107 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile

class NikkeiNet_sub_economy(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
                   {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
                   {'class':"cmn-article_keyword cmn-clearfix"},
                   {'class':"cmn-print_headline cmn-clearfix"},
                  ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
            ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")

            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

107
resources/recipes/nikkei_sub_industry.recipe
Normal file
@ -0,0 +1,107 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_industory(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
            ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)
            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

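One more detail of the login flow above: mechanize's form parser chokes on a disabled <input> element in the nikkei.com login page, which is why the recipe comments the element out of the fetched HTML before select_form(). A sketch of the workaround in isolation; it assumes, as the recipes do, that the offending input has id "j_id48" and is followed by a gm_home_on.gif image, both of which are simply whatever id.nikkei.com served at the time:

import mechanize

def strip_disabled_input(br):
    # Wrap the markup fragment mechanize cannot digest in an HTML comment,
    # then hand the patched page back to the browser object so that
    # select_form() parses cleanly. Markers mirror the recipes above.
    response = br.response()
    html = response.get_data()
    html = html.replace('<input id="j_id48"', '<!-- ')  # open comment before the bad input
    html = html.replace('gm_home_on.gif" />', ' -->')   # close it just after
    response.set_data(html)
    br.set_response(response)
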
104
resources/recipes/nikkei_sub_life.recipe
Normal file
@ -0,0 +1,104 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_life(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special')
            ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)
            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

103
resources/recipes/nikkei_sub_main.recipe
Normal file
@ -0,0 +1,103 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_main(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
        {'class':"cmn-article_list"},
        {'class':"cmn-dashedline"},
        {'class':"cmn-hide"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)
            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

102
resources/recipes/nikkei_sub_shakai.recipe
Normal file
@ -0,0 +1,102 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_shakai(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u793e\u4f1a)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [
        (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)
            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

108
resources/recipes/nikkei_sub_sports.recipe
Normal file
@ -0,0 +1,108 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_sports(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
        (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)
            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br

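The five nikkei_sub_* recipes above repeat the same get_browser() verbatim. A hypothetical tidy-up, not how these recipes are organized in the tree: the login and cookie dance could live once in a shared base class, with each edition declaring only its title and feeds. nikkei_login() is a made-up name standing in for the body of get_browser() shown above.

from calibre.web.feeds.recipes import BasicNewsRecipe

class NikkeiSubBase(BasicNewsRecipe):
    needs_subscription = True

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            self.nikkei_login(br)   # hypothetical helper: the login code above
        return br

class NikkeiSubKeiba(NikkeiSubBase):
    title = u'Nikkei (keiba)'
    feeds = [(u'\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba')]
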
@ -8,12 +8,15 @@ www.nin.co.rs

import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import entity_to_unicode

class Nin(BasicNewsRecipe):
    title = 'NIN online'
    __author__ = 'Darko Miletic'
    description = 'Nedeljne Informativne Novine'
    publisher = 'NIN d.o.o.'
    publisher = 'NIN d.o.o. - Ringier d.o.o.'
    category = 'news, politics, Serbia'
    no_stylesheets = True
    delay = 1
@ -26,18 +29,29 @@ class Nin(BasicNewsRecipe):
    use_embedded_content = False
    language = 'sr'
    publication_type = 'magazine'
    extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana, Lucida, sans1, sans-serif} .article_description{font-family: Verdana, Lucida, sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold; color: #900} .izjava{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold;} img{margin-top:0.5em; margin-bottom: 0.7em} b{margin-top: 1em} '
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Verdana, Lucida, sans1, sans-serif}
        .article_description{font-family: Verdana, Lucida, sans1, sans-serif}
        .artTitle{font-size: x-large; font-weight: bold; color: #900}
        .izjava{font-size: x-large; font-weight: bold}
        .columnhead{font-size: small; font-weight: bold;}
        img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
        b{margin-top: 1em}
    """

    conversion_options = {
        'comment'          : description
      , 'tags'             : category
      , 'publisher'        : publisher
      , 'language'         : language
      , 'linearize_tables' : True
        'comment'   : description
      , 'tags'      : category
      , 'publisher' : publisher
      , 'language'  : language
    }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    remove_attributes = ['height','width']
    preprocess_regexps = [
        (re.compile(r'</body>.*?<html>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')
      , (re.compile(r'</html>.*?</html>', re.DOTALL|re.IGNORECASE), lambda match: '</html>')
      , (re.compile(u'\u0110'), lambda match: u'\u00D0')
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
@ -50,7 +64,10 @@ class Nin(BasicNewsRecipe):
        return br

    keep_only_tags = [dict(name='td', attrs={'width':'520'})]
    remove_tags_before = dict(name='span', attrs={'class':'izjava'})
    remove_tags_after = dict(name='html')
    remove_tags = [dict(name=['object','link','iframe','meta','base'])]
    remove_attributes = ['border','background','height','width','align','valign']

    def get_cover_url(self):
        cover_url = None
@ -63,7 +80,7 @@ class Nin(BasicNewsRecipe):
    def parse_index(self):
        articles = []
        count = 0
        soup = self.index_to_soup(self.PREFIX)
        soup = self.index_to_soup(self.INDEX)
        for item in soup.findAll('a', attrs={'class':'lmeninavFont'}):
            count = count + 1
            if self.test and count > 2:
@ -90,3 +107,45 @@ class Nin(BasicNewsRecipe):
            articles.append((section,inarts))
        return articles

    def index_to_soup(self, url_or_raw, raw=False):
        if re.match(r'\w+://', url_or_raw):
            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
            with closing(open_func(url_or_raw)) as f:
                _raw = f.read()
            if not _raw:
                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
        else:
            _raw = url_or_raw
        if raw:
            return _raw
        if not isinstance(_raw, unicode) and self.encoding:
            if callable(self.encoding):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
        massage.append((re.compile(r'&(\S+?);'), lambda match:
            entity_to_unicode(match, encoding=enc)))
        massage.append((re.compile(r'[\x00-\x08]+'), lambda match: ''))
        return BeautifulSoup(_raw, markupMassage=massage)

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('div'):
            if len(item.contents) == 0:
                item.extract()
        for item in soup.findAll(['td','tr']):
            item.name = 'div'
        for item in soup.findAll('img'):
            if not item.has_key('alt'):
                item['alt'] = 'image'
        for tbl in soup.findAll('table'):
            img = tbl.find('img')
            if img:
                img.extract()
                tbl.replaceWith(img)
        return soup

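The index_to_soup() override added to Nin above exists mainly for its two extra markupMassage rules: HTML entities are resolved to unicode before parsing, and stray control bytes are dropped. A reduced sketch of just that mechanism; the sample markup is made up:

import re
from calibre import entity_to_unicode
from calibre.ebooks.BeautifulSoup import BeautifulSoup

# markupMassage rules are (pattern, substitution) pairs that BeautifulSoup
# applies to the raw markup before parsing. These two mirror the rules above.
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'),
                lambda match: entity_to_unicode(match, encoding='cp1252')))
massage.append((re.compile(r'[\x00-\x08]+'), lambda match: ''))
soup = BeautifulSoup(u'<p>Beograd &ndash; NIN</p>', markupMassage=massage)
print soup   # the &ndash; entity arrives as a real unicode dash
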
@ -13,6 +13,7 @@ class NowToronto(BasicNewsRecipe):
    title = u'Now Toronto'
    description = u'Now Toronto'
    __author__ = 'Starson17'
    language = 'en_CA'
    conversion_options = {
        'no_default_epub_cover' : True
    }

@ -7,14 +7,22 @@ nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from datetime import timedelta, date
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

    # set headlinesOnly to True for the headlines-only version
    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
    headlinesOnly = True

    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
    # will be included. Note: oldest_article is ignored if webEdition = False
    webEdition = False
    oldest_article = 7

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True
    one_picture_per_article = False

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100

    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
    filterDuplicates = True

    # Sections to collect for the Web edition.
    # Delete any you don't want, or use includeSections or excludeSections
    web_sections = [(u'World',u'world'),
                    (u'U.S.',u'national'),
                    (u'Politics',u'politics'),
                    (u'New York',u'nyregion'),
                    (u'Business','business'),
                    (u'Technology',u'technology'),
                    (u'Sports',u'sports'),
                    (u'Science',u'science'),
                    (u'Health',u'health'),
                    (u'Opinion',u'opinion'),
                    (u'Arts',u'arts'),
                    (u'Books',u'books'),
                    (u'Movies',u'movies'),
                    (u'Music',u'arts/music'),
                    (u'Television',u'arts/television'),
                    (u'Style',u'style'),
                    (u'Dining & Wine',u'dining'),
                    (u'Fashion & Style',u'fashion'),
                    (u'Home & Garden',u'garden'),
                    (u'Travel',u'travel'),
                    ('Education',u'education'),
                    ('Multimedia',u'multimedia'),
                    (u'Obituaries',u'obituaries'),
                    (u'Sunday Magazine',u'magazine'),
                    (u'Week in Review',u'weekinreview')]

    if headlinesOnly:
        title = 'New York Times Headlines'
        description = 'Headlines from the New York Times'
        needs_subscription = False
    elif webEdition:
        title = 'New York Times (Web)'
        description = 'New York Times on the Web'
        needs_subscription = True
    else:
        title = 'New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = True

    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']

    def decode_us_date(self,datestr):
        udate = datestr.strip().lower().split()
        try:
            m = self.month_list.index(udate[0])+1
        except:
            return date.today()
        d = int(udate[1])
        y = int(udate[2])
        try:
            d = date(y,m,d)
        except:
            d = date.today()
        return d

    earliest_date = date.today() - timedelta(days=oldest_article)

    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
@ -136,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
    .image {text-align: center;}
    .source {text-align: left; }'''

    articles = {}
    key = None
    ans = []
    url_list = []

    def filter_ans(self, ans):
        total_article_count = 0
        idx = 0
@ -164,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
        self.log( "Queued %d articles" % total_article_count )
        return ans

    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
        if not url.endswith(".html"):
            return True
        if 'nytimes.com' not in url:
            return True
        if 'podcast' in url:
            return True
        if '/video/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
            return True
        if '/interactive/' in url:
            return True
        if '/reference/' in url:
            return True
        if '/premium/' in url:
            return True
        return False

    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
@ -189,9 +282,9 @@ class NYTimes(BasicNewsRecipe):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID'] = self.username
            br['PASSWORD'] = self.password
            br.form = br.forms().next()
            br['userid'] = self.username
            br['password'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
@ -249,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        print "index_to_soup()"
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@ -273,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
        else:
            return description

    def parse_todays_index(self):
    def feed_title(self,div):
        return ''.join(div.findAll(text=True, recursive=True)).strip()

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}
        key = None
        ans = []
        url_list = []

        def handle_article(div):
            a = div.find('a', href=True)
            if not a:
    def handle_article(self,div):
        thumbnail = div.find('div','thumbnail')
        if thumbnail:
            thumbnail.extract()
        a = div.find('a', href=True)
        if not a:
            return
        url = re.sub(r'\?.*', '', a['href'])
        if self.exclude_url(url):
            return
        url += '?pagewanted=all'
        if self.filterDuplicates:
            if url in self.url_list:
                return
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            if '/video/' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            author = ''
        self.url_list.append(url)
        title = self.tag_to_string(a, use_alt=True).strip()
        description = ''
        pubdate = strftime('%a, %d %b')
        summary = div.find(True, attrs={'class':'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        author = ''
        authorAttribution = div.find(True, attrs={'class':'byline'})
        if authorAttribution:
            author = self.tag_to_string(authorAttribution, use_alt=False)
        else:
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
            else:
                authorAttribution = div.find(True, attrs={'class':'byline'})
                if authorAttribution:
                    author = self.tag_to_string(authorAttribution, use_alt=False)
            feed = key if key is not None else 'Uncategorized'
            if not articles.has_key(feed):
                ans.append(feed)
                articles[feed] = []
            articles[feed].append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author,
                     content=''))
        feed = self.key if self.key is not None else 'Uncategorized'
        if not self.articles.has_key(feed):
            self.ans.append(feed)
            self.articles[feed] = []
        self.articles[feed].append(
            dict(title=title, url=url, date=pubdate,
                 description=description, author=author,
                 content=''))


    def parse_web_edition(self):

        for (sec_title,index_url) in self.web_sections:
            if self.includeSections != []:
                if sec_title not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",sec_title
                    continue
            if sec_title in self.excludeSections:
                print "SECTION EXCLUDED: ",sec_title
                continue
            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
            self.key = sec_title
            # Find each article
            for div in soup.findAll(True,
                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                if div['class'] in ['story', 'story headline'] :
                    self.handle_article(div)
                elif div['class'] == 'headlinesOnly multiline flush':
                    for lidiv in div.findAll('li'):
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)


    def parse_todays_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        skipping = False
        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
                key = key.replace('Op-ed','Op-Ed')
                key = key.replace('U.s.','U.S.')
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed','Op-Ed')
                self.key = self.key.replace('U.s.','U.S.')
                self.key = self.key.replace('N.y.','N.Y.')
                skipping = False
                if self.includeSections != []:
                    if self.key not in self.includeSections:
                        print "SECTION NOT INCLUDED: ",self.key
                        skipping = True
                if self.key in self.excludeSections:
                    print "SECTION EXCLUDED: ",self.key
                    skipping = True

            elif div['class'] in ['story', 'story headline'] :
                handle_article(div)
                if not skipping:
                    self.handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)
                    if not skipping:
                        self.handle_article(lidiv)

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return self.filter_ans(ans)
        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_headline_index(self):

        articles = {}
        ans = []
        url_list = []

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
@ -363,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)

                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ",section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ",section_name
                        continue

                    section_name = string.capwords(section_name)
                    if section_name == 'U.s.':
                        section_name = 'U.S.'
                    elif section_name == 'Op-ed':
                        section_name = 'Op-Ed'
                    section_name = section_name.replace('Op-ed','Op-Ed')
                    section_name = section_name.replace('U.s.','U.S.')
                    section_name = section_name.replace('N.y.','N.Y.')
                    pubdate = strftime('%a, %d %b')

                    search_div = div_sec
@ -392,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if not url.startswith("http"):
                            continue
                        if not url.endswith(".html"):
                            continue
                        if 'podcast' in url:
                            continue
                        if 'video' in url:
                        if self.exclude_url(url):
                            continue
                        url += '?pagewanted=all'
                        if url in url_list:
                            continue
                        url_list.append(url)
                        self.log("URL %s" % url)
                        if self.filterDuplicates:
                            if url in self.url_list:
                                continue
                        self.url_list.append(url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not articles.has_key(section_name):
                            ans.append(section_name)
                            articles[section_name] = []
                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
                        if not self.articles.has_key(section_name):
                            self.ans.append(section_name)
                            self.articles[section_name] = []
                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return self.filter_ans(ans)
        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        elif self.webEdition:
            return self.parse_web_edition()
        else:
            return self.parse_todays_index()

@ -438,6 +561,21 @@ class NYTimes(BasicNewsRecipe):

    def preprocess_html(self, soup):

        if self.webEdition and (self.oldest_article > 0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag: # remove Op_Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
@ -462,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
            for inlineImg in inlineImgs[1:]:
                inlineImg.extract()
            # Move firstImg before article body
            #article_body = soup.find(True, {'id':'articleBody'})
            cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
            if cgFirst:
                # Strip all sibling NavigableStrings: noise
@ -548,4 +685,3 @@ class NYTimes(BasicNewsRecipe):
                divTag.replaceWith(tag)

        return soup

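Both NYTimes recipes in this sync (the headlines version above and the subscription edition that follows) funnel every candidate link through the new exclude_url() helper instead of repeating inline checks in each index parser. A standalone rendition of the filter, restructured as a loop over the excluded fragments; the URLs below are made up:

def exclude_url(url):
    # Keep only real nytimes.com article pages; drop podcasts, video,
    # slideshows, interactives and other non-article links.
    if not url.startswith("http") or not url.endswith(".html"):
        return True
    if 'nytimes.com' not in url:
        return True
    for frag in ('podcast', '/video/', '/slideshow/', '/magazine/index',
                 '/interactive/', '/reference/', '/premium/'):
        if frag in url:
            return True
    return False

print exclude_url('http://www.nytimes.com/2010/12/03/business/03pay.html')  # False: kept
print exclude_url('http://www.nytimes.com/video/2010/12/03/clip.html')      # True: skipped
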
@ -7,14 +7,22 @@ nytimes.com
|
||||
'''
|
||||
import re, string, time
|
||||
from calibre import entity_to_unicode, strftime
|
||||
from datetime import timedelta, date
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class NYTimes(BasicNewsRecipe):
|
||||
|
||||
# set headlinesOnly to True for the headlines-only version
|
||||
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
||||
headlinesOnly = False
|
||||
|
||||
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
|
||||
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
|
||||
# will be included. Note: oldest_article is ignored if webEdition = False
|
||||
webEdition = False
|
||||
oldest_article = 7
|
||||
|
||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||
# Otherwise, only the sections named will be included. For example,
|
||||
#
|
||||
@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
|
||||
# from an article (if one exists). If one_picture_per_article = True, the image
|
||||
# will be moved to a location between the headline and the byline.
|
||||
# If one_picture_per_article = False, all images from the article will be included
|
||||
|
||||
# and shown in their original location.
|
||||
one_picture_per_article = True
|
||||
one_picture_per_article = False
|
||||
|
||||
# The maximum number of articles that will be downloaded
|
||||
max_articles_per_feed = 100
|
||||
|
||||
# Whether to omit duplicates of articles (typically arsing when articles are indexed in
|
||||
# more than one section). If True, only the first occurance will be downloaded.
|
||||
filterDuplicates = True
|
||||
|
||||
# Sections to collect for the Web edition.
|
||||
# Delete any you don't want, or use includeSections or excludeSections
|
||||
web_sections = [(u'World',u'world'),
|
||||
(u'U.S.',u'national'),
|
||||
(u'Politics',u'politics'),
|
||||
(u'New York',u'nyregion'),
|
||||
(u'Business','business'),
|
||||
(u'Technology',u'technology'),
|
||||
(u'Sports',u'sports'),
|
||||
(u'Science',u'science'),
|
||||
(u'Health',u'health'),
|
||||
(u'Opinion',u'opinion'),
|
||||
(u'Arts',u'arts'),
|
||||
(u'Books',u'books'),
|
||||
(u'Movies',u'movies'),
|
||||
(u'Music',u'arts/music'),
|
||||
(u'Television',u'arts/television'),
|
||||
(u'Style',u'style'),
|
||||
(u'Dining & Wine',u'dining'),
|
||||
(u'Fashion & Style',u'fashion'),
|
||||
(u'Home & Garden',u'garden'),
|
||||
(u'Travel',u'travel'),
|
||||
('Education',u'education'),
|
||||
('Multimedia',u'multimedia'),
|
||||
(u'Obituaries',u'obituaries'),
|
||||
(u'Sunday Magazine',u'magazine'),
|
||||
(u'Week in Review',u'weekinreview')]
|
||||
|
||||
|
||||
if headlinesOnly:
|
||||
title='New York Times Headlines'
|
||||
description = 'Headlines from the New York Times'
|
||||
needs_subscription = False
|
||||
elif webEdition:
|
||||
title='New York Times (Web)'
|
||||
description = 'New York Times on the Web'
|
||||
needs_subscription = True
|
||||
else:
|
||||
title='New York Times'
|
||||
description = 'Today\'s New York Times'
|
||||
needs_subscription = True
|
||||
|
||||
|
||||
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
||||
|
||||
def decode_us_date(self,datestr):
|
||||
udate = datestr.strip().lower().split()
|
||||
try:
|
||||
m = self.month_list.index(udate[0])+1
|
||||
except:
|
||||
return date.today()
|
||||
d = int(udate[1])
|
||||
y = int(udate[2])
|
||||
try:
|
||||
d = date(y,m,d)
|
||||
except:
|
||||
d = date.today
|
||||
return d
|
||||
|
||||
earliest_date = date.today() - timedelta(days=oldest_article)
|
||||
|
||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||
language = 'en'
|
||||
@ -60,7 +124,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
|
||||
timefmt = ''
|
||||
needs_subscription = True
|
||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||
cover_margins = (18,18,'grey99')
|
||||
|
||||
@ -137,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
|
||||
.image {text-align: center;}
|
||||
.source {text-align: left; }'''
|
||||
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def filter_ans(self, ans) :
|
||||
total_article_count = 0
|
||||
idx = 0
|
||||
@ -165,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
|
||||
self.log( "Queued %d articles" % total_article_count )
|
||||
return ans
|
||||
|
||||
def exclude_url(self,url):
|
||||
if not url.startswith("http"):
|
||||
return True
|
||||
if not url.endswith(".html"):
|
||||
return True
|
||||
if 'nytimes.com' not in url:
|
||||
return True
|
||||
if 'podcast' in url:
|
||||
return True
|
||||
if '/video/' in url:
|
||||
return True
|
||||
if '/slideshow/' in url:
|
||||
return True
|
||||
if '/magazine/index' in url:
|
||||
return True
|
||||
if '/interactive/' in url:
|
||||
return True
|
||||
if '/reference/' in url:
|
||||
return True
|
||||
if '/premium/' in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
@ -190,9 +282,9 @@ class NYTimes(BasicNewsRecipe):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
if self.username is not None and self.password is not None:
|
||||
br.open('http://www.nytimes.com/auth/login')
|
||||
br.select_form(name='login')
|
||||
br['USERID'] = self.username
|
||||
br['PASSWORD'] = self.password
|
||||
br.form = br.forms().next()
|
||||
br['userid'] = self.username
|
||||
br['password'] = self.password
|
||||
raw = br.submit().read()
|
||||
if 'Please try again' in raw:
|
||||
raise Exception('Your username and password are incorrect')
|
||||
@ -250,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
return BeautifulSoup(_raw, markupMassage=massage)
|
||||
|
||||
# Entry point
|
||||
print "index_to_soup()"
|
||||
soup = get_the_soup( self.encoding, url_or_raw )
|
||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
||||
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
||||
@ -274,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
|
||||
else:
|
||||
return description
|
||||
|
||||
def parse_todays_index(self):
|
||||
def feed_title(self,div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
def feed_title(div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def handle_article(div):
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
def handle_article(self,div):
|
||||
thumbnail = div.find('div','thumbnail')
|
||||
if thumbnail:
|
||||
thumbnail.extract()
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if self.exclude_url(url):
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if not url.startswith("http"):
|
||||
return
|
||||
if not url.endswith(".html"):
|
||||
return
|
||||
if 'podcast' in url:
|
||||
return
|
||||
if '/video/' in url:
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if url in url_list:
|
||||
return
|
||||
url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
feed = key if key is not None else 'Uncategorized'
|
||||
if not articles.has_key(feed):
|
||||
ans.append(feed)
|
||||
articles[feed] = []
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
feed = self.key if self.key is not None else 'Uncategorized'
|
||||
if not self.articles.has_key(feed):
|
||||
self.ans.append(feed)
|
||||
self.articles[feed] = []
|
||||
self.articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
|
||||
|
||||
def parse_web_edition(self):
|
||||
|
||||
for (sec_title,index_url) in self.web_sections:
|
||||
if self.includeSections != []:
|
||||
if sec_title not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",sec_title
|
||||
continue
|
||||
if sec_title in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",sec_title
|
||||
continue
|
||||
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
|
||||
self.key = sec_title
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
if div['class'] in ['story', 'story headline'] :
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
self.handle_article(lidiv)
|
||||
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
|
||||
def parse_todays_index(self):
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
||||
|
||||
|
||||
skipping = False
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
|
||||
if div['class'] in ['section-headline','sectionHeader']:
|
||||
key = string.capwords(feed_title(div))
|
||||
key = key.replace('Op-ed','Op-Ed')
|
||||
key = key.replace('U.s.','U.S.')
|
||||
self.key = string.capwords(self.feed_title(div))
|
||||
self.key = self.key.replace('Op-ed','Op-Ed')
|
||||
self.key = self.key.replace('U.s.','U.S.')
|
||||
self.key = self.key.replace('N.y.','N.Y.')
|
||||
skipping = False
|
||||
if self.includeSections != []:
|
||||
if self.key not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",self.key
|
||||
skipping = True
|
||||
if self.key in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",self.key
|
||||
skipping = True
|
||||
|
||||
elif div['class'] in ['story', 'story headline'] :
|
||||
handle_article(div)
|
||||
if not skipping:
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
handle_article(lidiv)
|
||||
if not skipping:
|
||||
self.handle_article(lidiv)
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return self.filter_ans(ans)
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
def parse_headline_index(self):
|
||||
|
||||
articles = {}
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||
|
||||
# Fetch the content table
|
||||
@ -364,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
|
||||
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
|
||||
for div_sec in td_col.findAll('div',recursive=False):
|
||||
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
|
||||
|
||||
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
|
||||
section_name = re.sub(r'^ *$','',section_name)
|
||||
|
||||
if section_name == '':
|
||||
continue
|
||||
if self.includeSections != []:
|
||||
if section_name not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",section_name
|
||||
continue
|
||||
if section_name in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",section_name
|
||||
continue
|
||||
|
||||
section_name=string.capwords(section_name)
|
||||
if section_name == 'U.s.':
|
||||
section_name = 'U.S.'
|
||||
elif section_name == 'Op-ed':
|
||||
section_name = 'Op-Ed'
|
||||
section_name = section_name.replace('Op-ed','Op-Ed')
|
||||
section_name = section_name.replace('U.s.','U.S.')
|
||||
section_name = section_name.replace('N.y.','N.Y.')
|
||||
pubdate = strftime('%a, %d %b')
|
||||
|
||||
search_div = div_sec
|
||||
@@ -393,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
                         if not a:
                             continue
                         url = re.sub(r'\?.*', '', a['href'])
                         if not url.startswith("http"):
                             continue
                         if not url.endswith(".html"):
                             continue
-                        if 'podcast' in url:
-                            continue
-                        if 'video' in url:
+                        if self.exclude_url(url):
                             continue
                         url += '?pagewanted=all'
-                        if url in url_list:
-                            continue
-                        url_list.append(url)
-                        self.log("URL %s" % url)
+                        if self.filterDuplicates:
+                            if url in self.url_list:
+                                continue
+                        self.url_list.append(url)
                         title = self.tag_to_string(a, use_alt=True).strip()
                         desc = h3_item.find('p')
                         if desc is not None:
                             description = self.tag_to_string(desc,use_alt=False)
                         else:
                             description = ''
-                        if not articles.has_key(section_name):
-                            ans.append(section_name)
-                            articles[section_name] = []
-                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                        if not self.articles.has_key(section_name):
+                            self.ans.append(section_name)
+                            self.articles[section_name] = []
+                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)

     def parse_index(self):
         if self.headlinesOnly:
             return self.parse_headline_index()
+        elif self.webEdition:
+            return self.parse_web_edition()
         else:
             return self.parse_todays_index()
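The old per-parser podcast/video substring tests are folded into a single exclude_url() call, and the per-method url_list locals become a shared self.url_list guarded by the filterDuplicates option, so all three index parsers dedupe against the same list. A minimal sketch of what such a helper could look like (the real body lives elsewhere in the recipe; the fragments tested here are an assumption based on the checks this hunk replaces):

    def exclude_url(self, url):
        # hypothetical: skip non-article resources, mirroring the old
        # 'podcast'/'video' tests that this change consolidates
        return any(token in url for token in ('podcast', 'video'))
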
@@ -439,6 +561,21 @@ class NYTimes(BasicNewsRecipe):

     def preprocess_html(self, soup):

+        if self.webEdition & (self.oldest_article>0):
+            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+            if date_tag:
+                date_str = self.tag_to_string(date_tag,use_alt=False)
+                date_str = date_str.replace('Published:','')
+                date_items = date_str.split(',')
+                try:
+                    datestring = date_items[0]+' '+date_items[1]
+                    article_date = self.decode_us_date(datestring)
+                except:
+                    article_date = date.today()
+                if article_date < self.earliest_date:
+                    self.log("Skipping article dated %s" % date_str)
+                    return None
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
         if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
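The new webEdition branch drops articles older than oldest_article at preprocess time. decode_us_date is defined elsewhere in the recipe; a hypothetical equivalent for a dateline like "December 3, 2010" (reduced to "December 3 2010" by the split/join above, and assuming English month names):

    import time
    from datetime import date

    def decode_us_date_sketch(datestring):
        # e.g. 'December 3 2010' -> datetime.date(2010, 12, 3)
        parts = time.strptime(datestring.strip(), '%B %d %Y')
        return date(parts.tm_year, parts.tm_mon, parts.tm_mday)
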
@@ -463,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
                 for inlineImg in inlineImgs[1:]:
                     inlineImg.extract()
             # Move firstImg before article body
-            #article_body = soup.find(True, {'id':'articleBody'})
             cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
             if cgFirst:
                 # Strip all sibling NavigableStrings: noise

@@ -1,6 +1,6 @@

 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'

 '''
 www.nzz.ch
@@ -20,6 +20,19 @@ class Nzz(BasicNewsRecipe):
     encoding = 'utf-8'
     use_embedded_content = False
     language = 'de'
+    extra_css = """
+                   body{font-family: Georgia,"Times New Roman",Times,serif }
+                   .artikel h3,.artikel h4,.bildLegende,.question,.autor{font-family: Arial,Verdana,Helvetica,sans-serif}
+                   .bildLegende{font-size: small}
+                   .autor{font-size: 0.9375em; color: #666666}
+                   .quote{font-size: large !important;
+                          font-style: italic;
+                          font-weight: normal !important;
+                          border-bottom: 1px dotted #BFBFBF;
+                          border-top: 1px dotted #BFBFBF;
+                          line-height: 1.25em}
+                   .quelle{color: #666666; font-style: italic; white-space: nowrap}
+                """

     conversion_options = {
         'comments' : description
@@ -28,12 +41,14 @@ class Nzz(BasicNewsRecipe):
        ,'publisher' : publisher
     }

-    keep_only_tags = [dict(name='div', attrs={'class':'article'})]
-
+    keep_only_tags = [dict(name='div', attrs={'class':'zone'})]
+    remove_tags_before = dict(name='p', attrs={'class':'dachzeile'})
+    remove_tags_after=dict(name='p', attrs={'class':'fussnote'})
+    remove_attributes=['width','height','lang']
     remove_tags = [
-                     dict(name=['object','link','base'])
-                    ,dict(name='div',attrs={'class':['more','teaser','advXertXoriXals','legal']})
-                    ,dict(name='div',attrs={'id':['popup-src','readercomments','google-ad','advXertXoriXals']})
+                     dict(name=['object','link','base','meta','iframe'])
+                    ,dict(attrs={'id':'content_rectangle_1'})
+                    ,dict(attrs={'class':['weiterfuehrendeLinks','fussnote','video']})
                   ]

     feeds = [
@@ -50,7 +65,7 @@ class Nzz(BasicNewsRecipe):
        ,(u'Reisen' , u'http://www.nzz.ch/magazin/reisen?rss=true')
     ]

     def print_version(self, url):
         return url + '?printview=true'

+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        return self.adeify_images(soup)
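The added preprocess_html strips inline style attributes so that the new extra_css, rather than the site's inline styles, controls rendering, then hands the soup to adeify_images(), the BasicNewsRecipe helper that rewrites image markup for Adobe Digital Editions compatibility. The same stripping in isolation, as a sketch assuming BeautifulSoup 3 (what recipes of this vintage use):

    from BeautifulSoup import BeautifulSoup

    html = '<div class="zone"><p style="color:red">Beispiel</p></div>'
    soup = BeautifulSoup(html)
    for item in soup.findAll(style=True):
        del item['style']
    print(soup)   # -> <div class="zone"><p>Beispiel</p></div>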