0.7.30 updates
110  Changelog.yaml
@@ -4,6 +4,108 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.

- version: 0.7.30
  date: 2010-11-26

  new features:
    - title: "Support for Acer Lumiread and PocketBook Pro 602"

    - title: "When importing by ISBN, also allow the specification of a file to be imported."
      tickets: [7400]

    - title: "E-mail sending: E-mail sends are now regular jobs that can be accessed from the jobs list. Also, when sending via gmail/hotmail, send at most one email every five minutes to avoid triggering their spam controls. Failed sends are now retried once, automatically."

    - title: "Content server: When a category contains only one item, go directly to the book list instead of forcing the user to click on that one item"

    - title: "E-mail sending: Allow unencrypted connections to the SMTP relay"

    - title: "Improve startup times for large libraries by caching the has_cover check"

    - title: "Update the Windows binary build to use Python 2.7"

    - title: "Metadata and cover download plugins from Nicebooks (disabled by default)"

  bug fixes:
    - title: "MOBI Input: Fix bug in cleanup regex that broke parsing of escaped XML declarations."
      tickets: [7585]

    - title: "Content server: Fix bug when the user has custom categories/columns with non-ASCII names"
      tickets: [7590]

    - title: "RTF Output: Handle non-breaking spaces correctly"
      tickets: [7668]

    - title: "Conversion pipeline: When rasterizing SVG images, work around incorrect handling of percentage height specifications in QSvgRenderer."
      tickets: [7598]

    - title: "News download: Update the version of feedparser used to parse RSS feeds."
      tickets: [7674]

    - title: "Tag Browser: Allow the user to restore hidden categories via a right click even if all categories have been hidden"

    - title: "TXT/RTF Output: Handle XML processing instructions embedded in content correctly."
      tickets: [7644]

    - title: "MOBI Input: Workarounds for lack of nesting rules between block and inline tags"
      tickets: [7618]

    - title: "E-book viewer: Load all hyphenation patterns to support multi-lingual books"

    - title: "E-book viewer: Fix incorrect language names being used in hyphenation"

    - title: "Check that the result file from a conversion is not empty before adding it. This protects against the case where the conversion process crashes and the GUI would add a zero-byte file to the book record"

    - title: "E-book viewer: More sophisticated algorithm to resize images to fit the viewer window. Should preserve aspect ratio in more cases"

    - title: "Remove unnecessary calls to set_path when creating book records. Speeds up record creation by about 30% on my system"

    - title: "Speed up BibTeX catalog generation."

    - title: "Kobo driver: Fix missing table in the book deletion process for Kobo WiFi and Kobo-O 1.8 Beta"

    - title: "RTF Input: Preserve scene breaks in the form of empty paragraphs. Preprocessing: Improvements to chapter detection"

    - title: "Fix custom recipes not being sorted by title"
      tickets: [7486]

    - title: "Kobo driver: Fix bug in managing the Im_Reading category on Windows"

  improved recipes:
    - "El Pais - Uruguay"
    - Argentinian La Nacion
    - comics.com
    - Mingpao
    - Revista Muy Interesante
    - Telepolis
    - New York Times

  new recipes:
    - title: "Bangkok Biz News and Matichon"
      author: "Anat Ruangrassamee"

    - title: "The Workingham Times and Deutsche Welle"
      author: "Darko Miletic"

    - title: "Biz Portal"
      author: "marbs"

    - title: "Various Japanese news sources"
      author: "Hiroshi Miura"

    - title: "Arcamax"
      author: "Starson17"

    - title: "Various Spanish news sources"
      author: "Gustavo Azambuja"

    - title: "TSN"
      author: Nexus

    - title: "Zeit Online Premium"
      author: Steffen Siebert


- version: 0.7.29
  date: 2010-11-19

@@ -79,14 +181,6 @@
    - title: "Press releases of the German government and EU Commission"
      author: "malfi"

    - title: "Dnevnik, Siol.net, MMC-RTV and Avto-magazon"
      author: "BlonG"

    - title: "SC Print Magazine"
      author: "Tony Maro"

    - title: "Diario Sport"
      author: "Jefferson Frantz"

- version: 0.7.28
  date: 2010-11-12

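The "E-mail sending" entry above describes queued, throttled sends: at most one message every five minutes to gmail/hotmail, with one automatic retry on failure. Calibre's actual implementation lives in its job system and is not part of this diff; the following is only a minimal standalone sketch of that throttling pattern, with a hypothetical send_email() callable supplied by the caller.

```python
import time
from collections import deque

THROTTLED_HOSTS = ('smtp.gmail.com', 'smtp.live.com')  # assumption, for illustration only
MIN_INTERVAL = 5 * 60  # seconds between sends to throttled relays

def run_email_queue(jobs, send_email):
    """Send queued (recipient, attachment, relay) jobs, retrying each failure once.

    `send_email` is a hypothetical callable; this is not calibre's job scheduler.
    """
    queue = deque(jobs)
    last_sent = 0.0
    while queue:
        recipient, attachment, relay = queue.popleft()
        if relay in THROTTLED_HOSTS:
            wait = MIN_INTERVAL - (time.time() - last_sent)
            if wait > 0:
                time.sleep(wait)  # space out sends to avoid tripping spam controls
        try:
            send_email(recipient, attachment, relay)
        except Exception:
            try:
                send_email(recipient, attachment, relay)  # one automatic retry
            except Exception:
                pass  # a real implementation would surface the failure in the jobs list
        last_sent = time.time()
```
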
@@ -38,6 +38,7 @@ Monocle.Browser.on = {
  iPad: navigator.userAgent.indexOf("iPad") != -1,
  BlackBerry: navigator.userAgent.indexOf("BlackBerry") != -1,
  Android: navigator.userAgent.indexOf('Android') != -1,
  MacOSX: navigator.userAgent.indexOf('Mac OS X') != -1,
  Kindle3: navigator.userAgent.match(/Kindle\/3/)
}

@@ -162,12 +163,23 @@ Monocle.Browser.has.transform3d = Monocle.Browser.CSSProps.isSupported([
  'OPerspective',
  'msPerspective'
]) && Monocle.Browser.CSSProps.supportsMediaQueryProperty('transform-3d');
Monocle.Browser.has.embedded = (top != self);

Monocle.Browser.has.iframeTouchBug = Monocle.Browser.iOSVersionBelow("4.2");

Monocle.Browser.has.selectThruBug = Monocle.Browser.iOSVersionBelow("4.2");

Monocle.Browser.has.mustScrollSheaf = Monocle.Browser.is.MobileSafari;
Monocle.Browser.has.iframeDoubleWidthBug = Monocle.Browser.has.mustScrollSheaf;

Monocle.Browser.has.floatColumnBug = Monocle.Browser.is.WebKit;

Monocle.Browser.has.relativeIframeWidthBug = Monocle.Browser.on.Android;

Monocle.Browser.has.jumpFlickerBug =
  Monocle.Browser.on.MacOSX && Monocle.Browser.is.WebKit;

if (typeof window.console == "undefined") {
  window.console = {

@@ -1091,11 +1103,29 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
      cmpt.dom.setStyles(Monocle.Styles.component);
      Monocle.Styles.applyRules(cmpt.contentDocument.body, Monocle.Styles.body);
    }
    lockFrameWidths();
    dom.find('overlay').dom.setStyles(Monocle.Styles.overlay);
    dispatchEvent('monocle:styles');
  }


  function lockingFrameWidths() {
    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
      cmpt.style.display = "none";
    }
  }


  function lockFrameWidths() {
    if (!Monocle.Browser.has.relativeIframeWidthBug) { return; }
    for (var i = 0, cmpt; cmpt = dom.find('component', i); ++i) {
      cmpt.style.width = cmpt.parentNode.offsetWidth+"px";
      cmpt.style.display = "block";
    }
  }


  function setBook(bk, place, callback) {
    p.book = bk;
    var pageCount = 0;

@@ -1121,12 +1151,14 @@ Monocle.Reader = function (node, bookData, options, onLoadCallback) {
    if (!p.initialized) {
      console.warn('Attempt to resize book before initialization.');
    }
    lockingFrameWidths();
    if (!dispatchEvent("monocle:resizing", {}, true)) {
      return;
    }
    clearTimeout(p.resizeTimer);
    p.resizeTimer = setTimeout(
      function () {
        lockFrameWidths();
        p.flipper.moveTo({ page: pageNumber() });
        dispatchEvent("monocle:resize");
      },

@@ -1765,12 +1797,7 @@ Monocle.Book = function (dataSource) {


  function componentIdMatching(str) {
    for (var i = 0; i < p.componentIds.length; ++i) {
      if (str.indexOf(p.componentIds[i]) > -1) {
        return p.componentIds[i];
      }
    }
    return null;
    return p.componentIds.indexOf(str) >= 0 ? str : null;
  }


@@ -2018,6 +2045,12 @@ Monocle.Component = function (book, id, index, chapters, source) {


  function loadFrameFromURL(url, frame, callback) {
    if (!url.match(/^\//)) {
      var link = document.createElement('a');
      link.setAttribute('href', url);
      url = link.href;
      delete(link);
    }
    frame.onload = function () {
      frame.onload = null;
      Monocle.defer(callback);

@@ -2460,7 +2493,7 @@ Monocle.Flippers.Legacy = function (reader) {
  function moveTo(locus, callback) {
    var fn = frameToLocus;
    if (typeof callback == "function") {
      fn = function () { frameToLocus(); callback(); }
      fn = function (locus) { frameToLocus(locus); callback(locus); }
    }
    p.reader.getBook().setOrLoadPageAt(page(), locus, fn);
  }

@@ -2794,7 +2827,9 @@ Monocle.Dimensions.Columns = function (pageDiv) {
  function scrollerWidth() {
    var bdy = p.page.m.activeFrame.contentDocument.body;
    if (Monocle.Browser.has.iframeDoubleWidthBug) {
      if (Monocle.Browser.iOSVersion < "4.1") {
      if (Monocle.Browser.on.Android) {
        return bdy.scrollWidth * 1.5; // I actually have no idea why 1.5.
      } else if (Monocle.Browser.iOSVersion < "4.1") {
        var hbw = bdy.scrollWidth / 2;
        var sew = scrollerElement().scrollWidth;
        return Math.max(sew, hbw);

@@ -2969,6 +3004,7 @@ Monocle.Flippers.Slider = function (reader) {


  function setPage(pageDiv, locus, callback) {
    ensureWaitControl();
    p.reader.getBook().setOrLoadPageAt(
      pageDiv,
      locus,

@@ -3048,6 +3084,7 @@ Monocle.Flippers.Slider = function (reader) {
    checkPoint(boxPointX);

    p.turnData.releasing = true;
    showWaitControl(lowerPage());

    if (dir == k.FORWARDS) {
      if (

@@ -3088,14 +3125,18 @@ Monocle.Flippers.Slider = function (reader) {


  function onGoingBackward(x) {
    var lp = lowerPage();
    var lp = lowerPage(), up = upperPage();
    showWaitControl(up);
    jumpOut(lp, // move lower page off-screen
      function () {
        flipPages(); // flip lower to upper
        setPage( // set upper page to previous
          lp,
          getPlace(lowerPage()).getLocus({ direction: k.BACKWARDS }),
          function () { lifted(x); }
          function () {
            lifted(x);
            hideWaitControl(up);
          }
        );
      }
    );

@@ -3103,8 +3144,10 @@ Monocle.Flippers.Slider = function (reader) {


  function afterGoingForward() {
    var up = upperPage();
    var up = upperPage(), lp = lowerPage();
    if (p.interactive) {
      showWaitControl(up);
      showWaitControl(lp);
      setPage( // set upper (off screen) to current
        up,
        getPlace().getLocus({ direction: k.FORWARDS }),

@@ -3113,6 +3156,7 @@ Monocle.Flippers.Slider = function (reader) {
        }
      );
    } else {
      showWaitControl(lp);
      flipPages();
      jumpIn(up, function () { prepareNextPage(announceTurn); });
    }

@@ -3171,6 +3215,8 @@ Monocle.Flippers.Slider = function (reader) {


  function announceTurn() {
    hideWaitControl(upperPage());
    hideWaitControl(lowerPage());
    p.reader.dispatchEvent('monocle:turn');
    resetTurnData();
  }

@@ -3319,12 +3365,14 @@ Monocle.Flippers.Slider = function (reader) {


  function jumpIn(pageDiv, callback) {
    setX(pageDiv, 0, { duration: 1 }, callback);
    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
    setX(pageDiv, 0, { duration: dur }, callback);
  }


  function jumpOut(pageDiv, callback) {
    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: 1 }, callback);
    var dur = Monocle.Browser.has.jumpFlickerBug ? 1 : 0;
    setX(pageDiv, 0 - pageDiv.offsetWidth, { duration: dur }, callback);
  }


@@ -3357,6 +3405,28 @@ Monocle.Flippers.Slider = function (reader) {
  }


  function ensureWaitControl() {
    if (p.waitControl) { return; }
    p.waitControl = {
      createControlElements: function (holder) {
        return holder.dom.make('div', 'flippers_slider_wait');
      }
    }
    p.reader.addControl(p.waitControl, 'page');
  }


  function showWaitControl(page) {
    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
    ctrl.style.opacity = 0.5;
  }


  function hideWaitControl(page) {
    var ctrl = p.reader.dom.find('flippers_slider_wait', page.m.pageIndex);
    ctrl.style.opacity = 0;
  }

  API.pageCount = p.pageCount;
  API.addPage = addPage;
  API.getPlace = getPlace;

BIN  resources/images/news/cnetjapan.png (new file, 892 B)
BIN  resources/images/news/deutsche_welle_bs.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_en.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_es.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_hr.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_pt.png (new file, 445 B)
BIN  resources/images/news/deutsche_welle_sr.png (new file, 445 B)
BIN  resources/images/news/endgadget_ja.png (new file, 698 B)
BIN  resources/images/news/jijinews.png (new file, 919 B)
BIN  resources/images/news/msnsankei.png (new file, 543 B)
BIN  resources/images/news/nikkei_free.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_economy.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_industory.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_life.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_main.png (new file, 948 B)
BIN  resources/images/news/nikkei_sub_sports.png (new file, 948 B)
BIN  resources/images/news/reuters.png (new file, 693 B)
BIN  resources/images/news/reuters_ja.png (new file, 693 B)
BIN  resources/images/news/the_workingham_times.png (new file, 1011 B)
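The recipe files added below all follow the same BasicNewsRecipe pattern: class attributes describe the source and the cleanup rules, and a few optional hooks (print_version, preprocess_html, get_cover_url, parse_index) customize fetching. A minimal sketch of that shared shape, with placeholder names and feed URLs rather than any real source, for orientation before the individual diffs:

```python
from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    # Metadata used by calibre's scheduler and conversion pipeline.
    title          = 'Example News'        # placeholder, not a real source
    __author__     = 'Example Author'
    language       = 'en'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # Keep only the article body; drop navigation, forms and embedded objects.
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]   # placeholder selector
    remove_tags    = [dict(name=['object', 'link', 'form'])]

    feeds = [(u'Headlines', u'http://example.com/rss.xml')]        # placeholder feed

    def print_version(self, url):
        # Many of the recipes below rewrite article URLs to a printer-friendly page.
        return url + '?print=1'

    def preprocess_html(self, soup):
        # Strip inline styles, as several of the new recipes do.
        for item in soup.findAll(style=True):
            del item['style']
        return soup
```
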
50  resources/recipes/180.recipe (new file)
@@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
180.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = '180.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(name='div', attrs={'class':'tef-md tef-md-seccion-sociedad'})]
    remove_tags = [
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''
    feeds = [
        (u'Titulares', u'http://www.180.com.uy/feed.php')
    ]

    def get_cover_url(self):
        return 'http://www.180.com.uy/tplef/img/logo.gif'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

110  resources/recipes/arcamax.recipe (new file)
@@ -0,0 +1,110 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Arcamax(BasicNewsRecipe):
    title = 'Arcamax'
    __author__ = 'Starson17'
    __version__ = '1.03'
    __date__ = '25 November 2010'
    description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category = 'news, comics'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    num_comics_to_get = 7
    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS

    conversion_options = {'linearize_tables' : True
                          , 'comment'  : description
                          , 'tags'     : category
                          , 'language' : language
                          }

    keep_only_tags = [dict(name='div', attrs={'class':['toon']}),
                      ]

    def parse_index(self):
        feeds = []
        for title, url in [
            ######## COMICS - GENERAL ########
            #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
            #(u"Agnes", u"http://www.arcamax.com/agnes"),
            #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
            (u"BC", u"http://www.arcamax.com/bc"),
            #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
            #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
            (u"Blondie", u"http://www.arcamax.com/blondie"),
            #u"Boondocks", u"http://www.arcamax.com/boondocks"),
            #(u"Cathy", u"http://www.arcamax.com/cathy"),
            #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
            (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
            (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
            #(u"Dustin", u"http://www.arcamax.com/dustin"),
            (u"Family Circus", u"http://www.arcamax.com/familycircus"),
            (u"Garfield", u"http://www.arcamax.com/garfield"),
            #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
            #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
            #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
            #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
            #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
            #(u"Luann", u"http://www.arcamax.com/luann"),
            #(u"Momma", u"http://www.arcamax.com/momma"),
            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
            (u"Mutts", u"http://www.arcamax.com/mutts"),
            #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
            #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
            #(u"Pickles", u"http://www.arcamax.com/pickles"),
            #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
            #(u"Rubes", u"http://www.arcamax.com/rubes"),
            #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
            (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
            (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
            (u"Zits", u"http://www.arcamax.com/zits"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        pages = range(1, self.num_comics_to_get+1)
        for page in pages:
            page_soup = self.index_to_soup(url)
            if page_soup:
                title = page_soup.find(name='div', attrs={'class':'toon'}).p.img['alt']
                page_url = url
                prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'next'}, text='Previous').parent['href']
                current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
                url = prev_page_url
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        main_comic = soup.find('p',attrs={'class':'m0'})
        if main_comic.a['target'] == '_blank':
            main_comic.a.img['id'] = 'main_comic'
        return soup

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        img#main_comic {max-width:100%; min-width:100%;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
        '''

@@ -13,6 +13,7 @@ class Dnevnik(BasicNewsRecipe):
    labguage = 'sl'
    no_stylesheets = True
    use_embedded_content = False
    language = 'sl'

    conversion_options = {'linearize_tables' : True}

25  resources/recipes/bangkok_biz.recipe (new file)
@@ -0,0 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290689337(BasicNewsRecipe):
    __author__ = 'Anat R.'
    language = 'th'
    title = u'Bangkok Biz News'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    feeds = [(u'Headlines',
              u'http://www.bangkokbiznews.com/home/services/rss/home.xml'),
             (u'Politics', u'http://www.bangkokbiznews.com/home/services/rss/politics.xml'),
             (u'Business', u'http://www.bangkokbiznews.com/home/services/rss/business.xml'),
             (u'Finance', u' http://www.bangkokbiznews.com/home/services/rss/finance.xml'),
             (u'Technology', u' http://www.bangkokbiznews.com/home/services/rss/it.xml')]
    remove_tags_before = dict(name='div', attrs={'class':'box-Detailcontent'})
    remove_tags_after = dict(name='p', attrs={'class':'allTags'})
    remove_tags = []
    remove_tags.append(dict(name = 'div', attrs = {'id': 'content-tools'}))
    remove_tags.append(dict(name = 'p', attrs = {'class':'allTags'}))
    remove_tags.append(dict(name = 'div', attrs = {'id':'morePic'}))
    remove_tags.append(dict(name = 'ul', attrs = {'class':'tabs-nav'}))

58  resources/recipes/bitacora.recipe (new file)
@@ -0,0 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
bitacora.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'bitacora.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'iso-8859-1'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['txt'])]
    remove_tags = [
        dict(name='div', attrs={'class':'tablafoot'}),
        dict(name=['object','h4']),
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''
    feeds = [
        (u'Titulares', u'http://www.bitacora.com.uy/anxml.cgi?15')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.bitacora.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('img',attrs={'class':'imgtapa'})
        if link_item:
            cover_url = "http://www.bitacora.com.uy/"+link_item['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

40  resources/recipes/biz_portal.recipe (new file)
@@ -0,0 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'This is a recipe of BizPortal.co.il.'
    cover_url = 'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'
    title = u'BizPortal'
    language = 'he'
    __author__ = 'marbs'
    extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100
    remove_attributes = ['width']
    simultaneous_downloads = 5
    # keep_only_tags =dict(name='div', attrs={'id':'articleContainer'})
    remove_tags = [dict(name='img', attrs={'scr':['images/bizlogo_nl.gif']})]
    max_articles_per_feed = 100
    #preprocess_regexps = [
    #    (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: '')
    #    ]


    feeds = [(u'חדשות שוק ההון', u'http://www.bizportal.co.il/shukhahon/messRssUTF2.xml'),
             (u'חדשות וול סטריט בעברית', u'http://www.bizportal.co.il/shukhahon/images/bizportal.jpg'),
             (u'שיווק ופרסום', u'http://www.bizportal.co.il/shukhahon/messRssUTF145.xml'),
             (u'משפט', u'http://www.bizportal.co.il/shukhahon/messRssUTF3.xml'),
             (u'ניתוח טכני', u'http://www.bizportal.co.il/shukhahon/messRssUTF5.xml'),
             (u'דיני עבודה ושכר', u'http://www.bizportal.co.il/shukhahon/messRssUTF6.xml'),
             (u'מיסוי', u'http://www.bizportal.co.il/shukhahon/messRssUTF7.xml'),
             (u'טאבו', u'http://www.bizportal.co.il/shukhahon/messRssUTF8.xml'),
             (u'נדל"ן', u'http://www.bizportal.co.il/shukhahon/messRssUTF160.xml'),
             ]

    def print_version(self, url):
        split1 = url.split("=")
        print_url = 'http://www.bizportal.co.il/web/webnew/shukhahon/biznews02print.shtml?mid=' + split1[1]
        return print_url

@@ -1,18 +1,22 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 mode: python -*-

# Find the newest version of this recipe here:
# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe

__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
__version__ = '0.95'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__ = '0.96'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe


class BrandEins(BasicNewsRecipe):

    title = u'Brand Eins'
    title = u'brand eins'
    __author__ = 'Constantin Hofstetter'
    description = u'Wirtschaftsmagazin'
    publisher ='brandeins.de'
@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de'
    publication_type = 'magazine'
    needs_subscription = 'optional'

    # 2 is the last full magazine (default)
    # 1 is the newest (but not full)
    # 3 is one before 2 etc.
    which_ausgabe = 2
    # This value can be set via the username field.
    default_issue = 2

    keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):

        return soup

    def get_cover(self, soup):
        cover_url = None
        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
        if cover_item:
            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
        return cover_url

    def parse_index(self):
        feeds = []

        archive = "http://www.brandeins.de/archiv.html"

        issue = self.default_issue
        if self.username:
            try:
                issue = int(self.username)
            except:
                pass

        soup = self.index_to_soup(archive)
        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
        url = pre_latest_issue.get('href', False)
        # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
        # Get month and year of the magazine issue - build it out of the title of the cover
        self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
        url = 'http://brandeins.de/'+url

        # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):

    def brand_eins_parse_latest_issue(self, url):
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

        titles_and_articles = []
@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
            current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
        titles_and_articles.append([chapter_title, current_articles])
        return titles_and_articles

@@ -11,7 +11,6 @@ class AdvancedUserRecipe1275798572(BasicNewsRecipe):
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en'
    masthead_url = 'http://www.cbc.ca/includes/gfx/cbcnews_logo_09.gif'
    cover_url = 'http://img692.imageshack.us/img692/2814/cbc.png'
    keep_only_tags = [dict(name='div', attrs={'id':['storyhead','storybody']})]

32  resources/recipes/cnetjapan.recipe (new file)
@@ -0,0 +1,32 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class CNetJapan(BasicNewsRecipe):
    title = u'CNET Japan'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [(u'cnet rss', u'http://feeds.japan.cnet.com/cnet/rss')]
    language = 'ja'
    encoding = 'Shift_JIS'
    remove_javascript = True

    preprocess_regexps = [
        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
         lambda match: '</body>'),
        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
         lambda match: '<!-- removed -->'),
    ]

    remove_tags_before = dict(name="h2")
    remove_tags = [
        {'class':"social_bkm_share"},
        {'class':"social_bkm_print"},
        {'class':"block20 clearfix"},
        dict(name="div",attrs={'id':'bookreview'}),
    ]
    remove_tags_after = {'class':"block20"}

@@ -347,6 +347,7 @@ class Comics(BasicNewsRecipe):
                title = strip_tag['title']
                print 'title: ', title
                current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
        current_articles.reverse()
        return current_articles

    extra_css = '''

69  resources/recipes/cosmopolitan.recipe (new file)
@@ -0,0 +1,69 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
Muy Interesante
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'Cosmopolitan'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Cosmopolitan, Edicion Espanola'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='td', attrs={'class':['contentheading', 'txt_articulo']})
    ]
    remove_tags = [
        dict(name='div', attrs={'class':['breadcrumb', 'bloque1', 'article', 'bajo_title', 'tags_articles', 'otrosenlaces_title', 'otrosenlaces_parent', 'compartir']}),
        dict(name='div', attrs={'id':'comment'}),
        dict(name='table', attrs={'class':'pagenav'}),
        dict(name=['object','link'])
    ]
    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''
    feeds = [
        (u'Articulos', u'http://feeds.feedburner.com/cosmohispano/FSSt')
    ]

    def preprocess_html(self, soup):
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        index = 'http://www.cosmohispano.com/revista'
        soup = self.index_to_soup(index)
        link_item = soup.find('img',attrs={'class':'img_portada'})
        if link_item:
            cover_url = "http://www.cosmohispano.com"+link_item['src']
        return cover_url

76  resources/recipes/deutsche_welle_bs.recipe (new file)
@@ -0,0 +1,76 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_bs(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vijesti iz Njemacke i svijeta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'bs'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'         , u'http://rss.dw-world.de/rdf/rss-bos-pol')
        ,(u'Evropa'           , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
        ,(u'Kiosk'            , u'http://rss.dw-world.de/rdf/rss-bos-eu' )
        ,(u'Ekonomija i Nuka' , u'http://rss.dw-world.de/rdf/rss-bos-eco')
        ,(u'Kultura'          , u'http://rss.dw-world.de/rdf/rss-bos-cul')
        ,(u'Sport'            , u'http://rss.dw-world.de/rdf/rss-bos-sp' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

66  resources/recipes/deutsche_welle_en.recipe (new file)
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_en(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'News from Germany and World'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'en'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'All news', u'http://rss.dw-world.de/rdf/rss-en-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

66  resources/recipes/deutsche_welle_es.recipe (new file)
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_es(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Alemania y mundo'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'es'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-sp-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

74  resources/recipes/deutsche_welle_hr.recipe (new file)
@@ -0,0 +1,74 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_hr(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vesti iz Njemacke i svijeta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'hr'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Svijet'    , u'http://rss.dw-world.de/rdf/rss-cro-svijet')
        ,(u'Europa'    , u'http://rss.dw-world.de/rdf/rss-cro-eu' )
        ,(u'Njemacka'  , u'http://rss.dw-world.de/rdf/rss-cro-ger' )
        ,(u'Vijesti'   , u'http://rss.dw-world.de/rdf/rss-cro-all' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

66  resources/recipes/deutsche_welle_pt.recipe (new file)
@@ -0,0 +1,66 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_pt(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Alemania y mundo'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'pt'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        body{font-family: Arial,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [(u'Noticias', u'http://rss.dw-world.de/rdf/rss-br-all')]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

79  resources/recipes/deutsche_welle_sr.recipe (new file)
@@ -0,0 +1,79 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
dw-world.de
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class DeutscheWelle_sr(BasicNewsRecipe):
    title = 'Deutsche Welle'
    __author__ = 'Darko Miletic'
    description = 'Vesti iz Nemacke i sveta'
    publisher = 'Deutsche Welle'
    category = 'news, politics, Germany'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'sr'
    publication_type = 'newsportal'
    remove_empty_feeds = True
    masthead_url = 'http://www.dw-world.de/skins/std/channel1/pics/dw_logo1024.gif'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,sans1,sans-serif}
        img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
        .caption{font-size: x-small; display: block; margin-bottom: 0.4em}
        """
    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    conversion_options = {
        'comment'    : description
        , 'tags'     : category
        , 'publisher': publisher
        , 'language' : language
    }

    remove_tags = [
        dict(name=['iframe','embed','object','form','base','meta','link'])
        ,dict(attrs={'class':'actionFooter'})
    ]
    keep_only_tags=[dict(attrs={'class':'ArticleDetail detail'})]
    remove_attributes = ['height','width','onclick','border','lang']

    feeds = [
        (u'Politika'       , u'http://rss.dw-world.de/rdf/rss-ser-pol' )
        ,(u'Srbija'         , u'http://rss.dw-world.de/rdf/rss-ser-pol-ser' )
        ,(u'Region'         , u'http://rss.dw-world.de/rdf/rss-ser-pol-region' )
        ,(u'Evropa'         , u'http://rss.dw-world.de/rdf/rss-ser-pol-eu' )
        ,(u'Nemacka'        , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger' )
        ,(u'Svet'           , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger' )
        ,(u'Pregled stampe' , u'http://rss.dw-world.de/rdf/rss-ser-pol-ger')
        ,(u'Nauka Tehnika Medicina', u'http://rss.dw-world.de/rdf/rss-ser-science')
        ,(u'Kultura'        , u'feed:http://rss.dw-world.de/rdf/rss-ser-cul' )
    ]

    def print_version(self, url):
        artl = url.rpartition('/')[2]
        return 'http://www.dw-world.de/popups/popup_printcontent/' + artl

    def preprocess_html(self, soup):
        for item in soup.findAll('a'):
            limg = item.find('img')
            if item.string is not None:
                str = item.string
                item.replaceWith(str)
            else:
                if limg:
                    item.name = 'div'
                    del item['href']
                    if item.has_key('target'):
                        del item['target']
                else:
                    str = self.tag_to_string(item)
                    item.replaceWith(str)
        return soup

80  resources/recipes/el_pais_uy.recipe (new file)
@@ -0,0 +1,80 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.elpais.com.uy/
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'El Pais - Uruguay'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay y el resto del mundo'
    publisher = 'EL PAIS S.A.'
    category = 'news, politics, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 2
    encoding = 'iso-8859-1'
    masthead_url = 'http://www.elpais.com.uy/Images/09/cabezal/logo_PDEP.png'
    publication_type = 'newspaper'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 200
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'id':'Contenido'})
    ]

    conversion_options = {
        'comment'     : description
        , 'tags'      : category
        , 'publisher' : publisher
        , 'language'  : language
    }
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object','form']),
        dict(name=['object','table']) ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
        body{font-family: Verdana,Arial,Helvetica,sans-serif }
        img{margin-bottom: 0.4em; display:block;}
        '''
    feeds = [
        (u'Ultimo Momento', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=umomento'),
        (u'Editorial', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=editorial'),
        (u'Nacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=nacional'),
        (u'Internacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=internacional'),
        (u'Espectaculos', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=espectaculos'),
        (u'Deportes', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=deportes'),
        (u'Ciudades', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=ciudades'),
        (u'Economia', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=economia')
    ]

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.elpais.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('div',attrs={'class':'boxmedio box257'})
        print link_item
        if link_item:
            cover_url = 'http://www.elpais.com.uy'+link_item.img['src']
        return cover_url

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

22  resources/recipes/endgadget_ja.recipe (new file)
@@ -0,0 +1,22 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
japan.engadget.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class EndgadgetJapan(BasicNewsRecipe):
    title = u'Endgadget\u65e5\u672c\u7248'
    language = 'ja'
    __author__ = 'Hiroshi Miura'
    cover_url = 'http://skins18.wincustomize.com/1/49/149320/29/7578/preview-29-7578.jpg'
    masthead_url = 'http://www.blogsmithmedia.com/japanese.engadget.com/media/eng-jp-logo-t.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'ja'
    encoding = 'utf-8'
    feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]

100  resources/recipes/freeway.recipe (new file)
@@ -0,0 +1,100 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://freeway.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class':'titulo_art_ppal'}),
        dict(name='img', attrs={'class':'recuadro'}),
        dict(name='td', attrs={'class':'txt_art_ppal'})
    ]
    remove_tags = [
        dict(name=['object','link'])
    ]
    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
        '''

    def parse_index(self):
        feeds = []
        for title, url in [('Articulos', 'http://freeway.com.uy/revista/')]:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def art_parse_section(self, url):
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'id': 'tbl_1'})

        current_articles = []
        for tag in div.findAllNext(attrs = {'class': 'ancho_articulos'}):
            if tag.get('class') == 'link-list-heading':
                break
            for td in tag.findAll('td'):
                a = td.find('a', attrs= {'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://freeway.com.uy'+url
                p = td.find('p', attrs= {'class': 'txt_articulos'})
                description = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', description)
                current_articles.append({'title': title, 'url': url, 'description':description, 'date':''})

        return current_articles

    def preprocess_html(self, soup):
        attribs = [ 'style','font','valign'
                    ,'colspan','width','height'
                    ,'rowspan','summary','align'
                    ,'cellspacing','cellpadding'
                    ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        #index = 'http://www.cosmohispano.com/revista'
        #soup = self.index_to_soup(index)
        #link_item = soup.find('img',attrs={'class':'img_portada'})
        #if link_item:
        #    cover_url = "http://www.cosmohispano.com"+link_item['src']
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'

26
resources/recipes/jijinews.recipe
Normal file
@ -0,0 +1,26 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.jiji.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class JijiDotCom(BasicNewsRecipe):
    title = u'\u6642\u4e8b\u901a\u4fe1'
    __author__ = 'Hiroshi Miura'
    description = 'World News from Jiji Press'
    publisher = 'Jiji Press Ltd.'
    category = 'news'
    encoding = 'utf-8'
    oldest_article = 6
    max_articles_per_feed = 100
    language = 'ja'
    cover_url = 'http://www.jiji.com/img/top_header_logo2.gif'
    masthead_url = 'http://jen.jiji.com/images/logo_jijipress.gif'

    feeds = [(u'\u30cb\u30e5\u30fc\u30b9', u'http://www.jiji.com/rss/ranking.rdf')]
    remove_tags_after = dict(id="ad_google")
48
resources/recipes/la_diaria.recipe
Normal file
@ -0,0 +1,48 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
ladiaria.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class General(BasicNewsRecipe):
    title = 'La Diaria'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['article'])]
    remove_tags = [
        dict(name='div', attrs={'class':['byline', 'hr', 'titlebar', 'volver-arriba-right']}),
        dict(name='div', attrs={'id':'discussion'}),
        dict(name=['object','link'])
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Articulos', u'http://ladiaria.com/feeds/articulos')
    ]

    def get_cover_url(self):
        return 'http://ladiaria.com/edicion/imagenportada/'

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup
@ -8,7 +8,7 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class LaRazon_Bol(BasicNewsRecipe):
    title = 'La Razón - Bolivia'
    title = u'La Razón - Bolivia'
    __author__ = 'Darko Miletic'
    description = 'El diario nacional de Bolivia'
    publisher = 'Praxsis S.R.L.'

@ -20,11 +20,14 @@ class Lanacion(BasicNewsRecipe):
    publication_type = 'newspaper'
    remove_empty_feeds = True
    masthead_url = 'http://www.lanacion.com.ar/imgs/layout/logos/ln341x47.gif'
    extra_css = """ h1{font-family: Georgia,serif}
    extra_css = """ h1{font-family: Georgia,serif}
                    h2{color: #626262}
                    body{font-family: Arial,sans-serif}
                    img{margin-top: 0.5em; margin-bottom: 0.2em}
                    img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
                    .notaFecha{color: #808080}
                    .notaEpigrafe{font-size: x-small}
                    .topNota h1{font-family: Arial,sans-serif} """
                    .topNota h1{font-family: Arial,sans-serif}
                """


    conversion_options = {
@ -38,12 +41,12 @@ class Lanacion(BasicNewsRecipe):
    remove_tags = [
        dict(name='div' , attrs={'class':'notaComentario floatFix noprint' })
       ,dict(name='ul' , attrs={'class':['cajaHerramientas cajaTop noprint','herramientas noprint']})
       ,dict(name='div' , attrs={'class':'cajaHerramientas noprint' })
       ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix']})
       ,dict(name=['iframe','embed','object','form','base','hr'])
       ,dict(name='div' , attrs={'class':['cajaHerramientas noprint','cajaHerramientas floatFix'] })
       ,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix','videoCentro']})
       ,dict(name=['iframe','embed','object','form','base','hr','meta','link','input'])
    ]
    remove_tags_after = dict(attrs={'class':['tags','nota-destacado']})
    remove_attributes = ['height','width','visible']
    remove_attributes = ['height','width','visible','onclick','data-count','name']

    feeds = [
        (u'Ultimas noticias' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?origen=2' )

26
resources/recipes/mainichi.recipe
Normal file
@ -0,0 +1,26 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.mainichi.jp
'''

from calibre.web.feeds.news import BasicNewsRecipe

class MainichiDailyNews(BasicNewsRecipe):
    title = u'\u6bce\u65e5\u65b0\u805e'
    __author__ = 'Hiroshi Miura'
    oldest_article = 2
    max_articles_per_feed = 20
    description = 'Japanese traditional newspaper Mainichi Daily News'
    publisher = 'Mainichi Daily News'
    category = 'news, japan'
    language = 'ja'

    feeds = [(u'daily news', u'http://mainichi.jp/rss/etc/flash.rss')]

    remove_tags_before = {'class':"NewsTitle"}
    remove_tags = [{'class':"RelatedArticle"}]
    remove_tags_after = {'class':"Credit"}

18
resources/recipes/mainichi_it_news.recipe
Normal file
@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class MainichiDailyITNews(BasicNewsRecipe):
    title = u'\u6bce\u65e5\u65b0\u805e(IT&\u5bb6\u96fb)'
    __author__ = 'Hiroshi Miura'
    oldest_article = 2
    max_articles_per_feed = 100
    description = 'Japanese traditional newspaper Mainichi Daily News - IT and electronics'
    publisher = 'Mainichi Daily News'
    category = 'news, Japan, IT, Electronics'
    language = 'ja'

    feeds = [(u'IT News', u'http://mainichi.pheedo.jp/f/mainichijp_electronics')]

    remove_tags_before = {'class':"NewsTitle"}
    remove_tags = [{'class':"RelatedArticle"}]
    remove_tags_after = {'class':"Credit"}

22
resources/recipes/matichon.recipe
Normal file
@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1290412756(BasicNewsRecipe):
    __author__ = 'Anat R.'
    title = u'Matichon'
    oldest_article = 7
    language = 'th'
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    feeds = [(u'News', u'http://www.matichon.co.th/rss/news_article.xml'),
             (u'Columns', u'http://www.matichon.co.th/rss/news_columns.xml'),
             (u'Politics', u'http://www.matichon.co.th/rss/news_politic.xml'),
             (u'Business', u'http://www.matichon.co.th/rss/news_business.xml'),
             (u'World', u'http://www.matichon.co.th/rss/news_world.xml'),
             (u'Sports', u'http://www.matichon.co.th/rss/news_sport.xml'),
             (u'Entertainment', u'http://www.matichon.co.th/rss/news_entertainment.xml')]
    keep_only_tags = []
    keep_only_tags.append(dict(name = 'h3', attrs = {'class' : 'read-h'}))
    keep_only_tags.append(dict(name = 'p', attrs = {'class' : 'read-time'}))
    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'news-content'}))
@ -3,13 +3,28 @@ __copyright__ = '2010, Eddie Lau'
|
||||
'''
|
||||
modified from Singtao Toronto calibre recipe by rty
|
||||
Change Log:
|
||||
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
|
||||
ordering of articles
|
||||
2010/11/12: add news image and eco-news section
|
||||
2010/11/08: add parsing of finance section
|
||||
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
|
||||
in section/article list.
|
||||
2010/10/31: skip repeated articles in section pages
|
||||
'''
|
||||
|
||||
import datetime
|
||||
import os, datetime, re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from contextlib import nested
|
||||
|
||||
class AdvancedUserRecipe1278063072(BasicNewsRecipe):
|
||||
|
||||
from calibre import __appname__, strftime
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.utils.date import now as nowf
|
||||
|
||||
class MPHKRecipe(BasicNewsRecipe):
|
||||
title = 'Ming Pao - Hong Kong'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
@ -24,27 +39,131 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
|
||||
encoding = 'Big5-HKSCS'
|
||||
recursions = 0
|
||||
conversion_options = {'linearize_tables':True}
|
||||
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
|
||||
#extra_css = 'img {float:right; margin:4px;}'
|
||||
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
|
||||
keep_only_tags = [dict(name='h1'),
|
||||
#dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
|
||||
dict(attrs={'class':['photo']}),
|
||||
dict(attrs={'id':['newscontent']}),
|
||||
dict(attrs={'id':['newscontent01','newscontent02']})]
|
||||
remove_tags = [dict(name='style'),
|
||||
dict(attrs={'id':['newscontent135']})] # for the finance page
|
||||
remove_attributes = ['width']
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '<h1>'),
|
||||
(re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: '</h1>'),
|
||||
]
|
||||
|
||||
def image_url_processor(cls, baseurl, url):
|
||||
# trick: break the url at the first occurance of digit, add an additional
|
||||
# '_' at the front
|
||||
# not working, may need to move this to preprocess_html() method
|
||||
#minIdx = 10000
|
||||
#i0 = url.find('0')
|
||||
#if i0 >= 0 and i0 < minIdx:
|
||||
# minIdx = i0
|
||||
#i1 = url.find('1')
|
||||
#if i1 >= 0 and i1 < minIdx:
|
||||
# minIdx = i1
|
||||
#i2 = url.find('2')
|
||||
#if i2 >= 0 and i2 < minIdx:
|
||||
# minIdx = i2
|
||||
#i3 = url.find('3')
|
||||
#if i3 >= 0 and i0 < minIdx:
|
||||
# minIdx = i3
|
||||
#i4 = url.find('4')
|
||||
#if i4 >= 0 and i4 < minIdx:
|
||||
# minIdx = i4
|
||||
#i5 = url.find('5')
|
||||
#if i5 >= 0 and i5 < minIdx:
|
||||
# minIdx = i5
|
||||
#i6 = url.find('6')
|
||||
#if i6 >= 0 and i6 < minIdx:
|
||||
# minIdx = i6
|
||||
#i7 = url.find('7')
|
||||
#if i7 >= 0 and i7 < minIdx:
|
||||
# minIdx = i7
|
||||
#i8 = url.find('8')
|
||||
#if i8 >= 0 and i8 < minIdx:
|
||||
# minIdx = i8
|
||||
#i9 = url.find('9')
|
||||
#if i9 >= 0 and i9 < minIdx:
|
||||
# minIdx = i9
|
||||
#return url[0:minIdx] + '_' + url[minIdx+1:]
|
||||
return url
|
||||
|
||||
def get_fetchdate(self):
|
||||
dt_utc = datetime.datetime.utcnow()
|
||||
# convert UTC to local hk time - at around HKT 5.30am, all news are available
|
||||
dt_local = dt_utc - datetime.timedelta(-2.5/24)
|
||||
# convert UTC to local hk time - at around HKT 6.00am, all news are available
|
||||
dt_local = dt_utc - datetime.timedelta(-2.0/24)
|
||||
return dt_local.strftime("%Y%m%d")
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
feeds = []
|
||||
dateStr = self.get_fetchdate()
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
|
||||
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
|
||||
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
|
||||
(u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
|
||||
(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
|
||||
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
|
||||
(u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
|
||||
('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
|
||||
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
|
||||
(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
|
||||
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
|
||||
articles = self.parse_section(url)
|
||||
if articles:
|
||||
feeds.append((title, articles))
|
||||
# special - finance
|
||||
fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
|
||||
if fin_articles:
|
||||
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
|
||||
# special - eco-friendly
|
||||
# eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
|
||||
# if eco_articles:
|
||||
# feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
|
||||
# special - entertainment
|
||||
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
|
||||
#if ent_articles:
|
||||
# feeds.append(('Entertainment', ent_articles))
|
||||
return feeds
|
||||
|
||||
def parse_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
|
||||
current_articles = []
|
||||
included_urls = []
|
||||
divs.reverse()
|
||||
for i in divs:
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
current_articles.reverse()
|
||||
return current_articles
|
||||
|
||||
def parse_fin_section(self, url):
|
||||
dateStr = self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href= True)
|
||||
current_articles = []
|
||||
for i in a:
|
||||
url = i.get('href', False)
|
||||
if not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://www.mpfinance.com/cfm/' +url
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
return current_articles
|
||||
|
||||
def parse_eco_section(self, url):
|
||||
soup = self.index_to_soup(url)
|
||||
divs = soup.findAll(attrs={'class': ['bullet']})
|
||||
current_articles = []
|
||||
@ -53,9 +172,162 @@ class AdvancedUserRecipe1278063072(BasicNewsRecipe):
|
||||
a = i.find('a', href = True)
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
if url not in included_urls:
|
||||
url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' +url
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':''})
|
||||
included_urls.append(url)
|
||||
return current_articles
|
||||
|
||||
#def parse_ent_section(self, url):
|
||||
# dateStr = self.get_fetchdate()
|
||||
# soup = self.index_to_soup(url)
|
||||
# a = soup.findAll('a', href=True)
|
||||
# current_articles = []
|
||||
# included_urls = []
|
||||
# for i in a:
|
||||
# title = self.tag_to_string(i)
|
||||
# url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
|
||||
# if url not in included_urls and not url.rfind('.txt') == -1 and not url.rfind(dateStr) == -1 and not title == '':
|
||||
# current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
# return current_articles
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll(style=True):
|
||||
del item['width']
|
||||
for item in soup.findAll(stype=True):
|
||||
del item['absmiddle']
|
||||
return soup
|
||||
|
||||
def create_opf(self, feeds, dir=None):
|
||||
#super(MPHKRecipe,self).create_opf(feeds, dir)
|
||||
if dir is None:
|
||||
dir = self.output_dir
|
||||
title = self.short_title()
|
||||
if self.output_profile.periodical_date_in_title:
|
||||
title += strftime(self.timefmt)
|
||||
mi = MetaInformation(title, [__appname__])
|
||||
mi.publisher = __appname__
|
||||
mi.author_sort = __appname__
|
||||
mi.publication_type = self.publication_type+':'+self.short_title()
|
||||
mi.timestamp = nowf()
|
||||
mi.comments = self.description
|
||||
if not isinstance(mi.comments, unicode):
|
||||
mi.comments = mi.comments.decode('utf-8', 'replace')
|
||||
mi.pubdate = nowf()
|
||||
opf_path = os.path.join(dir, 'index.opf')
|
||||
ncx_path = os.path.join(dir, 'index.ncx')
|
||||
opf = OPFCreator(dir, mi)
|
||||
# Add mastheadImage entry to <guide> section
|
||||
mp = getattr(self, 'masthead_path', None)
|
||||
if mp is not None and os.access(mp, os.R_OK):
|
||||
from calibre.ebooks.metadata.opf2 import Guide
|
||||
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
|
||||
ref.type = 'masthead'
|
||||
ref.title = 'Masthead Image'
|
||||
opf.guide.append(ref)
|
||||
|
||||
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
|
||||
manifest.append(os.path.join(dir, 'index.html'))
|
||||
manifest.append(os.path.join(dir, 'index.ncx'))
|
||||
|
||||
# Get cover
|
||||
cpath = getattr(self, 'cover_path', None)
|
||||
if cpath is None:
|
||||
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
|
||||
if self.default_cover(pf):
|
||||
cpath = pf.name
|
||||
if cpath is not None and os.access(cpath, os.R_OK):
|
||||
opf.cover = cpath
|
||||
manifest.append(cpath)
|
||||
|
||||
# Get masthead
|
||||
mpath = getattr(self, 'masthead_path', None)
|
||||
if mpath is not None and os.access(mpath, os.R_OK):
|
||||
manifest.append(mpath)
|
||||
|
||||
opf.create_manifest_from_files_in(manifest)
|
||||
for mani in opf.manifest:
|
||||
if mani.path.endswith('.ncx'):
|
||||
mani.id = 'ncx'
|
||||
if mani.path.endswith('mastheadImage.jpg'):
|
||||
mani.id = 'masthead-image'
|
||||
entries = ['index.html']
|
||||
toc = TOC(base_path=dir)
|
||||
self.play_order_counter = 0
|
||||
self.play_order_map = {}
|
||||
|
||||
def feed_index(num, parent):
|
||||
f = feeds[num]
|
||||
for j, a in enumerate(f):
|
||||
if getattr(a, 'downloaded', False):
|
||||
adir = 'feed_%d/article_%d/'%(num, j)
|
||||
auth = a.author
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = a.text_summary
|
||||
if not desc:
|
||||
desc = None
|
||||
else:
|
||||
desc = self.description_limiter(desc)
|
||||
entries.append('%sindex.html'%adir)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
|
||||
play_order=po, author=auth, description=desc)
|
||||
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
|
||||
for sp in a.sub_pages:
|
||||
prefix = os.path.commonprefix([opf_path, sp])
|
||||
relp = sp[len(prefix):]
|
||||
entries.append(relp.replace(os.sep, '/'))
|
||||
last = sp
|
||||
|
||||
if os.path.exists(last):
|
||||
with open(last, 'rb') as fi:
|
||||
src = fi.read().decode('utf-8')
|
||||
soup = BeautifulSoup(src)
|
||||
body = soup.find('body')
|
||||
if body is not None:
|
||||
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
|
||||
templ = self.navbar.generate(True, num, j, len(f),
|
||||
not self.has_single_feed,
|
||||
a.orig_url, __appname__, prefix=prefix,
|
||||
center=self.center_navbar)
|
||||
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
|
||||
body.insert(len(body.contents), elem)
|
||||
with open(last, 'wb') as fi:
|
||||
fi.write(unicode(soup).encode('utf-8'))
|
||||
if len(feeds) == 0:
|
||||
raise Exception('All feeds are empty, aborting.')
|
||||
|
||||
if len(feeds) > 1:
|
||||
for i, f in enumerate(feeds):
|
||||
entries.append('feed_%d/index.html'%i)
|
||||
po = self.play_order_map.get(entries[-1], None)
|
||||
if po is None:
|
||||
self.play_order_counter += 1
|
||||
po = self.play_order_counter
|
||||
auth = getattr(f, 'author', None)
|
||||
if not auth:
|
||||
auth = None
|
||||
desc = getattr(f, 'description', None)
|
||||
if not desc:
|
||||
desc = None
|
||||
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
|
||||
f.title, play_order=po, description=desc, author=auth))
|
||||
|
||||
else:
|
||||
entries.append('feed_%d/index.html'%0)
|
||||
feed_index(0, toc)
|
||||
|
||||
for i, p in enumerate(entries):
|
||||
entries[i] = os.path.join(dir, p.replace('/', os.sep))
|
||||
opf.create_spine(entries)
|
||||
opf.set_toc(toc)
|
||||
|
||||
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
|
||||
opf.render(opf_file, ncx_file)
|
||||
|
||||
|
56
resources/recipes/montevideo_com.recipe
Normal file
@ -0,0 +1,56 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.montevideo.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Noticias(BasicNewsRecipe):
    title = 'Montevideo COMM'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 100
    keep_only_tags = [dict(id=['txt'])]
    remove_tags = [
        dict(name=['object','link'])
    ]

    remove_attributes = ['width','height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''
    feeds = [
        (u'Destacados', u'http://www.montevideo.com.uy/anxml.aspx?58'),
        (u'Noticias', u'http://www.montevideo.com.uy/anxml.aspx?59'),
        (u'Tecnologia', u'http://www.montevideo.com.uy/anxml.aspx?133'),
        (u'Tiempo Libre', u'http://www.montevideo.com.uy/anxml.aspx?60'),
        # (u'Deportes', u'http://www.montevideo.com.uy/anxml.aspx?968'),
        # (u'Pantallazo', u'http://www.montevideo.com.uy/anxml.aspx?1022'),
        (u'Gastronomia', u'http://www.montevideo.com.uy/anxml.aspx?1023')
    ]

    def get_cover_url(self):
        return 'http://sphotos.ak.fbcdn.net/hphotos-ak-snc1/hs276.snc1/10319_147339559330_147337559330_2625816_6636564_n.jpg'


    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

24
resources/recipes/msnsankei.recipe
Normal file
@ -0,0 +1,24 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
sankei.jp.msn.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class MSNSankeiNewsProduct(BasicNewsRecipe):
    title = u'MSN\u7523\u7d4c\u30cb\u30e5\u30fc\u30b9(\u65b0\u5546\u54c1)'
    __author__ = 'Hiroshi Miura'
    description = 'Products release from Japan'
    oldest_article = 7
    max_articles_per_feed = 100
    encoding = 'Shift_JIS'
    language = 'ja'

    feeds = [(u'\u65b0\u5546\u54c1', u'http://sankei.jp.msn.com/rss/news/release.xml')]

    remove_tags_before = dict(id="__r_article_title__")
    remove_tags_after = dict(id="ajax_release_news")
    remove_tags = [{'class':"parent chromeCustom6G"}]
60
resources/recipes/nikkei_free.recipe
Normal file
@ -0,0 +1,60 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class NikkeiNet(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(Free)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'

    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
            ]

    remove_tags_before = dict(id="CONTENTS")
    remove_tags = [
        dict(name="form"),
        {'class':"cmn-hide"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

125
resources/recipes/nikkei_sub.recipe
Normal file
@ -0,0 +1,125 @@
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import mechanize
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
|
||||
class NikkeiNet_subscription(BasicNewsRecipe):
|
||||
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
|
||||
__author__ = 'Hiroshi Miura'
|
||||
description = 'News and current market affairs from Japan'
|
||||
needs_subscription = True
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
language = 'ja'
|
||||
remove_javascript = False
|
||||
temp_files = []
|
||||
|
||||
remove_tags_before = {'class':"cmn-section cmn-indent"}
|
||||
remove_tags = [
|
||||
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
|
||||
{'class':"cmn-article_keyword cmn-clearfix"},
|
||||
{'class':"cmn-print_headline cmn-clearfix"},
|
||||
]
|
||||
remove_tags_after = {'class':"cmn-pr_list"}
|
||||
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
|
||||
cj = mechanize.LWPCookieJar()
|
||||
br.set_cookiejar(cj)
|
||||
|
||||
#br.set_debug_http(True)
|
||||
#br.set_debug_redirects(True)
|
||||
#br.set_debug_responses(True)
|
||||
|
||||
if self.username is not None and self.password is not None:
|
||||
#print "----------------------------get login form--------------------------------------------"
|
||||
# open login form
|
||||
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
|
||||
response = br.response()
|
||||
#print "----------------------------get login form---------------------------------------------"
|
||||
#print "----------------------------set login form---------------------------------------------"
|
||||
# remove disabled input which brings error on mechanize
|
||||
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
|
||||
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
|
||||
br.set_response(response)
|
||||
br.select_form(name='LA0010Form01')
|
||||
br['LA0010Form01:LA0010Email'] = self.username
|
||||
br['LA0010Form01:LA0010Password'] = self.password
|
||||
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
|
||||
br.submit()
|
||||
br.response()
|
||||
#print "----------------------------send login form---------------------------------------------"
|
||||
#print "----------------------------open news main page-----------------------------------------"
|
||||
# open news site
|
||||
br.open('http://www.nikkei.com/')
|
||||
br.response()
|
||||
#print "----------------------------www.nikkei.com BODY --------------------------------------"
|
||||
#print response2.get_data()
|
||||
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
|
||||
# forced redirect in default
|
||||
br.select_form(nr=0)
|
||||
br.submit()
|
||||
response3 = br.response()
|
||||
# return some cookie which should be set by Javascript
|
||||
#print response3.geturl()
|
||||
raw = response3.get_data()
|
||||
#print "---------------------------response to form --------------------------------------------"
|
||||
# grab cookie from JS and set it
|
||||
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
|
||||
br.select_form(nr=0)
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
|
||||
|
||||
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].close()
|
||||
cj.load(self.temp_files[-1].name)
|
||||
|
||||
br.submit()
|
||||
|
||||
#br.set_debug_http(False)
|
||||
#br.set_debug_redirects(False)
|
||||
#br.set_debug_responses(False)
|
||||
return br
|
||||
|
||||
|
||||
|
||||
feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
|
||||
(u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
|
||||
(u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
|
||||
(u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
|
||||
(u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
|
||||
(u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
|
||||
(u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
|
||||
(u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
|
||||
(u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
|
||||
(u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
|
||||
(u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
|
||||
(u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
|
||||
(u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
|
||||
(u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
|
||||
(u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
|
||||
(u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
|
||||
(u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
|
||||
(u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
|
||||
(u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
|
||||
(u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
|
||||
(u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
|
||||
(u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
|
||||
(u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
|
||||
(u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
|
||||
]
|
||||
|
||||
|
||||
|
109
resources/recipes/nikkei_sub_economy.recipe
Normal file
@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
'''
|
||||
www.nikkei.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import mechanize
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class NikkeiNet_sub_economy(BasicNewsRecipe):
|
||||
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7d4c\u6e08)'
|
||||
__author__ = 'Hiroshi Miura'
|
||||
description = 'News and current market affairs from Japan'
|
||||
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
needs_subscription = True
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
language = 'ja'
|
||||
remove_javascript = False
|
||||
temp_files = []
|
||||
|
||||
remove_tags_before = {'class':"cmn-section cmn-indent"}
|
||||
remove_tags = [
|
||||
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
|
||||
{'class':"cmn-article_keyword cmn-clearfix"},
|
||||
{'class':"cmn-print_headline cmn-clearfix"},
|
||||
]
|
||||
remove_tags_after = {'class':"cmn-pr_list"}
|
||||
|
||||
feeds = [ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
|
||||
(u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
|
||||
(u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
|
||||
(u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
|
||||
(u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
|
||||
(u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
|
||||
(u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
|
||||
(u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
|
||||
cj = mechanize.LWPCookieJar()
|
||||
br.set_cookiejar(cj)
|
||||
|
||||
#br.set_debug_http(True)
|
||||
#br.set_debug_redirects(True)
|
||||
#br.set_debug_responses(True)
|
||||
|
||||
if self.username is not None and self.password is not None:
|
||||
#print "----------------------------get login form--------------------------------------------"
|
||||
# open login form
|
||||
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
|
||||
response = br.response()
|
||||
#print "----------------------------get login form---------------------------------------------"
|
||||
#print "----------------------------set login form---------------------------------------------"
|
||||
# remove disabled input which brings error on mechanize
|
||||
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
|
||||
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
|
||||
br.set_response(response)
|
||||
br.select_form(name='LA0010Form01')
|
||||
br['LA0010Form01:LA0010Email'] = self.username
|
||||
br['LA0010Form01:LA0010Password'] = self.password
|
||||
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
|
||||
br.submit()
|
||||
br.response()
|
||||
#print "----------------------------send login form---------------------------------------------"
|
||||
#print "----------------------------open news main page-----------------------------------------"
|
||||
# open news site
|
||||
br.open('http://www.nikkei.com/')
|
||||
br.response()
|
||||
#print "----------------------------www.nikkei.com BODY --------------------------------------"
|
||||
#print response2.get_data()
|
||||
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
|
||||
# forced redirect in default
|
||||
br.select_form(nr=0)
|
||||
br.submit()
|
||||
response3 = br.response()
|
||||
# return some cookie which should be set by Javascript
|
||||
#print response3.geturl()
|
||||
raw = response3.get_data()
|
||||
#print "---------------------------response to form --------------------------------------------"
|
||||
# grab cookie from JS and set it
|
||||
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
|
||||
br.select_form(nr=0)
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
|
||||
|
||||
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].close()
|
||||
cj.load(self.temp_files[-1].name)
|
||||
|
||||
br.submit()
|
||||
|
||||
#br.set_debug_http(False)
|
||||
#br.set_debug_redirects(False)
|
||||
#br.set_debug_responses(False)
|
||||
return br
|
||||
|
||||
|
||||
|
||||
|
108
resources/recipes/nikkei_sub_industry.recipe
Normal file
@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
'''
|
||||
www.nikkei.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import mechanize
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
|
||||
class NikkeiNet_sub_industory(BasicNewsRecipe):
|
||||
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7523\u696d)'
|
||||
__author__ = 'Hiroshi Miura'
|
||||
description = 'News and current market affairs from Japan'
|
||||
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
needs_subscription = True
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
language = 'ja'
|
||||
remove_javascript = False
|
||||
temp_files = []
|
||||
|
||||
remove_tags_before = {'class':"cmn-section cmn-indent"}
|
||||
remove_tags = [
|
||||
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
|
||||
{'class':"cmn-article_keyword cmn-clearfix"},
|
||||
{'class':"cmn-print_headline cmn-clearfix"},
|
||||
]
|
||||
remove_tags_after = {'class':"cmn-pr_list"}
|
||||
|
||||
feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
|
||||
(u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
|
||||
(u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
|
||||
(u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
|
||||
(u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
|
||||
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
|
||||
cj = mechanize.LWPCookieJar()
|
||||
br.set_cookiejar(cj)
|
||||
|
||||
#br.set_debug_http(True)
|
||||
#br.set_debug_redirects(True)
|
||||
#br.set_debug_responses(True)
|
||||
|
||||
if self.username is not None and self.password is not None:
|
||||
#print "----------------------------get login form--------------------------------------------"
|
||||
# open login form
|
||||
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
|
||||
response = br.response()
|
||||
#print "----------------------------get login form---------------------------------------------"
|
||||
#print "----------------------------set login form---------------------------------------------"
|
||||
# remove disabled input which brings error on mechanize
|
||||
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
|
||||
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
|
||||
br.set_response(response)
|
||||
br.select_form(name='LA0010Form01')
|
||||
br['LA0010Form01:LA0010Email'] = self.username
|
||||
br['LA0010Form01:LA0010Password'] = self.password
|
||||
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
|
||||
br.submit()
|
||||
br.response()
|
||||
#print "----------------------------send login form---------------------------------------------"
|
||||
#print "----------------------------open news main page-----------------------------------------"
|
||||
# open news site
|
||||
br.open('http://www.nikkei.com/')
|
||||
br.response()
|
||||
#print "----------------------------www.nikkei.com BODY --------------------------------------"
|
||||
#print response2.get_data()
|
||||
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
|
||||
# forced redirect in default
|
||||
br.select_form(nr=0)
|
||||
br.submit()
|
||||
response3 = br.response()
|
||||
# return some cookie which should be set by Javascript
|
||||
#print response3.geturl()
|
||||
raw = response3.get_data()
|
||||
#print "---------------------------response to form --------------------------------------------"
|
||||
# grab cookie from JS and set it
|
||||
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
|
||||
br.select_form(nr=0)
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
|
||||
|
||||
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].close()
|
||||
cj.load(self.temp_files[-1].name)
|
||||
|
||||
br.submit()
|
||||
|
||||
#br.set_debug_http(False)
|
||||
#br.set_debug_redirects(False)
|
||||
#br.set_debug_responses(False)
|
||||
return br
|
||||
|
||||
|
||||
|
||||
|
109
resources/recipes/nikkei_sub_life.recipe
Normal file
@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
'''
|
||||
www.nikkei.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import mechanize
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
|
||||
class NikkeiNet_sub_life(BasicNewsRecipe):
|
||||
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u751f\u6d3b)'
|
||||
__author__ = 'Hiroshi Miura'
|
||||
description = 'News and current market affairs from Japan'
|
||||
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
needs_subscription = True
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
language = 'ja'
|
||||
remove_javascript = False
|
||||
temp_files = []
|
||||
|
||||
remove_tags_before = {'class':"cmn-section cmn-indent"}
|
||||
remove_tags = [
|
||||
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
|
||||
{'class':"cmn-article_keyword cmn-clearfix"},
|
||||
{'class':"cmn-print_headline cmn-clearfix"},
|
||||
]
|
||||
remove_tags_after = {'class':"cmn-pr_list"}
|
||||
|
||||
feeds = [ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
|
||||
(u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
|
||||
(u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
|
||||
(u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
|
||||
(u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
|
||||
(u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking')
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
|
||||
cj = mechanize.LWPCookieJar()
|
||||
br.set_cookiejar(cj)
|
||||
|
||||
#br.set_debug_http(True)
|
||||
#br.set_debug_redirects(True)
|
||||
#br.set_debug_responses(True)
|
||||
|
||||
if self.username is not None and self.password is not None:
|
||||
#print "----------------------------get login form--------------------------------------------"
|
||||
# open login form
|
||||
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
|
||||
response = br.response()
|
||||
#print "----------------------------get login form---------------------------------------------"
|
||||
#print "----------------------------set login form---------------------------------------------"
|
||||
# remove disabled input which brings error on mechanize
|
||||
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
|
||||
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
|
||||
br.set_response(response)
|
||||
br.select_form(name='LA0010Form01')
|
||||
br['LA0010Form01:LA0010Email'] = self.username
|
||||
br['LA0010Form01:LA0010Password'] = self.password
|
||||
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
|
||||
br.submit()
|
||||
br.response()
|
||||
#print "----------------------------send login form---------------------------------------------"
|
||||
#print "----------------------------open news main page-----------------------------------------"
|
||||
# open news site
|
||||
br.open('http://www.nikkei.com/')
|
||||
br.response()
|
||||
#print "----------------------------www.nikkei.com BODY --------------------------------------"
|
||||
#print response2.get_data()
|
||||
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
|
||||
# forced redirect in default
|
||||
br.select_form(nr=0)
|
||||
br.submit()
|
||||
response3 = br.response()
|
||||
# return some cookie which should be set by Javascript
|
||||
#print response3.geturl()
|
||||
raw = response3.get_data()
|
||||
#print "---------------------------response to form --------------------------------------------"
|
||||
# grab cookie from JS and set it
|
||||
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
|
||||
br.select_form(nr=0)
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
|
||||
|
||||
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].close()
|
||||
cj.load(self.temp_files[-1].name)
|
||||
|
||||
br.submit()
|
||||
|
||||
#br.set_debug_http(False)
|
||||
#br.set_debug_redirects(False)
|
||||
#br.set_debug_responses(False)
|
||||
return br
|
||||
|
||||
|
||||
|
||||
|
102
resources/recipes/nikkei_sub_main.recipe
Normal file
@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.nikkei.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
import mechanize
from calibre.ptempfile import PersistentTemporaryFile


class NikkeiNet_sub_main(BasicNewsRecipe):
    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u7dcf\u5408)'
    __author__ = 'Hiroshi Miura'
    description = 'News and current market affairs from Japan'
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    needs_subscription = True
    oldest_article = 2
    max_articles_per_feed = 20
    language = 'ja'
    remove_javascript = False
    temp_files = []

    remove_tags_before = {'class':"cmn-section cmn-indent"}
    remove_tags = [
        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
        {'class':"cmn-article_keyword cmn-clearfix"},
        {'class':"cmn-print_headline cmn-clearfix"},
    ]
    remove_tags_after = {'class':"cmn-pr_list"}

    feeds = [ (u'NIKKEI', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=main')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        cj = mechanize.LWPCookieJar()
        br.set_cookiejar(cj)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            #print "----------------------------get login form--------------------------------------------"
            # open login form
            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
            response = br.response()
            #print "----------------------------get login form---------------------------------------------"
            #print "----------------------------set login form---------------------------------------------"
            # remove disabled input which brings error on mechanize
            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
            br.set_response(response)
            br.select_form(name='LA0010Form01')
            br['LA0010Form01:LA0010Email'] = self.username
            br['LA0010Form01:LA0010Password'] = self.password
            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
            br.submit()
            br.response()
            #print "----------------------------send login form---------------------------------------------"
            #print "----------------------------open news main page-----------------------------------------"
            # open news site
            br.open('http://www.nikkei.com/')
            br.response()
            #print "----------------------------www.nikkei.com BODY --------------------------------------"
            #print response2.get_data()
            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
            # forced redirect in default
            br.select_form(nr=0)
            br.submit()
            response3 = br.response()
            # return some cookie which should be set by Javascript
            #print response3.geturl()
            raw = response3.get_data()
            #print "---------------------------response to form --------------------------------------------"
            # grab cookie from JS and set it
            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
            br.select_form(nr=0)

            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
            self.temp_files[-1].write("#LWP-Cookies-2.0\n")

            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
            self.temp_files[-1].close()
            cj.load(self.temp_files[-1].name)

            br.submit()

        #br.set_debug_http(False)
        #br.set_debug_redirects(False)
        #br.set_debug_responses(False)
        return br
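
The login flow above ends with a workaround worth noting: the redirectFlag cookie that nikkei.com normally sets from JavaScript is scraped out of the page source and injected into mechanize's cookie jar by writing an LWP-Cookies-2.0 file. A minimal standalone sketch of that pattern (not the recipe's exact code; the regex and cookie attributes are copied from the recipe above):

    import re
    from calibre.ptempfile import PersistentTemporaryFile

    def inject_js_cookie(cookie_jar, raw_html, temp_files):
        # scrape the value the site's JavaScript would have set
        value = re.search(r"var checkValue = '(\d+)';", raw_html, re.M).group(1)
        # write it in the Set-Cookie3 format that LWPCookieJar.load() understands
        temp_files.append(PersistentTemporaryFile('_cookies.txt'))
        temp_files[-1].write("#LWP-Cookies-2.0\n")
        temp_files[-1].write('Set-Cookie3: redirectFlag='+value+'; domain=".nikkei.com"; '
            'path="/"; path_spec; secure; expires="2029-12-21 05:07:59Z"; version=0\n')
        temp_files[-1].close()
        cookie_jar.load(temp_files[-1].name)
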
109
resources/recipes/nikkei_sub_sports.recipe
Normal file
@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
'''
|
||||
www.nikkei.com
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import mechanize
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
|
||||
class NikkeiNet_sub_sports(BasicNewsRecipe):
|
||||
title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248(\u30b9\u30dd\u30fc\u30c4)'
|
||||
__author__ = 'Hiroshi Miura'
|
||||
description = 'News and current market affairs from Japan'
|
||||
cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
|
||||
needs_subscription = True
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 20
|
||||
language = 'ja'
|
||||
remove_javascript = False
|
||||
temp_files = []
|
||||
|
||||
remove_tags_before = {'class':"cmn-section cmn-indent"}
|
||||
remove_tags = [
|
||||
{'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
|
||||
{'class':"cmn-article_keyword cmn-clearfix"},
|
||||
{'class':"cmn-print_headline cmn-clearfix"},
|
||||
]
|
||||
remove_tags_after = {'class':"cmn-pr_list"}
|
||||
|
||||
feeds = [
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
|
||||
(u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba')
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
|
||||
cj = mechanize.LWPCookieJar()
|
||||
br.set_cookiejar(cj)
|
||||
|
||||
#br.set_debug_http(True)
|
||||
#br.set_debug_redirects(True)
|
||||
#br.set_debug_responses(True)
|
||||
|
||||
if self.username is not None and self.password is not None:
|
||||
#print "----------------------------get login form--------------------------------------------"
|
||||
# open login form
|
||||
br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
|
||||
response = br.response()
|
||||
#print "----------------------------get login form---------------------------------------------"
|
||||
#print "----------------------------set login form---------------------------------------------"
|
||||
# remove disabled input which brings error on mechanize
|
||||
response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
|
||||
response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
|
||||
br.set_response(response)
|
||||
br.select_form(name='LA0010Form01')
|
||||
br['LA0010Form01:LA0010Email'] = self.username
|
||||
br['LA0010Form01:LA0010Password'] = self.password
|
||||
br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
|
||||
br.submit()
|
||||
br.response()
|
||||
#print "----------------------------send login form---------------------------------------------"
|
||||
#print "----------------------------open news main page-----------------------------------------"
|
||||
# open news site
|
||||
br.open('http://www.nikkei.com/')
|
||||
br.response()
|
||||
#print "----------------------------www.nikkei.com BODY --------------------------------------"
|
||||
#print response2.get_data()
|
||||
#print "-------------------------^^-got auto redirect form----^^--------------------------------"
|
||||
# forced redirect in default
|
||||
br.select_form(nr=0)
|
||||
br.submit()
|
||||
response3 = br.response()
|
||||
# return some cookie which should be set by Javascript
|
||||
#print response3.geturl()
|
||||
raw = response3.get_data()
|
||||
#print "---------------------------response to form --------------------------------------------"
|
||||
# grab cookie from JS and set it
|
||||
redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
|
||||
br.select_form(nr=0)
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write("#LWP-Cookies-2.0\n")
|
||||
|
||||
self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
|
||||
self.temp_files[-1].close()
|
||||
cj.load(self.temp_files[-1].name)
|
||||
|
||||
br.submit()
|
||||
|
||||
#br.set_debug_http(False)
|
||||
#br.set_debug_redirects(False)
|
||||
#br.set_debug_responses(False)
|
||||
return br
|
||||
|
||||
|
||||
|
||||
|
@ -13,6 +13,7 @@ class NowToronto(BasicNewsRecipe):
|
||||
title = u'Now Toronto'
|
||||
description = u'Now Toronto'
|
||||
__author__ = 'Starson17'
|
||||
language = 'en_CA'
|
||||
conversion_options = {
|
||||
'no_default_epub_cover' : True
|
||||
}
|
||||
|
@ -7,14 +7,22 @@ nytimes.com
|
||||
'''
|
||||
import re, string, time
|
||||
from calibre import entity_to_unicode, strftime
|
||||
from datetime import timedelta, date
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class NYTimes(BasicNewsRecipe):
|
||||
|
||||
# set headlinesOnly to True for the headlines-only version
|
||||
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
||||
headlinesOnly = True
|
||||
|
||||
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
|
||||
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
|
||||
# will be included. Note: oldest_article is ignored if webEdition = False
|
||||
webEdition = False
|
||||
oldest_article = 7
|
||||
|
||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||
# Otherwise, only the sections named will be included. For example,
|
||||
#
|
||||
@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
|
||||
# from an article (if one exists). If one_picture_per_article = True, the image
|
||||
# will be moved to a location between the headline and the byline.
|
||||
# If one_picture_per_article = False, all images from the article will be included
|
||||
|
||||
# and shown in their original location.
|
||||
one_picture_per_article = True
|
||||
one_picture_per_article = False
|
||||
|
||||
# The maximum number of articles that will be downloaded
|
||||
max_articles_per_feed = 100
|
||||
|
||||
# Whether to omit duplicates of articles (typically arising when articles are indexed in
|
||||
# more than one section). If True, only the first occurrence will be downloaded.
|
||||
filterDuplicates = True
|
||||
|
||||
# Sections to collect for the Web edition.
|
||||
# Delete any you don't want, or use includeSections or excludeSections
|
||||
web_sections = [(u'World',u'world'),
|
||||
(u'U.S.',u'national'),
|
||||
(u'Politics',u'politics'),
|
||||
(u'New York',u'nyregion'),
|
||||
(u'Business','business'),
|
||||
(u'Technology',u'technology'),
|
||||
(u'Sports',u'sports'),
|
||||
(u'Science',u'science'),
|
||||
(u'Health',u'health'),
|
||||
(u'Opinion',u'opinion'),
|
||||
(u'Arts',u'arts'),
|
||||
(u'Books',u'books'),
|
||||
(u'Movies',u'movies'),
|
||||
(u'Music',u'arts/music'),
|
||||
(u'Television',u'arts/television'),
|
||||
(u'Style',u'style'),
|
||||
(u'Dining & Wine',u'dining'),
|
||||
(u'Fashion & Style',u'fashion'),
|
||||
(u'Home & Garden',u'garden'),
|
||||
(u'Travel',u'travel'),
|
||||
('Education',u'education'),
|
||||
('Multimedia',u'multimedia'),
|
||||
(u'Obituaries',u'obituaries'),
|
||||
(u'Sunday Magazine',u'magazine'),
|
||||
(u'Week in Review',u'weekinreview')]
|
||||
|
||||
|
||||
if headlinesOnly:
|
||||
title='New York Times Headlines'
|
||||
description = 'Headlines from the New York Times'
|
||||
needs_subscription = False
|
||||
elif webEdition:
|
||||
title='New York Times (Web)'
|
||||
description = 'New York Times on the Web'
|
||||
needs_subscription = True
|
||||
else:
|
||||
title='New York Times'
|
||||
description = 'Today\'s New York Times'
|
||||
needs_subscription = True
|
||||
|
||||
|
||||
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
||||
|
||||
def decode_us_date(self,datestr):
|
||||
udate = datestr.strip().lower().split()
|
||||
try:
|
||||
m = self.month_list.index(udate[0])+1
|
||||
except:
|
||||
return date.today()
|
||||
d = int(udate[1])
|
||||
y = int(udate[2])
|
||||
try:
|
||||
d = date(y,m,d)
|
||||
except:
|
||||
d = date.today()
|
||||
return d
|
||||
|
||||
earliest_date = date.today() - timedelta(days=oldest_article)
|
||||
|
||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||
language = 'en'
|
||||
@ -136,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
|
||||
.image {text-align: center;}
|
||||
.source {text-align: left; }'''
|
||||
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def filter_ans(self, ans) :
|
||||
total_article_count = 0
|
||||
idx = 0
|
||||
@ -164,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
|
||||
self.log( "Queued %d articles" % total_article_count )
|
||||
return ans
|
||||
|
||||
def exclude_url(self,url):
|
||||
if not url.startswith("http"):
|
||||
return True
|
||||
if not url.endswith(".html"):
|
||||
return True
|
||||
if 'nytimes.com' not in url:
|
||||
return True
|
||||
if 'podcast' in url:
|
||||
return True
|
||||
if '/video/' in url:
|
||||
return True
|
||||
if '/slideshow/' in url:
|
||||
return True
|
||||
if '/magazine/index' in url:
|
||||
return True
|
||||
if '/interactive/' in url:
|
||||
return True
|
||||
if '/reference/' in url:
|
||||
return True
|
||||
if '/premium/' in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
@ -249,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
return BeautifulSoup(_raw, markupMassage=massage)
|
||||
|
||||
# Entry point
|
||||
print "index_to_soup()"
|
||||
soup = get_the_soup( self.encoding, url_or_raw )
|
||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
||||
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
||||
@ -273,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
|
||||
else:
|
||||
return description
|
||||
|
||||
def parse_todays_index(self):
|
||||
def feed_title(self,div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
def feed_title(div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def handle_article(div):
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
def handle_article(self,div):
|
||||
thumbnail = div.find('div','thumbnail')
|
||||
if thumbnail:
|
||||
thumbnail.extract()
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if self.exclude_url(url):
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if not url.startswith("http"):
|
||||
return
|
||||
if not url.endswith(".html"):
|
||||
return
|
||||
if 'podcast' in url:
|
||||
return
|
||||
if '/video/' in url:
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if url in url_list:
|
||||
return
|
||||
url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
feed = key if key is not None else 'Uncategorized'
|
||||
if not articles.has_key(feed):
|
||||
ans.append(feed)
|
||||
articles[feed] = []
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
feed = self.key if self.key is not None else 'Uncategorized'
|
||||
if not self.articles.has_key(feed):
|
||||
self.ans.append(feed)
|
||||
self.articles[feed] = []
|
||||
self.articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
|
||||
|
||||
def parse_web_edition(self):
|
||||
|
||||
for (sec_title,index_url) in self.web_sections:
|
||||
if self.includeSections != []:
|
||||
if sec_title not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",sec_title
|
||||
continue
|
||||
if sec_title in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",sec_title
|
||||
continue
|
||||
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
|
||||
self.key = sec_title
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
if div['class'] in ['story', 'story headline'] :
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
self.handle_article(lidiv)
|
||||
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
|
||||
def parse_todays_index(self):
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
||||
|
||||
|
||||
skipping = False
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
|
||||
if div['class'] in ['section-headline','sectionHeader']:
|
||||
key = string.capwords(feed_title(div))
|
||||
key = key.replace('Op-ed','Op-Ed')
|
||||
key = key.replace('U.s.','U.S.')
|
||||
self.key = string.capwords(self.feed_title(div))
|
||||
self.key = self.key.replace('Op-ed','Op-Ed')
|
||||
self.key = self.key.replace('U.s.','U.S.')
|
||||
self.key = self.key.replace('N.y.','N.Y.')
|
||||
skipping = False
|
||||
if self.includeSections != []:
|
||||
if self.key not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",self.key
|
||||
skipping = True
|
||||
if self.key in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",self.key
|
||||
skipping = True
|
||||
|
||||
elif div['class'] in ['story', 'story headline'] :
|
||||
handle_article(div)
|
||||
if not skipping:
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
handle_article(lidiv)
|
||||
if not skipping:
|
||||
self.handle_article(lidiv)
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return self.filter_ans(ans)
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
def parse_headline_index(self):
|
||||
|
||||
articles = {}
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||
|
||||
# Fetch the content table
|
||||
@ -363,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
|
||||
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
|
||||
for div_sec in td_col.findAll('div',recursive=False):
|
||||
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
|
||||
|
||||
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
|
||||
section_name = re.sub(r'^ *$','',section_name)
|
||||
|
||||
if section_name == '':
|
||||
continue
|
||||
if self.includeSections != []:
|
||||
if section_name not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",section_name
|
||||
continue
|
||||
if section_name in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",section_name
|
||||
continue
|
||||
|
||||
section_name=string.capwords(section_name)
|
||||
if section_name == 'U.s.':
|
||||
section_name = 'U.S.'
|
||||
elif section_name == 'Op-ed':
|
||||
section_name = 'Op-Ed'
|
||||
section_name = section_name.replace('Op-ed','Op-Ed')
|
||||
section_name = section_name.replace('U.s.','U.S.')
|
||||
section_name = section_name.replace('N.y.','N.Y.')
|
||||
pubdate = strftime('%a, %d %b')
|
||||
|
||||
search_div = div_sec
|
||||
@ -392,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
|
||||
if not a:
|
||||
continue
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if not url.startswith("http"):
|
||||
continue
|
||||
if not url.endswith(".html"):
|
||||
continue
|
||||
if 'podcast' in url:
|
||||
continue
|
||||
if 'video' in url:
|
||||
if self.exclude_url(url):
|
||||
continue
|
||||
url += '?pagewanted=all'
|
||||
if url in url_list:
|
||||
continue
|
||||
url_list.append(url)
|
||||
self.log("URL %s" % url)
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
continue
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
desc = h3_item.find('p')
|
||||
if desc is not None:
|
||||
description = self.tag_to_string(desc,use_alt=False)
|
||||
else:
|
||||
description = ''
|
||||
if not articles.has_key(section_name):
|
||||
ans.append(section_name)
|
||||
articles[section_name] = []
|
||||
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
||||
if not self.articles.has_key(section_name):
|
||||
self.ans.append(section_name)
|
||||
self.articles[section_name] = []
|
||||
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
||||
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return self.filter_ans(ans)
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
def parse_index(self):
|
||||
if self.headlinesOnly:
|
||||
return self.parse_headline_index()
|
||||
elif self.webEdition:
|
||||
return self.parse_web_edition()
|
||||
else:
|
||||
return self.parse_todays_index()
|
||||
|
||||
@ -438,6 +561,21 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
if self.webEdition & (self.oldest_article>0):
|
||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
||||
if date_tag:
|
||||
date_str = self.tag_to_string(date_tag,use_alt=False)
|
||||
date_str = date_str.replace('Published:','')
|
||||
date_items = date_str.split(',')
|
||||
try:
|
||||
datestring = date_items[0]+' '+date_items[1]
|
||||
article_date = self.decode_us_date(datestring)
|
||||
except:
|
||||
article_date = date.today()
|
||||
if article_date < self.earliest_date:
|
||||
self.log("Skipping article dated %s" % date_str)
|
||||
return None
|
||||
|
||||
kicker_tag = soup.find(attrs={'class':'kicker'})
|
||||
if kicker_tag: # remove Op_Ed author head shots
|
||||
tagline = self.tag_to_string(kicker_tag)
|
||||
@ -462,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
for inlineImg in inlineImgs[1:]:
|
||||
inlineImg.extract()
|
||||
# Move firstImg before article body
|
||||
#article_body = soup.find(True, {'id':'articleBody'})
|
||||
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
||||
if cgFirst:
|
||||
# Strip all sibling NavigableStrings: noise
|
||||
@ -548,4 +685,3 @@ class NYTimes(BasicNewsRecipe):
|
||||
divTag.replaceWith(tag)
|
||||
|
||||
return soup
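
The includeSections/excludeSections handling that recurs in parse_web_edition, parse_todays_index and parse_headline_index above reduces to a single test per section. A minimal standalone sketch (not the recipe's code; the argument names simply reuse the recipe's attribute names):

    def section_wanted(name, includeSections, excludeSections):
        # an empty include list means "keep every section found"
        if includeSections and name not in includeSections:
            return False
        if name in excludeSections:
            return False
        return True
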
|
||||
|
||||
|
@ -7,14 +7,22 @@ nytimes.com
|
||||
'''
|
||||
import re, string, time
|
||||
from calibre import entity_to_unicode, strftime
|
||||
from datetime import timedelta, date
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
|
||||
|
||||
|
||||
class NYTimes(BasicNewsRecipe):
|
||||
|
||||
# set headlinesOnly to True for the headlines-only version
|
||||
# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
|
||||
headlinesOnly = False
|
||||
|
||||
# set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
|
||||
# number of days old an article can be for inclusion. If oldest_article = 0 all articles
|
||||
# will be included. Note: oldest_article is ignored if webEdition = False
|
||||
webEdition = False
|
||||
oldest_article = 7
|
||||
|
||||
# includeSections: List of sections to include. If empty, all sections found will be included.
|
||||
# Otherwise, only the sections named will be included. For example,
|
||||
#
|
||||
@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
|
||||
# from an article (if one exists). If one_picture_per_article = True, the image
|
||||
# will be moved to a location between the headline and the byline.
|
||||
# If one_picture_per_article = False, all images from the article will be included
|
||||
|
||||
# and shown in their original location.
|
||||
one_picture_per_article = True
|
||||
one_picture_per_article = False
|
||||
|
||||
# The maximum number of articles that will be downloaded
|
||||
max_articles_per_feed = 100
|
||||
|
||||
# Whether to omit duplicates of articles (typically arising when articles are indexed in
|
||||
# more than one section). If True, only the first occurrence will be downloaded.
|
||||
filterDuplicates = True
|
||||
|
||||
# Sections to collect for the Web edition.
|
||||
# Delete any you don't want, or use includeSections or excludeSections
|
||||
web_sections = [(u'World',u'world'),
|
||||
(u'U.S.',u'national'),
|
||||
(u'Politics',u'politics'),
|
||||
(u'New York',u'nyregion'),
|
||||
(u'Business','business'),
|
||||
(u'Technology',u'technology'),
|
||||
(u'Sports',u'sports'),
|
||||
(u'Science',u'science'),
|
||||
(u'Health',u'health'),
|
||||
(u'Opinion',u'opinion'),
|
||||
(u'Arts',u'arts'),
|
||||
(u'Books',u'books'),
|
||||
(u'Movies',u'movies'),
|
||||
(u'Music',u'arts/music'),
|
||||
(u'Television',u'arts/television'),
|
||||
(u'Style',u'style'),
|
||||
(u'Dining & Wine',u'dining'),
|
||||
(u'Fashion & Style',u'fashion'),
|
||||
(u'Home & Garden',u'garden'),
|
||||
(u'Travel',u'travel'),
|
||||
('Education',u'education'),
|
||||
('Multimedia',u'multimedia'),
|
||||
(u'Obituaries',u'obituaries'),
|
||||
(u'Sunday Magazine',u'magazine'),
|
||||
(u'Week in Review',u'weekinreview')]
|
||||
|
||||
|
||||
if headlinesOnly:
|
||||
title='New York Times Headlines'
|
||||
description = 'Headlines from the New York Times'
|
||||
needs_subscription = False
|
||||
elif webEdition:
|
||||
title='New York Times (Web)'
|
||||
description = 'New York Times on the Web'
|
||||
needs_subscription = True
|
||||
else:
|
||||
title='New York Times'
|
||||
description = 'Today\'s New York Times'
|
||||
needs_subscription = True
|
||||
|
||||
|
||||
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
|
||||
|
||||
def decode_us_date(self,datestr):
|
||||
udate = datestr.strip().lower().split()
|
||||
try:
|
||||
m = self.month_list.index(udate[0])+1
|
||||
except:
|
||||
return date.today()
|
||||
d = int(udate[1])
|
||||
y = int(udate[2])
|
||||
try:
|
||||
d = date(y,m,d)
|
||||
except:
|
||||
d = date.today()
|
||||
return d
|
||||
|
||||
earliest_date = date.today() - timedelta(days=oldest_article)
|
||||
|
||||
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
|
||||
language = 'en'
|
||||
@ -60,7 +124,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
|
||||
timefmt = ''
|
||||
needs_subscription = True
|
||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
|
||||
cover_margins = (18,18,'grey99')
|
||||
|
||||
@ -137,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
|
||||
.image {text-align: center;}
|
||||
.source {text-align: left; }'''
|
||||
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def filter_ans(self, ans) :
|
||||
total_article_count = 0
|
||||
idx = 0
|
||||
@ -165,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
|
||||
self.log( "Queued %d articles" % total_article_count )
|
||||
return ans
|
||||
|
||||
def exclude_url(self,url):
|
||||
if not url.startswith("http"):
|
||||
return True
|
||||
if not url.endswith(".html"):
|
||||
return True
|
||||
if 'nytimes.com' not in url:
|
||||
return True
|
||||
if 'podcast' in url:
|
||||
return True
|
||||
if '/video/' in url:
|
||||
return True
|
||||
if '/slideshow/' in url:
|
||||
return True
|
||||
if '/magazine/index' in url:
|
||||
return True
|
||||
if '/interactive/' in url:
|
||||
return True
|
||||
if '/reference/' in url:
|
||||
return True
|
||||
if '/premium/' in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
def fixChars(self,string):
|
||||
# Replace lsquo (\x91)
|
||||
fixed = re.sub("\x91","‘",string)
|
||||
@ -250,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
return BeautifulSoup(_raw, markupMassage=massage)
|
||||
|
||||
# Entry point
|
||||
print "index_to_soup()"
|
||||
soup = get_the_soup( self.encoding, url_or_raw )
|
||||
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
|
||||
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
|
||||
@ -274,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
|
||||
else:
|
||||
return description
|
||||
|
||||
def parse_todays_index(self):
|
||||
def feed_title(self,div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
def feed_title(div):
|
||||
return ''.join(div.findAll(text=True, recursive=True)).strip()
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
def handle_article(div):
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
def handle_article(self,div):
|
||||
thumbnail = div.find('div','thumbnail')
|
||||
if thumbnail:
|
||||
thumbnail.extract()
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if self.exclude_url(url):
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
return
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if not url.startswith("http"):
|
||||
return
|
||||
if not url.endswith(".html"):
|
||||
return
|
||||
if 'podcast' in url:
|
||||
return
|
||||
if '/video/' in url:
|
||||
return
|
||||
url += '?pagewanted=all'
|
||||
if url in url_list:
|
||||
return
|
||||
url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = strftime('%a, %d %b')
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
author = ''
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
else:
|
||||
authorAttribution = div.find(True, attrs={'class':'byline'})
|
||||
if authorAttribution:
|
||||
author = self.tag_to_string(authorAttribution, use_alt=False)
|
||||
feed = key if key is not None else 'Uncategorized'
|
||||
if not articles.has_key(feed):
|
||||
ans.append(feed)
|
||||
articles[feed] = []
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
feed = self.key if self.key is not None else 'Uncategorized'
|
||||
if not self.articles.has_key(feed):
|
||||
self.ans.append(feed)
|
||||
self.articles[feed] = []
|
||||
self.articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate,
|
||||
description=description, author=author,
|
||||
content=''))
|
||||
|
||||
|
||||
def parse_web_edition(self):
|
||||
|
||||
for (sec_title,index_url) in self.web_sections:
|
||||
if self.includeSections != []:
|
||||
if sec_title not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",sec_title
|
||||
continue
|
||||
if sec_title in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",sec_title
|
||||
continue
|
||||
print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
|
||||
self.key = sec_title
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
if div['class'] in ['story', 'story headline'] :
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
self.handle_article(lidiv)
|
||||
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
|
||||
def parse_todays_index(self):
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
|
||||
|
||||
|
||||
skipping = False
|
||||
# Find each article
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
|
||||
|
||||
if div['class'] in ['section-headline','sectionHeader']:
|
||||
key = string.capwords(feed_title(div))
|
||||
key = key.replace('Op-ed','Op-Ed')
|
||||
key = key.replace('U.s.','U.S.')
|
||||
self.key = string.capwords(self.feed_title(div))
|
||||
self.key = self.key.replace('Op-ed','Op-Ed')
|
||||
self.key = self.key.replace('U.s.','U.S.')
|
||||
self.key = self.key.replace('N.y.','N.Y.')
|
||||
skipping = False
|
||||
if self.includeSections != []:
|
||||
if self.key not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",self.key
|
||||
skipping = True
|
||||
if self.key in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",self.key
|
||||
skipping = True
|
||||
|
||||
elif div['class'] in ['story', 'story headline'] :
|
||||
handle_article(div)
|
||||
if not skipping:
|
||||
self.handle_article(div)
|
||||
elif div['class'] == 'headlinesOnly multiline flush':
|
||||
for lidiv in div.findAll('li'):
|
||||
handle_article(lidiv)
|
||||
if not skipping:
|
||||
self.handle_article(lidiv)
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return self.filter_ans(ans)
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
def parse_headline_index(self):
|
||||
|
||||
articles = {}
|
||||
ans = []
|
||||
url_list = []
|
||||
|
||||
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
|
||||
|
||||
# Fetch the content table
|
||||
@ -364,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
|
||||
for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
|
||||
for div_sec in td_col.findAll('div',recursive=False):
|
||||
for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
|
||||
|
||||
section_name = self.tag_to_string(h6_sec_name,use_alt=False)
|
||||
section_name = re.sub(r'^ *$','',section_name)
|
||||
|
||||
if section_name == '':
|
||||
continue
|
||||
if self.includeSections != []:
|
||||
if section_name not in self.includeSections:
|
||||
print "SECTION NOT INCLUDED: ",section_name
|
||||
continue
|
||||
if section_name in self.excludeSections:
|
||||
print "SECTION EXCLUDED: ",section_name
|
||||
continue
|
||||
|
||||
section_name=string.capwords(section_name)
|
||||
if section_name == 'U.s.':
|
||||
section_name = 'U.S.'
|
||||
elif section_name == 'Op-ed':
|
||||
section_name = 'Op-Ed'
|
||||
section_name = section_name.replace('Op-ed','Op-Ed')
|
||||
section_name = section_name.replace('U.s.','U.S.')
|
||||
section_name = section_name.replace('N.y.','N.Y.')
|
||||
pubdate = strftime('%a, %d %b')
|
||||
|
||||
search_div = div_sec
|
||||
@ -393,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
|
||||
if not a:
|
||||
continue
|
||||
url = re.sub(r'\?.*', '', a['href'])
|
||||
if not url.startswith("http"):
|
||||
continue
|
||||
if not url.endswith(".html"):
|
||||
continue
|
||||
if 'podcast' in url:
|
||||
continue
|
||||
if 'video' in url:
|
||||
if self.exclude_url(url):
|
||||
continue
|
||||
url += '?pagewanted=all'
|
||||
if url in url_list:
|
||||
continue
|
||||
url_list.append(url)
|
||||
self.log("URL %s" % url)
|
||||
if self.filterDuplicates:
|
||||
if url in self.url_list:
|
||||
continue
|
||||
self.url_list.append(url)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
desc = h3_item.find('p')
|
||||
if desc is not None:
|
||||
description = self.tag_to_string(desc,use_alt=False)
|
||||
else:
|
||||
description = ''
|
||||
if not articles.has_key(section_name):
|
||||
ans.append(section_name)
|
||||
articles[section_name] = []
|
||||
articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
||||
if not self.articles.has_key(section_name):
|
||||
self.ans.append(section_name)
|
||||
self.articles[section_name] = []
|
||||
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
|
||||
|
||||
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return self.filter_ans(ans)
|
||||
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
|
||||
return self.filter_ans(self.ans)
|
||||
|
||||
def parse_index(self):
|
||||
if self.headlinesOnly:
|
||||
return self.parse_headline_index()
|
||||
elif self.webEdition:
|
||||
return self.parse_web_edition()
|
||||
else:
|
||||
return self.parse_todays_index()
|
||||
|
||||
@ -439,6 +561,21 @@ class NYTimes(BasicNewsRecipe):
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
if self.webEdition & (self.oldest_article>0):
|
||||
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
|
||||
if date_tag:
|
||||
date_str = self.tag_to_string(date_tag,use_alt=False)
|
||||
date_str = date_str.replace('Published:','')
|
||||
date_items = date_str.split(',')
|
||||
try:
|
||||
datestring = date_items[0]+' '+date_items[1]
|
||||
article_date = self.decode_us_date(datestring)
|
||||
except:
|
||||
article_date = date.today()
|
||||
if article_date < self.earliest_date:
|
||||
self.log("Skipping article dated %s" % date_str)
|
||||
return None
|
||||
|
||||
kicker_tag = soup.find(attrs={'class':'kicker'})
|
||||
if kicker_tag: # remove Op_Ed author head shots
|
||||
tagline = self.tag_to_string(kicker_tag)
|
||||
@ -463,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
|
||||
for inlineImg in inlineImgs[1:]:
|
||||
inlineImg.extract()
|
||||
# Move firstImg before article body
|
||||
#article_body = soup.find(True, {'id':'articleBody'})
|
||||
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
|
||||
if cgFirst:
|
||||
# Strip all sibling NavigableStrings: noise
|
||||
|
63
resources/recipes/observa_digital.recipe
Normal file
@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
|
||||
'''
|
||||
observa.com.uy
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Noticias(BasicNewsRecipe):
|
||||
title = 'Observa Digital'
|
||||
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
|
||||
description = 'Noticias desde Uruguay'
|
||||
language = 'es'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
use_embedded_content = False
|
||||
recursion = 5
|
||||
encoding = 'utf8'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
keep_only_tags = [dict(id=['contenido'])]
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':'contenedorVinculadas'}),
|
||||
dict(name='p', attrs={'id':'nota_firma'}),
|
||||
dict(name=['object','link'])
|
||||
]
|
||||
|
||||
remove_attributes = ['width','height', 'style', 'font', 'color']
|
||||
|
||||
extra_css = '''
|
||||
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
|
||||
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
|
||||
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
|
||||
p {font-family:Arial,Helvetica,sans-serif;}
|
||||
'''
|
||||
feeds = [
|
||||
(u'Actualidad', u'http://www.observa.com.uy/RSS/actualidad.xml'),
|
||||
(u'Deportes', u'http://www.observa.com.uy/RSS/deportes.xml'),
|
||||
(u'Vida', u'http://www.observa.com.uy/RSS/vida.xml'),
|
||||
(u'Ciencia y Tecnologia', u'http://www.observa.com.uy/RSS/ciencia.xml')
|
||||
]
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
index = 'http://www.elobservador.com.uy/elobservador/nav_portada.asp?suplemento=dia'
|
||||
soup = self.index_to_soup(index)
|
||||
link_item = soup.find('img',attrs={'usemap':'#mapeo_imagenes'})
|
||||
if link_item:
|
||||
cover_url = 'http://www.elobservador.com.uy'+link_item['src'].strip()
|
||||
|
||||
print cover_url
|
||||
|
||||
return cover_url
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
37
resources/recipes/reuters_ja.recipe
Normal file
@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class ReutersJa(BasicNewsRecipe):

    title = 'Reuters(Japan)'
    description = 'Global news in Japanese'
    __author__ = 'Hiroshi Miura'
    use_embedded_content = False
    language = 'ja'
    max_articles_per_feed = 10
    remove_javascript = True

    feeds = [ ('Top Stories', 'http://feeds.reuters.com/reuters/JPTopNews?format=xml'),
              ('World News', 'http://feeds.reuters.com/reuters/JPWorldNews?format=xml'),
              ('Business News', 'http://feeds.reuters.com/reuters/JPBusinessNews?format=xml'),
              ('Technology News', 'http://feeds.reuters.com/reuters/JPTechnologyNews?format=xml'),
              ('Oddly Enough News', 'http://feeds.reuters.com/reuters/JPOddlyEnoughNews?format=xml')
            ]

    remove_tags_before = {'class':"article primaryContent"}
    remove_tags = [ dict(id="banner"),
                    dict(id="autilities"),
                    dict(id="textSizer"),
                    dict(id="shareFooter"),
                    dict(id="relatedNews"),
                    dict(id="editorsChoice"),
                    dict(id="ecArticles"),
                    {'class':"secondaryContent"},
                    {'class':"module"},
                  ]
    remove_tags_after = {'class':"assetBuddy"}

    def print_version(self, url):
        m = re.search('(.*idJPJAPAN-[0-9]+)', url)
        return m.group(0)+'?sp=true'
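
For reference, print_version above keeps everything up to and including the article id and appends the print query string; with a hypothetical article URL:

    # 'http://jp.reuters.com/article/topNews/idJPJAPAN-12345620101126?feedType=RSS'
    #   becomes
    # 'http://jp.reuters.com/article/topNews/idJPJAPAN-12345620101126?sp=true'
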
54
resources/recipes/revista_bla.recipe
Normal file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
|
||||
'''
|
||||
http://www.revistabla.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Noticias(BasicNewsRecipe):
|
||||
title = 'Revista Bla'
|
||||
__author__ = 'Gustavo Azambuja'
|
||||
description = 'Moda | Uruguay'
|
||||
language = 'es'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
use_embedded_content = False
|
||||
recursion = 5
|
||||
encoding = 'utf8'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
oldest_article = 20
|
||||
max_articles_per_feed = 100
|
||||
keep_only_tags = [dict(id=['body_container'])]
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
|
||||
dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
|
||||
dict(name='p', attrs={'class':'FacebookLikeButton'}),
|
||||
dict(name=['object','link']) ]
|
||||
|
||||
extra_css = '''
|
||||
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
|
||||
h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
|
||||
h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
|
||||
p {font-family:Arial,Helvetica,sans-serif;}
|
||||
'''
|
||||
feeds = [
|
||||
(u'Articulos', u'http://www.revistabla.com/feed/')
|
||||
]
|
||||
|
||||
def get_cover_url(self):
|
||||
cover_url = None
|
||||
index = 'http://www.revistabla.com'
|
||||
soup = self.index_to_soup(index)
|
||||
link_item = soup.find('div',attrs={'class':'header_right'})
|
||||
if link_item:
|
||||
cover_url = link_item.img['src']
|
||||
return cover_url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
@ -108,3 +108,10 @@ class RevistaMuyInteresante(BasicNewsRecipe):
|
||||
feeds.append((title, articles))
|
||||
return feeds
|
||||
|
||||
def get_cover_url(self):
|
||||
index = 'http://www.muyinteresante.es/revista'
|
||||
soup = self.index_to_soup(index)
|
||||
link_item = soup.find('img',attrs={'class':'img_portada'})
|
||||
if link_item:
|
||||
cover_url = "http://www.muyinteresante.es"+link_item['src']
|
||||
return cover_url
|
||||
|
@ -6,6 +6,7 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||
spiegel.de
|
||||
'''
|
||||
|
||||
from time import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Spiegel_ger(BasicNewsRecipe):
|
||||
@ -44,3 +45,6 @@ class Spiegel_ger(BasicNewsRecipe):
|
||||
rmain, rsep, rrest = main.rpartition(',')
|
||||
purl = rmain + ',druck-' + rrest + ',' + rest
|
||||
return purl
|
||||
|
||||
def get_cover_url(self):
|
||||
return 'http://wissen.spiegel.de/wissen/titel/SP/' + strftime("%Y/%W/%j/titel.jpg")
|
||||
|
@ -3,12 +3,12 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
|
||||
|
||||
''' http://www.derstandard.at - Austrian Newspaper '''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TelepolisNews(BasicNewsRecipe):
|
||||
title = u'Telepolis (News)'
|
||||
title = u'Telepolis (News+Artikel)'
|
||||
__author__ = 'Gerhard Aigner'
|
||||
publisher = 'Heise Zeitschriften Verlag GmbH & Co KG'
|
||||
description = 'News from telepolis'
|
||||
@ -20,16 +20,16 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
encoding = "utf-8"
|
||||
language = 'de_AT'
|
||||
|
||||
use_embedded_content = False
|
||||
use_embedded_content =False
|
||||
remove_empty_feeds = True
|
||||
|
||||
preprocess_regexps = [(re.compile(r'<a[^>]*>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),]
|
||||
|
||||
keep_only_tags = [dict(name = 'table',attrs={'class':'blogtable'})]
|
||||
remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'})]
|
||||
keep_only_tags = [dict(name = 'td',attrs={'class':'bloghead'}),dict(name = 'td',attrs={'class':'blogfliess'})]
|
||||
remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'}), dict(name='td',attrs={'class':'forum'})]
|
||||
|
||||
feeds = [(u'News', u'http://www.heise.de/tp/news.rdf')]
|
||||
feeds = [(u'News', u'http://www.heise.de/tp/news-atom.xml')]
|
||||
|
||||
html2lrf_options = [
|
||||
'--comment' , description
|
||||
@ -41,7 +41,7 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
|
||||
def get_article_url(self, article):
|
||||
'''if the linked article is of kind artikel don't take it'''
|
||||
if (article.link.count('artikel') > 0) :
|
||||
if (article.link.count('artikel') > 1) :
|
||||
return None
|
||||
return article.link
|
||||
|
||||
@ -49,3 +49,5 @@ class TelepolisNews(BasicNewsRecipe):
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
|
||||
soup.head.insert(0,mtag)
|
||||
return soup
|
||||
|
||||
|
||||
|
33
resources/recipes/the_h.recipe
Normal file
@ -0,0 +1,33 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
www.h-online.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheHeiseOnline(BasicNewsRecipe):
    title = u'The H'
    __author__ = 'Hiroshi Miura'
    oldest_article = 3
    description = 'In association with Heise Online'
    publisher = 'Heise Media UK Ltd.'
    category = 'news, technology, security'
    max_articles_per_feed = 100
    language = 'en'
    encoding = 'utf-8'
    conversion_options = {
        'comment' : description
        ,'tags' : category
        ,'publisher': publisher
        ,'language' : language
    }
    feeds = [
        (u'The H News Feed', u'http://www.h-online.com/news/atom.xml')
    ]

    def print_version(self, url):
        return url + '?view=print'
59
resources/recipes/the_workingham_times.recipe
Normal file
@ -0,0 +1,59 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.getwokingham.co.uk
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class TheWorkinghamTimes(BasicNewsRecipe):
|
||||
title = 'The Workingham Times'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'News from UK'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'The Wokingham Times - S&B media'
|
||||
category = 'news, UK, world'
|
||||
language = 'en_GB'
|
||||
publication_type = 'newsportal'
|
||||
extra_css = """
|
||||
body{ font-family: Arial,sans-serif }
|
||||
img{display: block; margin-bottom: 0.4em}
|
||||
"""
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'article-body'})]
|
||||
remove_tags = [
|
||||
dict(name='div' , attrs={'class':['ad']})
|
||||
,dict(name=['meta','base','iframe','embed','object'])
|
||||
,dict(name='span' , attrs={'class':'caption small'})
|
||||
]
|
||||
remove_attributes = ['width','height','lang']
|
||||
|
||||
feeds = [
|
||||
('Home' , 'http://www.getwokingham.co.uk/rss.xml' )
|
||||
,('News' , 'http://www.getwokingham.co.uk/news/rss.xml' )
|
||||
,('Entertainment', 'http://www.getwokingham.co.uk/entertainment/rss.xml')
|
||||
,('Lifestyle' , 'http://www.getwokingham.co.uk/lifestyle/rss.xml' )
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
for item in soup.findAll('a'):
|
||||
if item.string is not None:
|
||||
str = item.string
|
||||
item.replaceWith(str)
|
||||
else:
|
||||
item.name = 'span'
|
||||
del item['href']
|
||||
return soup
|
34
resources/recipes/tsn.recipe
Normal file
@ -0,0 +1,34 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1289990851(BasicNewsRecipe):
|
||||
title = u'TSN'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 50
|
||||
language = 'en_CA'
|
||||
__author__ = 'Nexus'
|
||||
no_stylesheets = True
|
||||
INDEX = 'http://tsn.ca/nhl/story/?id=nhl'
|
||||
keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}),
|
||||
dict(name='div', attrs={'id':['tsnStory']})]
|
||||
remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}),
|
||||
dict(name='div', attrs={'class':'textSize'})]
|
||||
|
||||
def parse_index(self):
|
||||
feeds = []
|
||||
soup = self.index_to_soup(self.INDEX)
|
||||
feed_parts = soup.findAll('div', attrs={'class': 'feature'})
|
||||
for feed_part in feed_parts:
|
||||
articles = []
|
||||
if not feed_part.h2:
|
||||
continue
|
||||
feed_title = feed_part.h2.string
|
||||
article_parts = feed_part.findAll('a')
|
||||
for article_part in article_parts:
|
||||
article_title = article_part.string
|
||||
article_date = ''
|
||||
article_url = 'http://tsn.ca/' + article_part['href']
|
||||
articles.append({'title': article_title, 'url': article_url, 'description':'', 'date':article_date})
|
||||
if articles:
|
||||
feeds.append((feed_title, articles))
|
||||
return feeds
|
||||
|
@ -12,6 +12,7 @@ class ZeitDe(BasicNewsRecipe):
|
||||
title = 'Zeit Online'
|
||||
description = 'Zeit Online'
|
||||
language = 'de'
|
||||
encoding = 'UTF-8'
|
||||
|
||||
__author__ = 'Martin Pitt, Sujata Raman, Ingo Paschke and Marc Toensing'
|
||||
|
||||
@ -58,7 +59,7 @@ class ZeitDe(BasicNewsRecipe):
|
||||
def preprocess_html(self, soup):
|
||||
for tag in soup.findAll(name=['ul','li']):
|
||||
tag.name = 'div'
|
||||
|
||||
|
||||
soup.html['xml:lang'] = self.lang
|
||||
soup.html['lang'] = self.lang
|
||||
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
|
||||
|
63
resources/recipes/zeitde_sub.recipe
Normal file
@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 mode: python -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
|
||||
__docformat__ = 'restructuredtext de'
|
||||
__version__ = '1.1'
|
||||
|
||||
"""
|
||||
Die Zeit EPUB
|
||||
"""
|
||||
|
||||
import os, urllib2, zipfile, re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class ZeitEPUBAbo(BasicNewsRecipe):
|
||||
|
||||
title = u'Zeit Online Premium'
|
||||
description = u'Das EPUB Abo der Zeit (needs subscription)'
|
||||
language = 'de'
|
||||
lang = 'de-DE'
|
||||
|
||||
__author__ = 'Steffen Siebert'
|
||||
needs_subscription = True
|
||||
|
||||
conversion_options = {
|
||||
'no_default_epub_cover' : True
|
||||
}
|
||||
|
||||
def build_index(self):
|
||||
domain = "http://premium.zeit.de"
|
||||
url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
|
||||
|
||||
browser = self.get_browser()
|
||||
browser.add_password("http://premium.zeit.de", self.username, self.password)
|
||||
|
||||
try:
|
||||
browser.open(url)
|
||||
except urllib2.HTTPError:
|
||||
self.report_progress(0,_("Can't login to download issue"))
|
||||
raise ValueError('Failed to login, check your username and password')
|
||||
|
||||
response = browser.follow_link(text="DIE ZEIT als E-Paper")
|
||||
response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))
|
||||
|
||||
tmp = PersistentTemporaryFile(suffix='.epub')
|
||||
self.report_progress(0,_('downloading epub'))
|
||||
tmp.write(response.read())
|
||||
tmp.close()
|
||||
|
||||
zfile = zipfile.ZipFile(tmp.name, 'r')
|
||||
self.report_progress(0,_('extracting epub'))
|
||||
|
||||
zfile.extractall(self.output_dir)
|
||||
|
||||
tmp.close()
|
||||
index = os.path.join(self.output_dir, 'content.opf')
|
||||
|
||||
self.report_progress(1,_('epub downloaded and extracted'))
|
||||
|
||||
return index
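
The recipe above skips article scraping entirely: it relies on build_index() returning the path of the OPF that describes the downloaded content, so it fetches the publisher's own EPUB, unpacks it into output_dir and hands back that EPUB's content.opf. A minimal sketch of the same pattern, with the download step left as a placeholder:

    import os, zipfile
    from calibre.web.feeds.news import BasicNewsRecipe

    class PrebuiltEpubRecipe(BasicNewsRecipe):
        title = u'Prebuilt EPUB (sketch)'

        def fetch_epub(self):
            # placeholder: return the local path of an already downloaded EPUB
            raise NotImplementedError

        def build_index(self):
            zfile = zipfile.ZipFile(self.fetch_epub(), 'r')
            zfile.extractall(self.output_dir)
            return os.path.join(self.output_dir, 'content.opf')
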
|
||||
|
@ -2022,7 +2022,8 @@ var Hyphenator = (function (window) {
|
||||
if (n.nodeType === 3 && n.data.length >= min) { //type 3 = #text -> hyphenate!
|
||||
n.data = n.data.replace(Hyphenator.languages[lang].genRegExp, hyphenate);
|
||||
} else if (n.nodeType === 1) {
|
||||
if (n.lang !== '') {
|
||||
// Modified by Kovid to use element lang only if it has been loaded
|
||||
if (n.lang !== '' && Hyphenator.languages.hasOwnProperty(n.lang)) {
|
||||
Hyphenator.hyphenate(n, n.lang);
|
||||
} else {
|
||||
Hyphenator.hyphenate(n, lang);
|
||||
@ -2139,4 +2140,4 @@ if (Hyphenator.isBookmarklet()) {
|
||||
Hyphenator.config({displaytogglebox: true, intermediatestate: 'visible', doframes: true});
|
||||
Hyphenator.config(Hyphenator.getConfigFromURI());
|
||||
Hyphenator.run();
|
||||
}
|
||||
}
|
||||
|
@ -6,14 +6,43 @@
|
||||
|
||||
function scale_images() {
|
||||
$("img:visible").each(function() {
|
||||
var offset = $(this).offset();
|
||||
var img = $(this);
|
||||
var offset = img.offset();
|
||||
var avail_width = window.innerWidth - offset.left - 5;
|
||||
var avail_height = window.innerHeight - 5;
|
||||
img.css('width', img.data('orig-width'));
|
||||
img.css('height', img.data('orig-height'));
|
||||
var width = img.width();
|
||||
var height = img.height();
|
||||
var ratio = 0;
|
||||
|
||||
if (width > avail_width) {
|
||||
ratio = avail_width / width;
|
||||
img.css('width', avail_width+'px');
|
||||
img.css('height', (ratio*height) + 'px');
|
||||
height = height * ratio;
|
||||
width = width * ratio;
|
||||
}
|
||||
|
||||
if (height > avail_height) {
|
||||
ratio = avail_height / height;
|
||||
img.css('height', avail_height);
|
||||
img.css('width', width * ratio);
|
||||
}
|
||||
//window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width'));
|
||||
$(this).css("max-width", (window.innerWidth-offset.left-5)+"px");
|
||||
$(this).css("max-height", (window.innerHeight-5)+"px");
|
||||
});
|
||||
}
|
||||
|
||||
function store_original_size_attributes() {
|
||||
$("img").each(function() {
|
||||
var img = $(this);
|
||||
img.data('orig-width', img.css('width'));
|
||||
img.data('orig-height', img.css('height'));
|
||||
});
|
||||
}
|
||||
|
||||
function setup_image_scaling_handlers() {
|
||||
store_original_size_attributes();
|
||||
scale_images();
|
||||
$(window).resize(function(){
|
||||
scale_images();
|
||||
|
@ -90,11 +90,13 @@ fc_lib = '/usr/lib'
|
||||
podofo_inc = '/usr/include/podofo'
|
||||
podofo_lib = '/usr/lib'
|
||||
chmlib_inc_dirs = chmlib_lib_dirs = []
|
||||
sqlite_inc_dirs = []
|
||||
|
||||
if iswindows:
|
||||
prefix = r'C:\cygwin\home\kovid\sw'
|
||||
sw_inc_dir = os.path.join(prefix, 'include')
|
||||
sw_lib_dir = os.path.join(prefix, 'lib')
|
||||
sqlite_inc_dirs = [sw_inc_dir]
|
||||
fc_inc = os.path.join(sw_inc_dir, 'fontconfig')
|
||||
fc_lib = sw_lib_dir
|
||||
chmlib_inc_dirs = consolidate('CHMLIB_INC_DIR', os.path.join(prefix,
|
||||
|
@ -18,7 +18,7 @@ from setup.build_environment import fc_inc, fc_lib, chmlib_inc_dirs, \
|
||||
QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, \
|
||||
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
|
||||
magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs, \
|
||||
jpg_lib_dirs, chmlib_lib_dirs
|
||||
jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs
|
||||
MT
|
||||
isunix = islinux or isosx or isfreebsd
|
||||
|
||||
@ -58,6 +58,11 @@ if iswindows:
|
||||
|
||||
extensions = [
|
||||
|
||||
Extension('sqlite_custom',
|
||||
['calibre/library/sqlite_custom.c'],
|
||||
inc_dirs=sqlite_inc_dirs
|
||||
),
|
||||
|
||||
Extension('chmlib',
|
||||
['calibre/utils/chm/swig_chm.c'],
|
||||
libraries=['ChmLib' if iswindows else 'chm'],
|
||||
|
@ -132,7 +132,7 @@ class Win32Freeze(Command, WixMixIn):
|
||||
shutil.copytree(self.j(comext, 'shell'), self.j(sp_dir, 'win32com', 'shell'))
|
||||
shutil.rmtree(comext)
|
||||
|
||||
for pat in (r'numpy', r'PyQt4\uic\port_v3'):
|
||||
for pat in (r'PyQt4\uic\port_v3', ):
|
||||
x = glob.glob(self.j(self.lib_dir, 'site-packages', pat))[0]
|
||||
shutil.rmtree(x)
|
||||
|
||||
|
@ -19,7 +19,7 @@ Set CMAKE_PREFIX_PATH environment variable to C:\cygwin\home\kovid\sw

This is where all dependencies will be installed.

Add C:\Python26\Scripts and C:\Python26 to PATH
Add C:\Python27\Scripts and C:\Python27 to PATH

Install setuptools from http://pypi.python.org/pypi/setuptools
If there are no windows binaries already compiled for the version of python you are using then download the source and run the following command in the folder where the source has been unpacked::
@ -28,10 +28,16 @@ If there are no windows binaries already compiled for the version of python you

Run the following command to install python dependencies::

    easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython cssutils clientform
    easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto

Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)
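One way to do this (an illustrative example, not part of the original notes; it assumes the 3.0.8.1 release of the 3.0.x series) is to let easy_install fetch a pinned version, which places it in site-packages::

    easy_install --always-unzip "BeautifulSoup==3.0.8.1"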


SQLite
---------

Put sqlite3*.h from the sqlite windows amalgamation in ~/sw/include
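For example, from a cygwin shell (illustrative only; the folder name depends on the amalgamation version you unpacked)::

    cp sqlite-amalgamation-*/sqlite3*.h ~/sw/include/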

Qt
--------

@ -632,6 +632,10 @@ def main(outfile, args=sys.argv[1:]):
except tokenize.TokenError, e:
print >> sys.stderr, '%s: %s, line %d, column %d' % (
e[0], filename, e[1][0], e[1][1])
except IndentationError, e:
print >> sys.stderr, '%s: %s, line %s, column %s' % (
e[0], filename, e.lineno, e[1][1])

finally:
if closep:
fp.close()
@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.7.29'
__version__ = '0.7.30'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

import re
@ -457,7 +457,7 @@ from calibre.devices.blackberry.driver import BLACKBERRY
|
||||
from calibre.devices.cybook.driver import CYBOOK, ORIZON
|
||||
from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \
|
||||
POCKETBOOK360, GER2, ITALICA, ECLICTO, DBOOK, INVESBOOK, \
|
||||
BOOQ, ELONEX, POCKETBOOK301, MENTOR
|
||||
BOOQ, ELONEX, POCKETBOOK301, MENTOR, POCKETBOOK602
|
||||
from calibre.devices.iliad.driver import ILIAD
|
||||
from calibre.devices.irexdr.driver import IREXDR1000, IREXDR800
|
||||
from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI
|
||||
@ -476,13 +476,14 @@ from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, \
|
||||
SOVOS, PICO
|
||||
from calibre.devices.sne.driver import SNE
|
||||
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, \
|
||||
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600
|
||||
GEMEI, VELOCITYMICRO, PDNOVEL_KOBO, Q600, LUMIREAD
|
||||
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
|
||||
from calibre.devices.kobo.driver import KOBO
|
||||
|
||||
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
|
||||
LibraryThing
|
||||
from calibre.ebooks.metadata.douban import DoubanBooks
|
||||
from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
|
||||
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
|
||||
LibraryThingCovers, DoubanCovers
|
||||
from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
|
||||
@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
|
||||
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
|
||||
|
||||
plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
|
||||
LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
|
||||
Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
|
||||
LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
|
||||
Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
|
||||
NiceBooksCovers]
|
||||
plugins += [
|
||||
ComicInput,
|
||||
EPUBInput,
|
||||
@ -545,6 +547,7 @@ plugins += [
|
||||
SHINEBOOK,
|
||||
POCKETBOOK360,
|
||||
POCKETBOOK301,
|
||||
POCKETBOOK602,
|
||||
KINDLE,
|
||||
KINDLE2,
|
||||
KINDLE_DX,
|
||||
@ -597,6 +600,7 @@ plugins += [
|
||||
GEMEI,
|
||||
VELOCITYMICRO,
|
||||
PDNOVEL_KOBO,
|
||||
LUMIREAD,
|
||||
ITUNES,
|
||||
]
|
||||
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
|
||||
|
@ -678,6 +678,15 @@ class NookOutput(OutputProfile):
|
||||
fbase = 16
|
||||
fsizes = [12, 12, 14, 16, 18, 20, 22, 24]
|
||||
|
||||
class NookColorOutput(NookOutput):
|
||||
name = 'Nook Color'
|
||||
short_name = 'nook_color'
|
||||
description = _('This profile is intended for the B&N Nook Color.')
|
||||
|
||||
screen_size = (600, 980)
|
||||
comic_screen_size = (584, 980)
|
||||
dpi = 169
|
||||
|
||||
class BambookOutput(OutputProfile):
|
||||
|
||||
author = 'Li Fanxi'
|
||||
@ -698,6 +707,6 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output,
|
||||
iPadOutput, KoboReaderOutput, TabletOutput,
|
||||
SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput,
|
||||
IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput,
|
||||
BambookOutput, ]
|
||||
BambookOutput, NookColorOutput]
|
||||
|
||||
output_profiles.sort(cmp=lambda x,y:cmp(x.name.lower(), y.name.lower()))
|
||||
|
@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
config['enabled_plugins'] = ep

default_disabled_plugins = set([
'Douban Books', 'Douban.com covers',
'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
])

def is_disabled(plugin):
@ -227,4 +227,22 @@ class POCKETBOOK301(USBMS):
    PRODUCT_ID = [0x301]
    BCD = [0x132]

class POCKETBOOK602(USBMS):

    name = 'PocketBook Pro 602 Device Interface'
    description = _('Communicate with the PocketBook 602 reader.')
    author = 'Kovid Goyal'
    supported_platforms = ['windows', 'osx', 'linux']
    FORMATS = ['epub', 'fb2', 'prc', 'mobi', 'pdf', 'djvu', 'rtf', 'chm',
        'doc', 'tcr', 'txt']

    EBOOK_DIR_MAIN = 'books'
    SUPPORTS_SUB_DIRS = True

    VENDOR_ID = [0x0525]
    PRODUCT_ID = [0xa4a5]
    BCD = [0x0324]

    VENDOR_NAME = ''
    WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'PB602'

@ -93,7 +93,7 @@ class KOBO(USBMS):
|
||||
lpath = path.partition(self.normalize_path(prefix))[2]
|
||||
if lpath.startswith(os.sep):
|
||||
lpath = lpath[len(os.sep):]
|
||||
lpath = lpath.replace('\\', '/')
|
||||
lpath = lpath.replace('\\', '/')
|
||||
# debug_print("LPATH: ", lpath, " - Title: " , title)
|
||||
|
||||
playlist_map = {}
|
||||
@ -229,6 +229,10 @@ class KOBO(USBMS):
|
||||
#Delete the volume_shortcovers second
|
||||
cursor.execute('delete from volume_shortcovers where volumeid = ?', t)
|
||||
|
||||
# Delete the rows from content_keys
|
||||
if self.dbversion >= 8:
|
||||
cursor.execute('delete from content_keys where volumeid = ?', t)
|
||||
|
||||
# Delete the chapters associated with the book next
|
||||
t = (ContentID,ContentID,)
|
||||
cursor.execute('delete from content where BookID = ? or ContentID = ?', t)
|
||||
@ -354,7 +358,7 @@ class KOBO(USBMS):
|
||||
ContentID = ContentID.replace(self._main_prefix, '')
|
||||
else:
|
||||
ContentID = path
|
||||
ContentID = ContentID.replace(self._main_prefix + '.kobo/kepub/', '')
|
||||
ContentID = ContentID.replace(self._main_prefix + self.normalize_path('.kobo/kepub/'), '')
|
||||
|
||||
if self._card_a_prefix is not None:
|
||||
ContentID = ContentID.replace(self._card_a_prefix, '')
|
||||
@ -507,7 +511,10 @@ class KOBO(USBMS):
|
||||
t = (ContentID,)
|
||||
cursor.execute('select DateLastRead from Content where BookID is Null and ContentID = ?', t)
|
||||
result = cursor.fetchone()
|
||||
datelastread = result[0] if result[0] is not None else '1970-01-01T00:00:00'
|
||||
if result is None:
|
||||
datelastread = '1970-01-01T00:00:00'
|
||||
else:
|
||||
datelastread = result[0] if result[0] is not None else '1970-01-01T00:00:00'
|
||||
|
||||
t = (datelastread,ContentID,)
|
||||
|
||||
|
@ -174,3 +174,33 @@ class GEMEI(USBMS):
    EBOOK_DIR_MAIN = 'eBooks'
    SUPPORTS_SUB_DIRS = True

class LUMIREAD(USBMS):
    name = 'Acer Lumiread Device Interface'
    gui_name = 'Lumiread'
    description = _('Communicate with the Acer Lumiread')
    author = 'Kovid Goyal'
    supported_platforms = ['windows', 'osx', 'linux']

    # Ordered list of supported formats
    FORMATS = ['epub', 'pdf', 'mobi', 'chm', 'txt', 'doc', 'docx', 'rtf']

    VENDOR_ID = [0x1025]
    PRODUCT_ID = [0x048d]
    BCD = [0x323]

    EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'books'
    SUPPORTS_SUB_DIRS = True

    THUMBNAIL_HEIGHT = 200

    def upload_cover(self, path, filename, metadata, filepath):
        if metadata.thumbnail and metadata.thumbnail[-1]:
            cfilepath = filepath.replace('/', os.sep)
            cfilepath = cfilepath.replace(os.sep+'books'+os.sep,
                    os.sep+'covers'+os.sep, 1)
            pdir = os.path.dirname(cfilepath)
            if not os.path.exists(pdir):
                os.makedirs(pdir)
            with open(cfilepath+'.jpg', 'wb') as f:
                f.write(metadata.thumbnail[-1])

@ -220,13 +220,13 @@ class Dehyphenator(object):
|
||||
self.html = html
|
||||
self.format = format
|
||||
if format == 'html':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
|
||||
elif format == 'pdf':
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
|
||||
elif format == 'individual_words':
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
|
||||
elif format == 'html_cleanup':
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||
intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
|
||||
|
||||
html = intextmatch.sub(self.dehyphenate, html)
|
||||
return html
|
||||
|
@ -22,12 +22,12 @@ class PreProcessor(object):
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log("found " + unicode(self.html_preprocess_sections) +
|
||||
self.log("marked " + unicode(self.html_preprocess_sections) +
|
||||
" chapters. - " + unicode(chap))
|
||||
return '<h2>'+chap+'</h2>\n'
|
||||
else:
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log("found " + unicode(self.html_preprocess_sections) +
|
||||
self.log("marked " + unicode(self.html_preprocess_sections) +
|
||||
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
||||
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
||||
|
||||
@ -83,12 +83,30 @@ class PreProcessor(object):
|
||||
if min_lns > tot_htm_ends:
|
||||
return True
|
||||
|
||||
def dump(self, raw, where):
|
||||
import os
|
||||
dp = getattr(self.extra_opts, 'debug_pipeline', None)
|
||||
if dp and os.path.exists(dp):
|
||||
odir = os.path.join(dp, 'preprocess')
|
||||
if not os.path.exists(odir):
|
||||
os.makedirs(odir)
|
||||
if os.path.exists(odir):
|
||||
odir = os.path.join(odir, where)
|
||||
if not os.path.exists(odir):
|
||||
os.makedirs(odir)
|
||||
name, i = None, 0
|
||||
while not name or os.path.exists(os.path.join(odir, name)):
|
||||
i += 1
|
||||
name = '%04d.html'%i
|
||||
with open(os.path.join(odir, name), 'wb') as f:
|
||||
f.write(raw.encode('utf-8'))
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = re.sub(r"\s*</p>", "</p>\n", html)
|
||||
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
|
||||
html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
|
||||
|
||||
###### Check Markup ######
|
||||
#
|
||||
@ -150,52 +168,61 @@ class PreProcessor(object):
|
||||
#print "blanks between paragraphs is marked True"
|
||||
else:
|
||||
blanks_between_paragraphs = False
|
||||
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
|
||||
#self.dump(html, 'before_chapter_markup')
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
#
|
||||
# Build the Regular Expressions in pieces
|
||||
lookahead = "(?=<(p|div))"
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
title_header_open = r"(?P<title>"
|
||||
chapter_header_close = ")\s*"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
|
||||
title_header_close = ")"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||
|
||||
if blanks_between_paragraphs:
|
||||
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
|
||||
else:
|
||||
blank_lines = ""
|
||||
opt_title_open = "("
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
title_header_open = "(?P<title>"
|
||||
title_header_close = ")\s*"
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
|
||||
opt_title_close = ")?"
|
||||
n_lookahead_open = "\s+(?!"
|
||||
n_lookahead_close = ")"
|
||||
|
||||
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
|
||||
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
|
||||
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
|
||||
uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
|
||||
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
||||
|
||||
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
#print chapter_marker
|
||||
min_chapters = 10
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||
#
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
if self.html_preprocess_sections < 10:
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
if self.html_preprocess_sections < 10:
|
||||
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
|
||||
chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
html = chapdetect2.sub(self.chapter_head, html)
|
||||
|
||||
if self.html_preprocess_sections < 10:
|
||||
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
|
||||
chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
||||
html = chapdetect2.sub(self.chapter_head, html)
|
||||
chapter_types = [
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
||||
]
|
||||
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
||||
if self.html_preprocess_sections >= min_chapters:
|
||||
break
|
||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
if lookahead_ignorecase:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
else:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
||||
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
|
||||
|
||||
###### Unwrap lines ######
|
||||
#
|
||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||
@ -232,6 +259,7 @@ class PreProcessor(object):
|
||||
html = dehyphenator(html,'html', length)
|
||||
self.log("Done dehyphenating")
|
||||
# Unwrap lines using punctation and line length
|
||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
html = unwrap.sub(' ', html)
|
||||
#check any remaining hyphens, but only unwrap if there is a match
|
||||
@ -248,10 +276,10 @@ class PreProcessor(object):
|
||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < 10:
|
||||
if self.html_preprocess_sections < 5:
|
||||
self.log("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
# search for places where a first or second level heading is immediately followed by another
|
||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||
@ -262,4 +290,7 @@ class PreProcessor(object):
|
||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||
|
||||
# Center separator lines
|
||||
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
|
||||
|
||||
return html
|
||||
|
9
src/calibre/ebooks/iterator/__init__.py
Normal file
@ -0,0 +1,9 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


@ -145,18 +145,21 @@ class MetadataSource(Plugin): # {{{
|
||||
setattr(w, '_'+x, cb)
|
||||
cb.setChecked(c.get(x, True))
|
||||
w._layout.addWidget(cb)
|
||||
|
||||
cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name))
|
||||
setattr(w, '_textcomments', cb)
|
||||
cb.setChecked(c.get('textcomments', False))
|
||||
w._layout.addWidget(cb)
|
||||
|
||||
if self.has_html_comments:
|
||||
cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name))
|
||||
setattr(w, '_textcomments', cb)
|
||||
cb.setChecked(c.get('textcomments', False))
|
||||
w._layout.addWidget(cb)
|
||||
|
||||
return w
|
||||
|
||||
def save_settings(self, w):
|
||||
dl_settings = {}
|
||||
for x in ('rating', 'tags', 'comments', 'textcomments'):
|
||||
for x in ('rating', 'tags', 'comments'):
|
||||
dl_settings[x] = getattr(w, '_'+x).isChecked()
|
||||
if self.has_html_comments:
|
||||
dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked()
|
||||
c = self.config_store()
|
||||
c.set(self.name, dl_settings)
|
||||
if hasattr(w, '_sc'):
|
||||
|
@ -90,10 +90,8 @@ def build_isbn(base_url, opts):
|
||||
return base_url + 'index1=isbn&value1='+opts.isbn
|
||||
|
||||
def build_combined(base_url, opts):
|
||||
query = ''
|
||||
for e in (opts.title, opts.author, opts.publisher):
|
||||
if e is not None:
|
||||
query += ' ' + e
|
||||
query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \
|
||||
if e is not None ])
|
||||
query = query.strip()
|
||||
if len(query) == 0:
|
||||
raise ISBNDBError('You must specify at least one of --author, --title or --publisher')
|
||||
@ -141,15 +139,8 @@ def create_books(opts, args, timeout=5.):
|
||||
print ('ISBNDB query: '+url)
|
||||
|
||||
tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
|
||||
ans = []
|
||||
for x in tans:
|
||||
add = True
|
||||
for y in ans:
|
||||
if y.isbn == x.isbn:
|
||||
add = False
|
||||
if add:
|
||||
ans.append(x)
|
||||
return ans
|
||||
#remove duplicates ISBN
|
||||
return list(dict((book.isbn, book) for book in tans).values())
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
|
424
src/calibre/ebooks/metadata/nicebooks.py
Normal file
@ -0,0 +1,424 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, textwrap, re, traceback, socket
|
||||
from urllib import urlencode
|
||||
from math import ceil
|
||||
from copy import deepcopy
|
||||
|
||||
from lxml.html import soupparser
|
||||
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
authors_to_sort_string
|
||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.ebooks.metadata.covers import CoverDownload
|
||||
from calibre.utils.config import OptionParser
|
||||
|
||||
class NiceBooks(MetadataSource):
|
||||
|
||||
name = 'Nicebooks'
|
||||
description = _('Downloads metadata from French Nicebooks')
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
author = 'Sengian'
|
||||
version = (1, 0, 0)
|
||||
|
||||
def fetch(self):
|
||||
try:
|
||||
self.results = search(self.title, self.book_author, self.publisher,
|
||||
self.isbn, max_results=10, verbose=self.verbose)
|
||||
except Exception, e:
|
||||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
|
||||
class NiceBooksCovers(CoverDownload):
|
||||
|
||||
name = 'Nicebooks covers'
|
||||
description = _('Downloads covers from French Nicebooks')
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
author = 'Sengian'
|
||||
type = _('Cover download')
|
||||
version = (1, 0, 0)
|
||||
|
||||
def has_cover(self, mi, ans, timeout=5.):
|
||||
if not mi.isbn:
|
||||
return False
|
||||
br = browser()
|
||||
try:
|
||||
entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
|
||||
if Covers(mi.isbn)(entry).check_cover():
|
||||
self.debug('cover for', mi.isbn, 'found')
|
||||
ans.set()
|
||||
except Exception, e:
|
||||
self.debug(e)
|
||||
|
||||
def get_covers(self, mi, result_queue, abort, timeout=5.):
|
||||
if not mi.isbn:
|
||||
return
|
||||
br = browser()
|
||||
try:
|
||||
entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
|
||||
cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout)
|
||||
if not ext:
|
||||
ext = 'jpg'
|
||||
result_queue.put((True, cover_data, ext, self.name))
|
||||
except Exception, e:
|
||||
result_queue.put((False, self.exception_to_string(e),
|
||||
traceback.format_exc(), self.name))
|
||||
|
||||
|
||||
def report(verbose):
|
||||
if verbose:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def replace_monthsfr(datefr):
|
||||
# Replace french months by english equivalent for parse_date
|
||||
frtoen = {
|
||||
u'[jJ]anvier': u'jan',
|
||||
u'[fF].vrier': u'feb',
|
||||
u'[mM]ars': u'mar',
|
||||
u'[aA]vril': u'apr',
|
||||
u'[mM]ai': u'may',
|
||||
u'[jJ]uin': u'jun',
|
||||
u'[jJ]uillet': u'jul',
|
||||
u'[aA]o.t': u'aug',
|
||||
u'[sS]eptembre': u'sep',
|
||||
u'[Oo]ctobre': u'oct',
|
||||
u'[nN]ovembre': u'nov',
|
||||
u'[dD].cembre': u'dec' }
|
||||
for k in frtoen.iterkeys():
|
||||
tmp = re.sub(k, frtoen[k], datefr)
|
||||
if tmp <> datefr: break
|
||||
return tmp
|
||||
|
||||
class Query(object):
|
||||
|
||||
BASE_URL = 'http://fr.nicebooks.com/'
|
||||
|
||||
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20):
|
||||
assert not(title is None and author is None and publisher is None \
|
||||
and isbn is None and keywords is None)
|
||||
assert (max_results < 21)
|
||||
|
||||
self.max_results = int(max_results)
|
||||
|
||||
if isbn is not None:
|
||||
q = isbn
|
||||
else:
|
||||
q = ' '.join([i for i in (title, author, publisher, keywords) \
|
||||
if i is not None])
|
||||
|
||||
if isinstance(q, unicode):
|
||||
q = q.encode('utf-8')
|
||||
self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'})
|
||||
|
||||
def __call__(self, browser, verbose, timeout = 5.):
|
||||
if verbose:
|
||||
print 'Query:', self.BASE_URL+self.urldata
|
||||
|
||||
try:
|
||||
raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
|
||||
except Exception, e:
|
||||
report(verbose)
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return
|
||||
raise
|
||||
if '<title>404 - ' in raw:
|
||||
return
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
return
|
||||
|
||||
#nb of page to call
|
||||
try:
|
||||
nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
|
||||
except:
|
||||
#direct hit
|
||||
return [feed]
|
||||
|
||||
nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
|
||||
pages =[feed]
|
||||
if nbpagetoquery > 1:
|
||||
for i in xrange(2, nbpagetoquery + 1):
|
||||
try:
|
||||
urldata = self.urldata + '&p=' + str(i)
|
||||
raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read()
|
||||
except Exception, e:
|
||||
continue
|
||||
if '<title>404 - ' in raw:
|
||||
continue
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
continue
|
||||
pages.append(feed)
|
||||
|
||||
results = []
|
||||
for x in pages:
|
||||
results.extend([i.find_class('title')[0].get('href') \
|
||||
for i in x.xpath("//ul[@id='results']/li")])
|
||||
return results[:self.max_results]
|
||||
|
||||
class ResultList(list):
|
||||
|
||||
BASE_URL = 'http://fr.nicebooks.com'
|
||||
|
||||
def __init__(self):
|
||||
self.repub = re.compile(u'\s*.diteur\s*', re.I)
|
||||
self.reauteur = re.compile(u'\s*auteur.*', re.I)
|
||||
self.reautclean = re.compile(u'\s*\(.*\)\s*')
|
||||
|
||||
def get_title(self, entry):
|
||||
# title = deepcopy(entry.find("div[@id='book-info']"))
|
||||
title = deepcopy(entry)
|
||||
title.remove(title.find("dl[@title='Informations sur le livre']"))
|
||||
title = ' '.join([i.text_content() for i in title.iterchildren()])
|
||||
return unicode(title.replace('\n', ''))
|
||||
|
||||
def get_authors(self, entry):
|
||||
# author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
||||
author = entry.find("dl[@title='Informations sur le livre']")
|
||||
authortext = []
|
||||
for x in author.getiterator('dt'):
|
||||
if self.reauteur.match(x.text):
|
||||
elt = x.getnext()
|
||||
while elt.tag == 'dd':
|
||||
authortext.append(unicode(elt.text_content()))
|
||||
elt = elt.getnext()
|
||||
break
|
||||
if len(authortext) == 1:
|
||||
authortext = [self.reautclean.sub('', authortext[0])]
|
||||
return authortext
|
||||
|
||||
def get_description(self, entry, verbose):
|
||||
try:
|
||||
return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text)
|
||||
except:
|
||||
report(verbose)
|
||||
return None
|
||||
|
||||
def get_book_info(self, entry, mi, verbose):
|
||||
entry = entry.find("dl[@title='Informations sur le livre']")
|
||||
for x in entry.getiterator('dt'):
|
||||
if x.text == 'ISBN':
|
||||
isbntext = x.getnext().text_content().replace('-', '')
|
||||
if check_isbn(isbntext):
|
||||
mi.isbn = unicode(isbntext)
|
||||
elif self.repub.match(x.text):
|
||||
mi.publisher = unicode(x.getnext().text_content())
|
||||
elif x.text == 'Langue':
|
||||
mi.language = unicode(x.getnext().text_content())
|
||||
elif x.text == 'Date de parution':
|
||||
d = x.getnext().text_content()
|
||||
try:
|
||||
default = utcnow().replace(day=15)
|
||||
d = replace_monthsfr(d)
|
||||
d = parse_date(d, assume_utc=True, default=default)
|
||||
mi.pubdate = d
|
||||
except:
|
||||
report(verbose)
|
||||
return mi
|
||||
|
||||
def fill_MI(self, entry, title, authors, verbose):
|
||||
mi = MetaInformation(title, authors)
|
||||
mi.author_sort = authors_to_sort_string(authors)
|
||||
mi.comments = self.get_description(entry, verbose)
|
||||
# entry = entry.find("dl[@title='Informations sur le livre']")
|
||||
# mi.publisher = self.get_publisher(entry)
|
||||
# mi.pubdate = self.get_date(entry, verbose)
|
||||
# mi.isbn = self.get_ISBN(entry)
|
||||
# mi.language = self.get_language(entry)
|
||||
return self.get_book_info(entry, mi, verbose)
|
||||
|
||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
||||
try:
|
||||
raw = browser.open_novisit(self.BASE_URL + linkdata).read()
|
||||
except Exception, e:
|
||||
report(verbose)
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return
|
||||
raise
|
||||
if '<title>404 - ' in raw:
|
||||
report(verbose)
|
||||
return
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
return
|
||||
|
||||
# get results
|
||||
return feed.xpath("//div[@id='container']")[0]
|
||||
|
||||
def populate(self, entries, browser, verbose=False):
|
||||
#single entry
|
||||
if len(entries) == 1 and not isinstance(entries[0], str):
|
||||
try:
|
||||
entry = entries[0].xpath("//div[@id='container']")[0]
|
||||
entry = entry.find("div[@id='book-info']")
|
||||
title = self.get_title(entry)
|
||||
authors = self.get_authors(entry)
|
||||
except Exception, e:
|
||||
if verbose:
|
||||
print 'Failed to get all details for an entry'
|
||||
print e
|
||||
return
|
||||
self.append(self.fill_MI(entry, title, authors, verbose))
|
||||
else:
|
||||
#multiple entries
|
||||
for x in entries:
|
||||
try:
|
||||
entry = self.get_individual_metadata(browser, x, verbose)
|
||||
entry = entry.find("div[@id='book-info']")
|
||||
title = self.get_title(entry)
|
||||
authors = self.get_authors(entry)
|
||||
except Exception, e:
|
||||
if verbose:
|
||||
print 'Failed to get all details for an entry'
|
||||
print e
|
||||
continue
|
||||
self.append(self.fill_MI(entry, title, authors, verbose))
|
||||
|
||||
|
||||
class NiceBooksError(Exception):
|
||||
pass
|
||||
|
||||
class ISBNNotFound(NiceBooksError):
|
||||
pass
|
||||
|
||||
class Covers(object):
|
||||
|
||||
def __init__(self, isbn = None):
|
||||
assert isbn is not None
|
||||
self.urlimg = ''
|
||||
self.isbn = isbn
|
||||
self.isbnf = False
|
||||
|
||||
def __call__(self, entry = None):
|
||||
try:
|
||||
self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href')
|
||||
except:
|
||||
return self
|
||||
isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']")
|
||||
for x in isbno.getiterator('dt'):
|
||||
if x.text == 'ISBN' and check_isbn(x.getnext().text_content()):
|
||||
self.isbnf = True
|
||||
break
|
||||
return self
|
||||
|
||||
def check_cover(self):
|
||||
return True if self.urlimg else False
|
||||
|
||||
def get_cover(self, browser, timeout = 5.):
|
||||
try:
|
||||
cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \
|
||||
self.urlimg.rpartition('.')[-1]
|
||||
return cover, ext if ext else 'jpg'
|
||||
except Exception, err:
|
||||
if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
|
||||
err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
raise err
|
||||
if not len(self.urlimg):
|
||||
if not self.isbnf:
|
||||
raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
|
||||
raise NiceBooksError(_('An error occurred with the Nicebooks cover fetcher'))
|
||||
|
||||
|
||||
def search(title=None, author=None, publisher=None, isbn=None,
|
||||
max_results=5, verbose=False, keywords=None):
|
||||
br = browser()
|
||||
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
|
||||
keywords=keywords, max_results=max_results)(br, verbose)
|
||||
|
||||
if entries is None or len(entries) == 0:
|
||||
return
|
||||
|
||||
#List of entry
|
||||
ans = ResultList()
|
||||
ans.populate(entries, br, verbose)
|
||||
return ans
|
||||
|
||||
def check_for_cover(isbn):
|
||||
br = browser()
|
||||
entry = Query(isbn=isbn, max_results=1)(br, False)[0]
|
||||
return Covers(isbn)(entry).check_cover()
|
||||
|
||||
def cover_from_isbn(isbn, timeout = 5.):
|
||||
br = browser()
|
||||
entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0]
|
||||
return Covers(isbn)(entry).get_cover(br, timeout)
|
||||
|
||||
|
||||
def option_parser():
|
||||
parser = OptionParser(textwrap.dedent(\
|
||||
'''\
|
||||
%prog [options]
|
||||
|
||||
Fetch book metadata from Nicebooks. You must specify one of title, author,
|
||||
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
|
||||
so you should make your query as specific as possible.
|
||||
It can also get covers if the option is activated.
|
||||
'''
|
||||
))
|
||||
parser.add_option('-t', '--title', help='Book title')
|
||||
parser.add_option('-a', '--author', help='Book author(s)')
|
||||
parser.add_option('-p', '--publisher', help='Book publisher')
|
||||
parser.add_option('-i', '--isbn', help='Book ISBN')
|
||||
parser.add_option('-k', '--keywords', help='Keywords')
|
||||
parser.add_option('-c', '--covers', default=0,
|
||||
help='Covers: 1-Check/ 2-Download')
|
||||
parser.add_option('--coverspath', default='',
|
||||
help='Covers files path')
|
||||
parser.add_option('-m', '--max-results', default=20,
|
||||
help='Maximum number of results to fetch')
|
||||
parser.add_option('-v', '--verbose', default=0, action='count',
|
||||
help='Be more verbose about errors')
|
||||
return parser
|
||||
|
||||
def main(args=sys.argv):
|
||||
import os
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
try:
|
||||
results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
|
||||
keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
|
||||
except AssertionError:
|
||||
report(True)
|
||||
parser.print_help()
|
||||
return 1
|
||||
if results is None or len(results) == 0:
|
||||
print 'No result found for this search!'
|
||||
return 0
|
||||
for result in results:
|
||||
print unicode(result).encode(preferred_encoding, 'replace')
|
||||
covact = int(opts.covers)
|
||||
if covact == 1:
|
||||
textcover = 'No cover found!'
|
||||
if check_for_cover(result.isbn):
|
||||
textcover = 'A cover was found for this book'
|
||||
print textcover
|
||||
elif covact == 2:
|
||||
cover_data, ext = cover_from_isbn(result.isbn)
|
||||
cpath = result.isbn
|
||||
if len(opts.coverspath):
|
||||
cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
|
||||
oname = os.path.abspath(cpath+'.'+ext)
|
||||
open(oname, 'wb').write(cover_data)
|
||||
print 'Cover saved to file ', oname
|
||||
print
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
@ -475,7 +475,14 @@ class MobiReader(object):
|
||||
self.processed_html = self.processed_html.replace('\r\n', '\n')
|
||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
||||
self.processed_html = re.sub(r'<?xml[^>]*>', '', self.processed_html)
|
||||
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
||||
# Swap inline and block level elements, and order block level elements according to priority
|
||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</blockquote[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<blockquote[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
|
||||
|
||||
|
||||
def remove_random_bytes(self, html):
|
||||
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
|
||||
|
@ -55,18 +55,31 @@ class SVGRasterizer(object):
|
||||
self.rasterize_cover()
|
||||
|
||||
def rasterize_svg(self, elem, width=0, height=0, format='PNG'):
|
||||
view_box = elem.get('viewBox', elem.get('viewbox', None))
|
||||
sizes = None
|
||||
logger = self.oeb.logger
|
||||
|
||||
if view_box is not None:
|
||||
box = [float(x) for x in view_box.split()]
|
||||
sizes = [box[2]-box[0], box[3] - box[1]]
|
||||
for image in elem.xpath('descendant::*[local-name()="image" and '
|
||||
'@height and contains(@height, "%")]'):
|
||||
logger.info('Found SVG image height in %, trying to convert...')
|
||||
try:
|
||||
h = float(image.get('height').replace('%', ''))/100.
|
||||
image.set('height', str(h*sizes[1]))
|
||||
except:
|
||||
logger.exception('Failed to convert percentage height:',
|
||||
image.get('height'))
|
||||
|
||||
data = QByteArray(xml2str(elem, with_tail=False))
|
||||
svg = QSvgRenderer(data)
|
||||
size = svg.defaultSize()
|
||||
view_box = elem.get('viewBox', elem.get('viewbox', None))
|
||||
if size.width() == 100 and size.height() == 100 \
|
||||
and view_box is not None:
|
||||
box = [float(x) for x in view_box.split()]
|
||||
size.setWidth(box[2] - box[0])
|
||||
size.setHeight(box[3] - box[1])
|
||||
if size.width() == 100 and size.height() == 100 and sizes:
|
||||
size.setWidth(sizes[0])
|
||||
size.setHeight(sizes[1])
|
||||
if width or height:
|
||||
size.scale(width, height, Qt.KeepAspectRatio)
|
||||
logger = self.oeb.logger
|
||||
logger.info('Rasterizing %r to %dx%d'
|
||||
% (elem, size.width(), size.height()))
|
||||
image = QImage(size, QImage.Format_ARGB32_Premultiplied)
|
||||
|
@ -9,6 +9,7 @@ import os
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
|
||||
class PDBInput(InputFormatPlugin):
|
||||
|
||||
@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
|
||||
opf = reader.extract_content(os.getcwd())
|
||||
|
||||
return opf
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
@ -114,7 +114,7 @@ class RTFInput(InputFormatPlugin):
|
||||
group_borders = 1,
|
||||
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs = 0,
|
||||
empty_paragraphs = 1,
|
||||
)
|
||||
parser.parse_rtf()
|
||||
ans = open('out.xml').read()
|
||||
@ -289,6 +289,10 @@ class RTFInput(InputFormatPlugin):
|
||||
with open(html, 'wb') as f:
|
||||
res = transform.tostring(result)
|
||||
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
||||
if not getattr(self.options, 'remove_paragraph_spacing', False):
|
||||
res = re.sub('\s*<body>', '<body>', res)
|
||||
res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
|
||||
if self.options.preprocess_html:
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
res = preprocessor(res)
|
||||
|
@ -81,7 +81,9 @@ def txt2rtf(text):
|
||||
buf = cStringIO.StringIO()
|
||||
for x in text:
|
||||
val = ord(x)
|
||||
if val <= 127:
|
||||
if val == 160:
|
||||
buf.write('\\~')
|
||||
elif val <= 127:
|
||||
buf.write(x)
|
||||
else:
|
||||
repl = ascii_text(x)
|
||||
@ -191,6 +193,10 @@ class RTFMLizer(object):
|
||||
def dump_text(self, elem, stylizer, tag_stack=[]):
|
||||
if not isinstance(elem.tag, basestring) \
|
||||
or namespace(elem.tag) != XHTML_NS:
|
||||
p = elem.getparent()
|
||||
if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
|
||||
and elem.tail:
|
||||
return elem.tail
|
||||
return u''
|
||||
|
||||
text = u''
|
||||
|
@ -155,6 +155,10 @@ class TXTMLizer(object):
|
||||
|
||||
if not isinstance(elem.tag, basestring) \
|
||||
or namespace(elem.tag) != XHTML_NS:
|
||||
p = elem.getparent()
|
||||
if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
|
||||
and elem.tail:
|
||||
return [elem.tail]
|
||||
return ['']
|
||||
|
||||
text = ['']
|
||||
|
@ -89,14 +89,18 @@ class AddAction(InterfaceAction):
|
||||
self.gui.library_view.model().db.import_book(MetaInformation(None), [])
|
||||
self.gui.library_view.model().books_added(num)
|
||||
|
||||
def add_isbns(self, isbns):
|
||||
def add_isbns(self, books):
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
ids = set([])
|
||||
for x in isbns:
|
||||
for x in books:
|
||||
mi = MetaInformation(None)
|
||||
mi.isbn = x
|
||||
ids.add(self.gui.library_view.model().db.import_book(mi, []))
|
||||
self.gui.library_view.model().books_added(len(isbns))
|
||||
mi.isbn = x['isbn']
|
||||
db = self.gui.library_view.model().db
|
||||
if x['path'] is not None:
|
||||
ids.add(db.import_book(mi, [x['path']]))
|
||||
else:
|
||||
ids.add(db.import_book(mi, []))
|
||||
self.gui.library_view.model().books_added(len(books))
|
||||
self.gui.iactions['Edit Metadata'].do_download_metadata(ids)
|
||||
|
||||
|
||||
@ -150,7 +154,7 @@ class AddAction(InterfaceAction):
|
||||
from calibre.gui2.dialogs.add_from_isbn import AddFromISBN
|
||||
d = AddFromISBN(self.gui)
|
||||
if d.exec_() == d.Accepted:
|
||||
self.add_isbns(d.isbns)
|
||||
self.add_isbns(d.books)
|
||||
|
||||
def add_books(self, *args):
|
||||
'''
|
||||
|
@ -132,9 +132,9 @@ class CheckIntegrity(QProgressDialog):
titles = [self.db.title(x, index_is_id=True) for x in bad]
det_msg = '\n'.join(titles)
warning_dialog(self, _('Some inconsistencies found'),
_('The following books had formats listed in the '
_('The following books had formats or covers listed in the '
'database that are not actually available. '
'The entries for the formats have been removed. '
'The entries for the formats/covers have been removed. '
'You should check them manually. This can '
'happen if you manipulate the files in the '
'library folder directly.'), det_msg=det_msg, show=True)