Implement a check setup command that uses PyFlakes to check for various errors

Kovid Goyal 2009-09-07 19:03:52 -06:00
parent 792c6b0b22
commit f9ff180347
206 changed files with 12460 additions and 13498 deletions

View File

@@ -1,5 +1,5 @@
*_ui.py
-moc_*.cpp
+.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/manual/.build/

View File

@@ -6,7 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os, optparse
-
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
@@ -70,7 +69,7 @@ def main(args=sys.argv):
        command.clean()
        return 0
-    if opts.clean_all():
+    if opts.clean_all:
        for cmd in commands.__all__:
            prints('Cleaning', cmd)
            getattr(commands, cmd).clean()

setup/check.py (new file, 75 lines)
View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os, cPickle, subprocess
from operator import attrgetter

from setup import Command

def check_for_python_errors(filename, builtins):
    from pyflakes import checker, ast

    contents = open(filename, 'rb').read()
    try:
        tree = ast.parse(contents, filename)
    except:
        import traceback
        traceback.print_exc()
        try:
            value = sys.exc_info()[1]
            lineno, offset, line = value[1][1:]
        except IndexError:
            lineno, offset, line = 1, 0, ''
        if line.endswith("\n"):
            line = line[:-1]
        return [SyntaxError(filename, lineno, offset, str(value))]
    else:
        w = checker.Checker(tree, filename, builtins=builtins)
        w.messages.sort(key=attrgetter('lineno'))
        return w.messages

class Check(Command):

    BUILTINS = ['_', '__', 'dynamic_property', 'I', 'P']
    CACHE = '.check-cache.pickle'

    def run(self, opts):
        cache = {}
        if os.path.exists(self.CACHE):
            cache = cPickle.load(open(self.CACHE, 'rb'))
        for x in os.walk(self.j(self.SRC, 'calibre')):
            for f in x[-1]:
                f = self.j(x[0], f)
                mtime = os.stat(f).st_mtime
                if f.endswith('.py') and cache.get(f, 0) != mtime and \
                        self.b(f) not in ('ptempfile.py', 'feedparser.py',
                            'pyparsing.py', 'markdown.py') and \
                        'genshi' not in f and 'prs500/driver.py' not in f:
                    self.info('\tChecking', f)
                    w = check_for_python_errors(f, self.BUILTINS)
                    if w:
                        self.report_errors(w)
                        cPickle.dump(cache, open(self.CACHE, 'wb'), -1)
                        subprocess.call(['gvim', '-f', f])
                        raise SystemExit(1)
                    cache[f] = mtime
        cPickle.dump(cache, open(self.CACHE, 'wb'), -1)

    def report_errors(self, errors):
        for err in errors:
            if isinstance(err, SyntaxError):
                print '\t\tSyntax Error'
            else:
                col = getattr(err, 'col', 0) if getattr(err, 'col', 0) else 0
                lineno = err.lineno if err.lineno else 0
                self.info('\t\t%d:%d:'%(lineno, col),
                        err.message%err.message_args)
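A side note on the pyflakes API this file drives, for readers who want to try it outside the setup framework: the sketch below reproduces the core of check_for_python_errors() against a standalone pyflakes install. It is a hedged sketch, not part of the commit; it assumes a pyflakes whose Checker accepts a parsed AST and whose messages carry lineno, message and message_args (true of the bundled copy above, which exposes its parser as pyflakes.ast, and of later releases via the standard ast module). The helper name pyflakes_messages is hypothetical.

    # Standalone sketch (assumptions noted above); not part of this commit.
    import ast
    from operator import attrgetter
    from pyflakes import checker

    def pyflakes_messages(filename, builtins=('_', 'dynamic_property')):
        source = open(filename, 'rb').read()
        tree = ast.parse(source, filename)  # SyntaxError propagates to the caller
        w = checker.Checker(tree, filename, builtins=builtins)
        w.messages.sort(key=attrgetter('lineno'))
        # Render each warning the way report_errors() above does
        return ['%s:%d: %s' % (filename, m.lineno, m.message % m.message_args)
                for m in w.messages]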

View File

@@ -11,6 +11,7 @@ __all__ = [
    'build',
    'gui',
    'develop',
+   'check',
    ]
@@ -29,6 +30,8 @@ develop = Develop()
from setup.gui import GUI
gui = GUI()

+from setup.check import Check
+check = Check()

commands = {}
for x in __all__:
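The second hunk stops just short of the registry loop, but the pattern is visible: every name in __all__ is expected to be bound to a command instance in this module, so registering 'check' is all it takes to expose the new command to the setup driver (presumably invoked as python setup.py check). A hypothetical sketch of the truncated loop body, assuming the module namespace holds the instances created above:

    # Assumed continuation of the truncated loop (hypothetical, not from the diff):
    commands = {}
    for x in __all__:
        commands[x] = globals()[x]  # e.g. commands['check'] is the Check() above
    # a driver can then dispatch with commands[name].run(opts)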

View File

@@ -78,9 +78,10 @@ class GUI(Command):
            dat = pat.sub(sub, dat)
            if form.endswith('viewer%smain.ui'%os.sep):
-               self.inf('\t\tPromoting WebView')
+               self.info('\t\tPromoting WebView')
                dat = dat.replace('self.view = QtWebKit.QWebView(', 'self.view = DocumentView(')
                dat += '\n\nfrom calibre.gui2.viewer.documentview import DocumentView'
+               dat += '\nQtWebKit'

            open(compiled_form, 'wb').write(dat)

View File

@@ -21,6 +21,11 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
                              filesystem_encoding
import mechanize

+if False:
+    winutil, winutilerror, __appname__, islinux, __version__
+    fcntl, win32event, isfrozen, __author__, terminal_controller
+    winerror, win32api
+
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')
mimetypes.add_type('application/xhtml+xml', '.xhtml')
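The if False: block added here is a common trick for exactly this kind of linting: pyflakes analyzes code statically and never executes it, so a bare mention of a name inside a branch that can never run still counts as a use, silencing "imported but unused" and similar reports for names that are only consumed dynamically or re-exported. A self-contained miniature of the idiom:

    # Miniature of the 'if False:' silencing idiom (illustration only).
    import sys

    if False:
        sys  # never runs, but statically marks the import as used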

View File

@@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

-from constants import eStart, eError, eItsMe
+from constants import eStart

class CodingStateMachine:
    def __init__(self, sm):

View File

@@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

-import constants, sys
+import constants
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
@@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
                self._mState = constants.eFoundIt
                self._mDetectedCharset = codingSM.get_coding_state_machine()
                return self.get_state()

        return self.get_state()

View File

@@ -14,19 +14,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

-import constants, sys
+import constants
from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model

View File

@@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

-import constants, sys
+import constants
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine

View File

@@ -8,11 +8,10 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'

-import os, re
+import re
-from itertools import count, chain
+from itertools import count
-from calibre.ebooks.oeb.base import XHTML, XHTML_NS
+from calibre.ebooks.oeb.base import XHTML_NS
from calibre.ebooks.oeb.base import OEBBook
-from lxml import etree, html
from lxml.etree import XPath

NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
@@ -55,5 +54,5 @@ def add_page_map(opfpath, opts):
        id = elem.attrib['id'] = idgen.next()
        href = '#'.join((item.href, id))
        oeb.pages.add(name, href)
-    writer = DirWriter(version='2.0', page_map=True)
+    writer = None#DirWriter(version='2.0', page_map=True)
    writer.dump(oeb, opfpath)

View File

@@ -6,7 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys

from calibre import plugins
_lzx, _error = plugins['lzx']

View File

@@ -7,3 +7,5 @@ Microsoft LIT tag and attribute tables.
from calibre.ebooks.lit.maps.opf import MAP as OPF_MAP
from calibre.ebooks.lit.maps.html import MAP as HTML_MAP
+
+OPF_MAP, HTML_MAP
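The bare OPF_MAP, HTML_MAP expression added here is the module-level variant of the same pyflakes-silencing idiom as the if False: block earlier in this commit: the tuple is built and discarded once at import time, which costs nothing but statically marks the two re-exported maps as used.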

View File

@@ -1,14 +1,14 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

-import sys, os
+import os

-from calibre import iswindows
from calibre.ptempfile import PersistentTemporaryFile

try:
    from PIL import ImageFont
+    ImageFont
except ImportError:
    import ImageFont

'''
Default fonts used in the PRS500
'''
@@ -48,11 +48,11 @@ def get_font_path(name):
    # then, try calibre shipped ones
    try:
        try:
            font_mod = __import__('calibre.ebooks.lrf.fonts.prs500', {}, {},
                    [fname], -1)
            getattr(font_mod, fname)
        except (ImportError, AttributeError):
            font_mod = __import__('calibre.ebooks.lrf.fonts.liberation', {}, {},
                    [LIBERATION_FONT_MAP[name]], -1)
        p = PersistentTemporaryFile('.ttf', 'font_')
        p.write(getattr(font_mod, fname).font_data)
@@ -61,7 +61,7 @@ def get_font_path(name):
        return p.name
    except ImportError:
        pass

    # finally, try system default ones
    if SYSTEM_FONT_MAP.has_key(name) and os.access(SYSTEM_FONT_MAP[name], os.R_OK):
        return SYSTEM_FONT_MAP[name]
@@ -71,7 +71,7 @@ def get_font_path(name):
def get_font(name, size, encoding='unic'):
    '''
    Get an ImageFont object by name.
    @param size: Font height in pixels. To convert from pts:
                 sz in pixels = (dpi/72) * size in pts
    @param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'

View File

@@ -94,7 +94,7 @@ NAME_MAP = {
            u'springgreen': u'#00FF7F',
            u'violet': u'#EE82EE',
            u'yellowgreen': u'#9ACD32'
           }

hex_pat = re.compile('#(\d{2})(\d{2})(\d{2})')
rgb_pat = re.compile('rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)
@@ -109,5 +109,5 @@ def lrs_color(html_color):
    if hcol in NAME_MAP:
        return NAME_MAP[hcol].replace('#', '0x00')
    return '0x00000000'

View File

@@ -10,13 +10,13 @@ from calibre.ebooks.lrf.lrfparser import LRFDocument
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.lrf.objects import PageAttr, BlockAttr, TextAttr
+from calibre.ebooks.lrf.pylrs.pylrs import TextStyle

class BlockStyle(object):

    def __init__(self, ba):
        self.ba = ba

    def __str__(self):
        ans = '.'+str(self.ba.id)+' {\n'
        if hasattr(self.ba, 'sidemargin'):
@@ -37,10 +37,10 @@ class BlockStyle(object):
            ans += '\tbackground-color: %s;\n'%(self.ba.bgcolor.to_html())
        #TODO: Fixed size blocks
        return ans + '}\n'

class LRFConverter(object):

    def __init__(self, document, opts, logger):
        self.lrf = document
        self.opts = opts
@@ -48,15 +48,15 @@ class LRFConverter(object):
        self.logger = logger

        logger.info('Parsing LRF...')
        self.lrf.parse()

        self.create_metadata()
        self.create_styles()

    def create_metadata(self):
        self.logger.info('Reading metadata...')
        mi = get_metadata(self.lrf)
        self.opf = OPFCreator(self.output_dir, mi)

    def create_page_styles(self):
        self.page_css = ''
        for obj in self.lrf.objects.values():
@@ -65,21 +65,21 @@ class LRFConverter(object):
            self.page_css = selector + ' {\n'
            # TODO: Headers and footers
            self.page_css += '}\n'

    def create_block_styles(self):
        self.block_css = ''
        for obj in self.lrf.objects.values():
            if isinstance(obj, BlockAttr):
                self.block_css += str(BlockStyle(obj))

    def create_text_styles(self):
        self.text_css = ''
        for obj in self.lrf.objects.values():
            if isinstance(obj, TextAttr):
                self.text_css += str(TextStyle(obj))
        print self.text_css

    def create_styles(self):
        self.logger.info('Creating CSS stylesheet...')
        self.create_page_styles()
@@ -104,9 +104,9 @@ def process_file(lrfpath, opts, logger=None):
        raise ConversionError(opts.out + ' is not a directory')
    if not os.path.exists(opts.out):
        os.makedirs(opts.out)
    document = LRFDocument(open(lrfpath, 'rb'))
    conv = LRFConverter(document, opts, logger)

def main(args=sys.argv):
@@ -116,7 +116,7 @@ def main(args=sys.argv):
        parser.print_help()
        return 1
    process_file(args[1], opts)
    return 0

View File

@@ -11,23 +11,23 @@ def ceil(num):
    return int(math.ceil(num))

def print_xml(elem):
    from calibre.ebooks.lrf.pylrs.pylrs import ElementWriter
    elem = elem.toElement('utf8')
    ew = ElementWriter(elem, sourceEncoding='utf8')
    ew.write(sys.stdout)
    print

def cattrs(base, extra):
    new = base.copy()
    new.update(extra)
    return new

def tokens(tb):
    '''
    Return the next token. A token is :
    1. A string
        a block of text that has the same style
    '''
    def process_element(x, attrs):
        if isinstance(x, CR):
            yield 2, None
@@ -49,22 +49,22 @@ def tokens(tb):
            for y in x.contents:
                for z in process_element(y, attrs):
                    yield z

    for i in tb.contents:
        if isinstance(i, CR):
            yield 1, None
        elif isinstance(i, Paragraph):
            for j in i.contents:
                attrs = {}
                if hasattr(j, 'attrs'):
                    attrs = j.attrs
                for k in process_element(j, attrs):
                    yield k

class Cell(object):

    def __init__(self, conv, tag, css):
        self.conv = conv
        self.tag = tag
@@ -89,7 +89,7 @@ class Cell(object):
                self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1
            except:
                pass

        pp = conv.current_page
        conv.book.allow_new_page = False
        conv.current_page = conv.book.create_page()
@@ -99,7 +99,7 @@ class Cell(object):
            if isinstance(item, TextBlock):
                self.text_blocks.append(item)

        conv.current_page = pp
        conv.book.allow_new_page = True

        if not self.text_blocks:
            tb = conv.book.create_text_block()
            tb.Paragraph(' ')
@@ -107,7 +107,7 @@ class Cell(object):
        for tb in self.text_blocks:
            tb.parent = None
            tb.objId = 0
            # Needed as we have to eventually change this BlockStyle's width and
            # height attributes. This blockstyle may be shared with other
            # elements, so doing that causes havoc.
            tb.blockStyle = conv.book.create_block_style()
@@ -117,17 +117,17 @@ class Cell(object):
                if ts.attrs['align'] == 'foot':
                    if isinstance(tb.contents[-1], Paragraph):
                        tb.contents[-1].append(' ')

    def pts_to_pixels(self, pts):
        pts = int(pts)
        return ceil((float(self.conv.profile.dpi)/72.)*(pts/10.))

    def minimum_width(self):
        return max([self.minimum_tb_width(tb) for tb in self.text_blocks])

    def minimum_tb_width(self, tb):
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
@@ -135,7 +135,7 @@ class Cell(object):
        mwidth = 0
        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, int): # Handle para and line breaks
                continue
            if isinstance(token, Plot):
                return self.pts_to_pixels(token.xsize)
@@ -151,24 +151,24 @@ class Cell(object):
            if width > mwidth:
                mwidth = width
        return parindent + mwidth + 2

    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
        top, bottom, left, right = 0, 0, parindent, parindent

        def add_word(width, height, left, right, top, bottom, ls, ws):
            if left + width > maxwidth:
                left = width + ws
                top += ls
                bottom = top+ls if top+ls > bottom else bottom
            else:
                left += (width + ws)
                right = left if left > right else right
                bottom = top+ls if top+ls > bottom else bottom
            return left, right, top, bottom

        for token, attrs in tokens(tb):
            if attrs == None:
                attrs = {}
@@ -196,17 +196,17 @@ class Cell(object):
                    width, height = font.getsize(word)
                    left, right, top, bottom = add_word(width, height, left, right, top, bottom, ls, ws)
        return right+3+max(parindent, 10), bottom

    def text_block_preferred_width(self, tb, debug=False):
        return self.text_block_size(tb, sys.maxint, debug=debug)[0]

    def preferred_width(self, debug=False):
        return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))

    def height(self, width):
        return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])

class Row(object):

    def __init__(self, conv, row, css, colpad):
@@ -221,15 +221,15 @@ class Row(object):
                name = a['name'] if a.has_key('name') else a['id'] if a.has_key('id') else None
                if name is not None:
                    self.targets.append(name.replace('#', ''))

    def number_of_cells(self):
        '''Number of cells in this row. Respects colspan'''
        ans = 0
        for cell in self.cells:
            ans += cell.colspan
        return ans

    def height(self, widths):
        i, heights = 0, []
        for cell in self.cells:
@@ -239,11 +239,11 @@ class Row(object):
        if not heights:
            return 0
        return max(heights)

    def cell_from_index(self, col):
        i = -1
        cell = None
        for cell in self.cells:
            for k in range(0, cell.colspan):
                if i == col:
                    break
@@ -251,30 +251,30 @@ class Row(object):
            if i == col:
                break
        return cell

    def minimum_width(self, col):
        cell = self.cell_from_index(col)
        if not cell:
            return 0
        return cell.minimum_width()

    def preferred_width(self, col):
        cell = self.cell_from_index(col)
        if not cell:
            return 0
        return 0 if cell.colspan > 1 else cell.preferred_width()

    def width_percent(self, col):
        cell = self.cell_from_index(col)
        if not cell:
            return -1
        return -1 if cell.colspan > 1 else cell.pwidth

    def cell_iterator(self):
        for c in self.cells:
            yield c

class Table(object):

    def __init__(self, conv, table, css, rowpad=10, colpad=10):
        self.rows = []
@@ -283,31 +283,31 @@ class Table(object):
        self.colpad = colpad
        rows = table.findAll('tr')
        conv.in_table = True
        for row in rows:
            rcss = conv.tag_css(row, css)[0]
            self.rows.append(Row(conv, row, rcss, colpad))
        conv.in_table = False

    def number_of_columns(self):
        max = 0
        for row in self.rows:
            max = row.number_of_cells() if row.number_of_cells() > max else max
        return max

    def number_or_rows(self):
        return len(self.rows)

    def height(self, maxwidth):
        ''' Return row heights + self.rowpad'''
        widths = self.get_widths(maxwidth)
        return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad

    def minimum_width(self, col):
        return max([row.minimum_width(col) for row in self.rows])

    def width_percent(self, col):
        return max([row.width_percent(col) for row in self.rows])

    def get_widths(self, maxwidth):
        '''
        Return widths of columns + self.colpad
@@ -320,29 +320,29 @@ class Table(object):
                try:
                    cellwidths[r] = self.rows[r].preferred_width(c)
                except IndexError:
                    continue
            widths[c] = max(cellwidths)

        min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
        for i in xrange(len(widths)):
            wp = self.width_percent(i)
            if wp >= 0.:
                widths[i] = max(min_widths[i], ceil((wp/100.) * (maxwidth - (cols-1)*self.colpad)))

        itercount = 0
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            for i in range(cols):
                widths[i] = ceil((95./100.)*widths[i]) if \
                    ceil((95./100.)*widths[i]) >= min_widths[i] else widths[i]
            itercount += 1
        return [i+self.colpad for i in widths]

    def blocks(self, maxwidth, maxheight):
        rows, cols = self.number_or_rows(), self.number_of_columns()
        cellmatrix = [[None for c in range(cols)] for r in range(rows)]
        rowpos = [0 for i in range(rows)]
        for r in range(rows):
            nc = self.rows[r].cell_iterator()
@@ -358,14 +358,14 @@ class Table(object):
                        break
                except StopIteration: # No more cells in this row
                    continue

        widths = self.get_widths(maxwidth)
        heights = [row.height(widths) for row in self.rows]
        xpos = [sum(widths[:i]) for i in range(cols)]
        delta = maxwidth - sum(widths)
        if delta < 0:
            delta = 0
        for r in range(len(cellmatrix)):
            yield None, 0, heights[r], 0, self.rows[r].targets
@@ -377,13 +377,13 @@ class Table(object):
                    sypos = 0
                    for tb in cell.text_blocks:
                        tb.blockStyle = self.conv.book.create_block_style(
                                blockwidth=width,
                                blockheight=cell.text_block_size(tb, width)[1],
                                blockrule='horz-fixed')
                        yield tb, xpos[c], sypos, delta, None
                        sypos += tb.blockStyle.attrs['blockheight']

View File

@@ -1,81 +1,81 @@
""" elements.py -- replacements and helpers for ElementTree """

class ElementWriter(object):

    def __init__(self, e, header=False, sourceEncoding="ascii",
                 spaceBeforeClose=True, outputEncodingName="UTF-16"):
        self.header = header
        self.e = e
        self.sourceEncoding=sourceEncoding
        self.spaceBeforeClose = spaceBeforeClose
        self.outputEncodingName = outputEncodingName

    def _encodeCdata(self, rawText):
        if type(rawText) is str:
            rawText = rawText.decode(self.sourceEncoding)
        text = rawText.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text

    def _writeAttribute(self, f, name, value):
        f.write(u' %s="' % unicode(name))
        if not isinstance(value, basestring):
            value = unicode(value)
        value = self._encodeCdata(value)
        value = value.replace('"', '&quot;')
        f.write(value)
        f.write(u'"')

    def _writeText(self, f, rawText):
        text = self._encodeCdata(rawText)
        f.write(text)

    def _write(self, f, e):
        f.write(u'<' + unicode(e.tag))

        attributes = e.items()
        attributes.sort()
        for name, value in attributes:
            self._writeAttribute(f, name, value)

        if e.text is not None or len(e) > 0:
            f.write(u'>')
            if e.text:
                self._writeText(f, e.text)
            for e2 in e:
                self._write(f, e2)
            f.write(u'</%s>' % e.tag)
        else:
            if self.spaceBeforeClose:
                f.write(' ')
            f.write(u'/>')

        if e.tail is not None:
            self._writeText(f, e.tail)

    def toString(self):
        class x:
            pass
        buffer = []
        x.write = buffer.append
        self.write(x)
        return u''.join(buffer)

    def write(self, f):
        if self.header:
            f.write(u'<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)
        self._write(f, self.e)

File diff suppressed because it is too large

View File

@@ -1,43 +1,43 @@
def _optimize(tagList, tagName, conversion):
    # copy the tag of interest plus any text
    newTagList = []
    for tag in tagList:
        if tag.name == tagName or tag.name == "rawtext":
            newTagList.append(tag)

    # now, eliminate any duplicates (leaving the last one)
    for i, newTag in enumerate(newTagList[:-1]):
        if newTag.name == tagName and newTagList[i+1].name == tagName:
            tagList.remove(newTag)

    # eliminate redundant settings to same value across text strings
    newTagList = []
    for tag in tagList:
        if tag.name == tagName:
            newTagList.append(tag)
    for i, newTag in enumerate(newTagList[:-1]):
        value = conversion(newTag.parameter)
        nextValue = conversion(newTagList[i+1].parameter)
        if value == nextValue:
            tagList.remove(newTagList[i+1])

    # eliminate any setting that don't have text after them
    while len(tagList) > 0 and tagList[-1].name == tagName:
        del tagList[-1]

def tagListOptimizer(tagList):
    # this function eliminates redundant or unnecessary tags
    # it scans a list of tags, looking for text settings that are
    # changed before any text is output
    # for example,
    #   fontsize=100, fontsize=200, text, fontsize=100, fontsize=200
    # should be:
    #   fontsize=200 text
    oldSize = len(tagList)
    _optimize(tagList, "fontsize", int)
    _optimize(tagList, "fontweight", int)
    return oldSize - len(tagList)

File diff suppressed because it is too large

View File

@@ -2,4 +2,6 @@
# Initialize extensions
from calibre.ebooks.markdown import mdx_footnotes
from calibre.ebooks.markdown import mdx_tables
from calibre.ebooks.markdown import mdx_toc
+
+mdx_footnotes, mdx_tables, mdx_toc

View File

@@ -8,8 +8,6 @@ My markdown extensions for adding:
    Table of Contents (aka toc)
"""

-import os
-import sys
import re

import markdown
@@ -18,7 +16,7 @@ DEFAULT_TITLE = None
def extract_alphanumeric(in_str=None):
    """take alpha-numeric (7bit ascii) and return as a string
    """
    # I'm sure this is really inefficient and
    # could be done with a lambda/map()
    #x.strip().title().replace(' ', "")
    out_str=[]
@@ -42,7 +40,7 @@ class TocExtension (markdown.Extension):
    toc is returned in a div tag with class='toc'
    toc is either:
        appended to end of document
    OR
        replaces first string occurence of "///Table of Contents Goes Here///"
    """
@@ -75,7 +73,7 @@ class TocExtension (markdown.Extension):
        """
        Creates Table Of Contents based on headers.

        @returns: toc as a single as a dom element
            in a <div> tag with class='toc'
        """
@@ -85,9 +83,9 @@ class TocExtension (markdown.Extension):
            if element.type=='element':
                if headers_compiled_re.match(element.nodeName):
                    return True

        headers_doc_list = doc.find(findHeadersFn)

        # Insert anchor tags into dom
        generated_anchor_id=0
        headers_list=[]
@@ -99,19 +97,19 @@ class TocExtension (markdown.Extension):
            if heading_type == self.auto_toc_heading_type:
                min_header_size_found=min(min_header_size_found,
                                          heading_type)

                html_anchor_name= (extract_alphanumeric(heading_title)
                                   +'__MD_autoTOC_%d' % (generated_anchor_id))

                # insert anchor tag inside header tags
                html_anchor = doc.createElement("a")
                html_anchor.setAttribute('name', html_anchor_name)
                element.appendChild(html_anchor)

                headers_list.append( (heading_type, heading_title,
                                      html_anchor_name) )
                generated_anchor_id = generated_anchor_id + 1

        # create dom for toc
        if headers_list != []:
            # Create list
@@ -125,9 +123,9 @@ class TocExtension (markdown.Extension):
                toc_doc_link.appendChild(toc_doc_text)
                toc_doc_entry.appendChild(toc_doc_link)
                toc_doc_list.appendChild(toc_doc_entry)

            # Put list into div
            div = doc.createElement("div")
            div.setAttribute('class', 'toc')
            if self.TOC_TITLE:
@@ -149,7 +147,7 @@ class TocPostprocessor (markdown.Postprocessor):
    def run(self, doc):
        tocPlaceholder = self.toc.findTocPlaceholder(doc)

        tocDiv = self.toc.createTocDiv(doc)
        if tocDiv:
            if tocPlaceholder :

View File

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Ashish Kulkarni <kulkarni.ashish@gmail.com>'

'''Read meta information from IMP files'''

-import sys, os
+import sys

from calibre.ebooks.metadata import MetaInformation, string_to_authors
@@ -17,7 +17,7 @@ def get_metadata(stream):
    if stream.read(10) not in MAGIC:
        print >>sys.stderr, u'Couldn\'t read IMP header from file'
        return mi

    def cString(skip=0):
        result = ''
        while 1:
@@ -30,7 +30,7 @@ def get_metadata(stream):
    stream.read(38) # skip past some uninteresting headers
    _, category, title, author = cString(), cString(), cString(1), cString(2)

    if title:
        mi.title = title
    if author:

View File

@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
Read metadata from LRX files
'''

-import sys, struct
+import struct
from zlib import decompress
from lxml import etree
@@ -33,7 +33,7 @@ def short_be(buf):
def get_metadata(f):
    read = lambda at, amount: _read(f, at, amount)
    f.seek(0)

    buf = f.read(12)
    if buf[4:] == 'ftypLRX2':
        offset = 0
        while True:
@@ -74,9 +74,9 @@ def get_metadata(f):
        mi.tags = [x.text for x in bi.findall('Category')]
        mi.language = root.find('DocInfo').find('Language').text

        return mi
    elif buf[4:8] == 'LRX':
        raise ValueError('Librie LRX format not supported')
    else:
        raise ValueError('Not a LRX file')

View File

@@ -17,7 +17,7 @@
#
# Contributor(s):
#

-import zipfile, sys, re
+import zipfile, re
import xml.sax.saxutils
from cStringIO import StringIO
@@ -46,7 +46,7 @@ fields = {
}

def normalize(str):
    """
    The normalize-space function returns the argument string with whitespace
    normalized by stripping leading and trailing whitespace and replacing
    sequences of whitespace characters by a single space.
@@ -125,7 +125,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
            else:
                texttag = self._tag
            self.seenfields[texttag] = self.data()

        if field in self.deletefields:
            self.output.dowrite = True
        else:
@@ -140,7 +140,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
    def data(self):
        return normalize(''.join(self._data))

def get_metadata(stream):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
@@ -161,6 +161,6 @@ def get_metadata(stream):
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = data['keywords'].split(',')

    return mi

View File

@@ -3,8 +3,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

import os

from zipfile import ZipFile
from cStringIO import StringIO

def get_metadata(stream):
@@ -20,5 +20,5 @@ def get_metadata(stream):
        stream = StringIO(zf.read(f))
        return get_metadata(stream, stream_type)
    raise ValueError('No ebook found in ZIP archive')

View File

@@ -3,7 +3,6 @@
'''
Writer content to palmdoc pdb file.
'''

-import os

__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'

View File

@@ -4,7 +4,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

-import os

class zTXTError(Exception):
    pass

View File

@@ -12,8 +12,6 @@ Decrypt content of PDF.
import os, sys
from optparse import OptionGroup, Option

-from calibre.ebooks.metadata.meta import metadata_from_formats
-from calibre.ebooks.metadata import authors_to_string
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
@@ -36,8 +34,8 @@ OPTIONS = set([
class DecryptionError(Exception):
    def __init__(self, pdf_path):
-        self.value = 'Unable to decrypt file `%s`.' % value
+        self.value = 'Unable to decrypt file `%s`.' % pdf_path

    def __str__(self):
        return repr(self.value)
@@ -62,20 +60,20 @@ def add_options(parser):
    group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf'))
    parser.add_option_group(group)
    add_option = group.add_option

    for rec in OPTIONS:
        option_recommendation_to_cli_option(add_option, rec)

def decrypt(pdf_path, out_path, password):
    pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
    if pdf.decrypt(str(password)) == 0:
        raise DecryptionError(pdf_path)

    title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown')
    author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown')
    out_pdf = PdfFileWriter(title=title, author=author)

    for page in pdf.pages:
        out_pdf.addPage(page)
@@ -86,23 +84,23 @@ def main(args=sys.argv, name=''):
    log = Log()
    parser = option_parser(name)
    add_options(parser)

    opts, args = parser.parse_args(args)
    args = args[1:]

    if len(args) < 2:
        print 'Error: A PDF file and decryption password is required.\n'
        print_help(parser, log)
        return 1

    if not is_valid_pdf(args[0]):
        print 'Error: Could not read file `%s`.' % args[0]
        return 1

    if not is_encrypted(args[0]):
        print 'Error: file `%s` is not encrypted.' % args[0]
        return 1

    try:
        decrypt(args[0], opts.output, args[1])
    except DecryptionError, e:
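Two details in this file tie back to the commit's purpose: the dropped metadata imports were dead code here (they are added to the companion encrypt module in the next file), and the DecryptionError fix replaces value, an undefined name that pyflakes reports statically, with the pdf_path argument actually in scope.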

View File

@@ -17,6 +17,8 @@ from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
from calibre.customize.conversion import OptionRecommendation
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ebooks.metadata.meta import metadata_from_formats

from pyPdf import PdfFileWriter, PdfFileReader
@@ -52,7 +54,7 @@ def add_options(parser):
    group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf'))
    parser.add_option_group(group)
    add_option = group.add_option

    for rec in OPTIONS:
        option_recommendation_to_cli_option(add_option, rec)
@@ -78,23 +80,23 @@ def main(args=sys.argv, name=''):
    log = Log()
    parser = option_parser(name)
    add_options(parser)

    opts, args = parser.parse_args(args)
    args = args[1:]

    if len(args) < 2:
        print 'Error: A PDF file and decryption password is required.\n'
        print_help(parser, log)
        return 1

    if not is_valid_pdf(args[0]):
        print 'Error: Could not read file `%s`.' % args[0]
        return 1

    if is_encrypted(args[0]):
        print 'Error: file `%s` is already encrypted.' % args[0]
        return 1

    mi = metadata_from_formats([args[0]])
    encrypt(args[0], opts.output, args[1], mi)

View File

@@ -11,25 +11,25 @@ Verify PDF files.
import os

-from pyPdf import PdfFileWriter, PdfFileReader
+from pyPdf import PdfFileReader

def is_valid_pdf(pdf_path):
    '''
    Returns True if the pdf file is valid.
    '''
    try:
        with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
            pdf = PdfFileReader(pdf_file)
    except:
        return False
    return True

def is_valid_pdfs(pdf_paths):
    '''
    Returns a list of invalid pdf files.
    '''
    invalid = []
    for pdf_path in pdf_paths:
        if not is_valid_pdf(pdf_path):

View File

@@ -4,7 +4,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

-import os
import struct
import zlib


@@ -15,7 +15,7 @@
#                                                                       #
#                                                                       #
#########################################################################
-import sys, os, shutil
+import os, shutil

class Copy:
    """Copy each changed file to a directory for debugging purposes"""

@@ -66,6 +66,6 @@ class Copy:
        """
        write_file = os.path.join(Copy.__dir,new_file)
        shutil.copyfile(file, write_file)

    def rename(self, source, dest):
        shutil.copyfile(source, dest)


@@ -1,5 +1,4 @@
import sys
-from calibre.ebooks import rtf2xml

class ParseOptions:
    """
    Requires:


@@ -16,7 +16,6 @@
#                                                                       #
#########################################################################
import sys, os, codecs
-from calibre.ebooks import rtf2xml

class Output:
    """
    Output file


@@ -15,8 +15,6 @@
#                                                                       #
#                                                                       #
#########################################################################
-import sys,os
-from calibre.ebooks import rtf2xml

class OverrideTable:
    """
    Parse a line of text to make the override table. Return a string


@@ -7,21 +7,19 @@ from calibre.gui2 import file_icon_provider
from calibre.gui2.dialogs.choose_format_ui import Ui_ChooseFormatDialog

class ChooseFormatDialog(QDialog, Ui_ChooseFormatDialog):

    def __init__(self, window, msg, formats):
        QDialog.__init__(self, window)
        Ui_ChooseFormatDialog.__init__(self)
        self.setupUi(self)

        self.connect(self.formats, SIGNAL('activated(QModelIndex)'), lambda i: self.accept())
        self.msg.setText(msg)
        for format in formats:
            self.formats.addItem(QListWidgetItem(file_icon_provider().icon_from_ext(format.lower()),
                format.upper()))
        self._formats = formats
        self.formats.setCurrentRow(0)

    def format(self):
        return self._formats[self.formats.currentRow()]
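# --- Usage sketch for the dialog above (the parent window and format list are
# placeholders): the dialog is modal, so callers exec_() it and read format().
from PyQt4.QtGui import QDialog

d = ChooseFormatDialog(parent_window, 'Choose the format to view', ['EPUB', 'MOBI', 'PDF'])
if d.exec_() == QDialog.Accepted:
    fmt = d.format()  # e.g. 'EPUB'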


@@ -5,7 +5,7 @@ from PyQt4.QtGui import QDialog
from calibre.gui2.dialogs.conversion_error_ui import Ui_ConversionErrorDialog

class ConversionErrorDialog(QDialog, Ui_ConversionErrorDialog):

    def __init__(self, window, title, html, show=False):
        QDialog.__init__(self, window)
        Ui_ConversionErrorDialog.__init__(self)

@@ -14,7 +14,7 @@ class ConversionErrorDialog(QDialog, Ui_ConversionErrorDialog):
        self.set_message(html)
        if show:
            self.show()

    def set_message(self, html):
        self.text.setHtml('<html><body>%s</body></html'%(html,))


@@ -5,20 +5,20 @@ from PyQt4.QtGui import QGraphicsView
from PyQt4.QtCore import QSize

class BookView(QGraphicsView):

    MINIMUM_SIZE = QSize(400, 500)

    def __init__(self, *args):
        QGraphicsView.__init__(self, *args)
        self.preferred_size = self.MINIMUM_SIZE

    def minimumSizeHint(self):
        return self.MINIMUM_SIZE

    def sizeHint(self):
        return self.preferred_size

    def resize_for(self, width, height):
        self.preferred_size = QSize(width, height)


@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os, math, re

from PyQt4.Qt import QWidget, QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
        QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
-        QByteArray, QColor, QWheelEvent, QPoint, QImage, QRegion, \
+        QByteArray, QColor, QPoint, QImage, QRegion, \
        QFont, QObject, QApplication, pyqtSignature
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings


@@ -4,17 +4,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'

-import os, sys, traceback, urlparse
+import os, sys, urlparse

from BeautifulSoup import BeautifulSoup, Tag

-from calibre.ebooks.oeb.iterator import EbookIterator
-from calibre.ptempfile import TemporaryDirectory
from PyQt4 import QtCore
from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, Qt, \
    QPrinter, QPrintPreviewDialog, QPrintDialog, QDialog, QMetaObject, Q_ARG
-from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebView

PRINTCSS = 'body{width:100%;margin:0;padding:0;font-family:Arial;color:#000;background:none;font-size:12pt;text-align:left;}h1,h2,h3,h4,h5,h6{font-family:Helvetica;}h1{font-size:19pt;}h2{font-size:17pt;}h3{font-size:15pt;}h4,h5,h6{font-size:12pt;}pre,code,samp{font:10ptCourier,monospace;white-space:pre-wrap;page-break-inside:avoid;}blockquote{margin:1.3em;padding:1em;font-size:10pt;}hr{background-color:#ccc;}aimg{border:none;}a:link,a:visited{background:transparent;font-weight:700;text-decoration:underline;color:#333;}a:link:after,a{color:#000;}table{margin:1px;text-align:left;}th{border-bottom:1pxsolid#333;font-weight:bold;}td{border-bottom:1pxsolid#333;}th,td{padding:4px10px4px0;}tfoot{font-style:italic;}caption{background:#fff;margin-bottom:2em;text-align:left;}thead{display:table-header-group;}tr{page-break-inside:avoid;}#header,.header,#footer,.footer,#navbar,.navbar,#navigation,.navigation,#rightSideBar,.rightSideBar,#leftSideBar,.leftSideBar{display:none;}'

@@ -31,18 +28,18 @@ class Printing(QObject):
            self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_preview)
        else:
            self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_book)

        self.process_content(spine)

    def process_content(self, spine):
        content = ''

        for path in spine:
            raw = self.raw_content(path)
            content += self.parsed_content(raw, path)

        refined_content = self.refine_content(content)

        base = os.path.splitdrive(spine[0])[0]
        base = base if base != '' else '/'

@@ -52,7 +49,7 @@ class Printing(QObject):
    @QtCore.pyqtSignature('load_content(QString, QString)')
    def load_content(self, content, base):
        self.view.setHtml(content, QUrl(base))

    def raw_content(self, path):
        return open(path, 'rb').read().decode(path.encoding)

@@ -64,11 +61,11 @@ class Printing(QObject):
        styles = dom_tree.findAll('style')
        for s in styles:
            s.extract()

        scripts = dom_tree.findAll('script')
        for s in scripts:
            s.extract()

        # Convert all relative links to absolute paths.
        links = dom_tree.findAll(src=True)
        for s in links:

@@ -85,40 +82,40 @@ class Printing(QObject):
    # Adds the print css.
    def refine_content(self, content):
        dom_tree = BeautifulSoup('<html><head></head><body>%s</body></html>' % content)

        css = dom_tree.findAll('link')
        for c in css:
            c.extract()

        print_css = Tag(BeautifulSoup(), 'style', [('type', 'text/css'), ('title', 'override_css')])
        print_css.insert(0, PRINTCSS)
        dom_tree.findAll('head')[0].insert(0, print_css)

        return unicode(dom_tree)

    def print_preview(self, ok):
        printer = QPrinter(QPrinter.HighResolution)
        printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch)

        previewDialog = QPrintPreviewDialog(printer)

        self.connect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_)
        previewDialog.exec_()
        self.disconnect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_)

        self.loop.quit()

    def print_book(self, ok):
        printer = QPrinter(QPrinter.HighResolution)
        printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch)

        printDialog = QPrintDialog(printer)
        printDialog.setWindowTitle(_("Print eBook"))

        printDialog.exec_()
        if printDialog.result() == QDialog.Accepted:
            self.view.print_(printer)

        self.loop.quit()

def main():
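# --- Condensed sketch of the load-then-print flow used by Printing above; the
# standalone wiring and the output file name are illustrative, not calibre code.
from PyQt4.Qt import QApplication, QPrinter, SIGNAL
from PyQt4.QtWebKit import QWebView

app = QApplication([])
view = QWebView()

def print_when_loaded(ok):
    printer = QPrinter(QPrinter.HighResolution)
    printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch)
    printer.setOutputFileName('output.pdf')  # print to file instead of a dialog
    view.print_(printer)
    app.quit()

# Same old-style signal wiring as the class above: wait for the content to
# finish loading before driving the printer.
view.connect(view, SIGNAL('loadFinished(bool)'), print_when_loaded)
view.setHtml('<h1>Sample</h1>')
app.exec_()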


@@ -18,7 +18,7 @@ sys.path.append(os.path.abspath('../../../'))
sys.path.append(os.path.abspath('.'))
from calibre import __appname__, __version__
import custom
+custom

# General configuration
# ---------------------
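# --- Note on the bare `custom` statement added above: naming an imported module
# counts as a use, so pyflakes stops reporting an import that exists only for
# its side effects as unused. The same idiom appears below with `StringIO` and
# `BeautifulSoup`. Minimal form of the trick:
import custom  # imported only for its side effects
custom         # bare reference so the name registers as used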


@@ -1,970 +0,0 @@
""" path.py - An object representing a path to a file or directory.
Example:
from path import path
d = path('/home/guido/bin')
for f in d.files('*.py'):
f.chmod(0755)
This module requires Python 2.2 or later.
URL: http://www.jorendorff.com/articles/python/path
Author: Jason Orendorff <jason.orendorff\x40gmail\x2ecom> (and others - see the url!)
Date: 9 Mar 2007
"""
# TODO
# - Tree-walking functions don't avoid symlink loops. Matt Harrison
# sent me a patch for this.
# - Bug in write_text(). It doesn't support Universal newline mode.
# - Better error message in listdir() when self isn't a
# directory. (On Windows, the error message really sucks.)
# - Make sure everything has a good docstring.
# - Add methods for regex find and replace.
# - guess_content_type() method?
# - Perhaps support arguments to touch().
from __future__ import generators
import sys, warnings, os, fnmatch, glob, shutil, codecs, hashlib
__version__ = '2.2'
__all__ = ['path']
# Platform-specific support for path.owner
if os.name == 'nt':
try:
import win32security
except ImportError:
win32security = None
else:
try:
import pwd
except ImportError:
pwd = None
# Pre-2.3 support. Are unicode filenames supported?
_base = str
_getcwd = os.getcwd
try:
if os.path.supports_unicode_filenames:
_base = unicode
_getcwd = os.getcwdu
except AttributeError:
pass
# Pre-2.3 workaround for booleans
try:
True, False
except NameError:
True, False = 1, 0
# Pre-2.3 workaround for basestring.
try:
basestring
except NameError:
basestring = (str, unicode)
# Universal newline support
_textmode = 'r'
if hasattr(file, 'newlines'):
_textmode = 'U'
class TreeWalkWarning(Warning):
pass
class path(_base):
""" Represents a filesystem path.
For documentation on individual methods, consult their
counterparts in os.path.
"""
# --- Special Python methods.
def __repr__(self):
return 'path(%s)' % _base.__repr__(self)
# Adding a path and a string yields a path.
def __add__(self, more):
try:
resultStr = _base.__add__(self, more)
except TypeError: #Python bug
resultStr = NotImplemented
if resultStr is NotImplemented:
return resultStr
return self.__class__(resultStr)
def __radd__(self, other):
if isinstance(other, basestring):
return self.__class__(other.__add__(self))
else:
return NotImplemented
# The / operator joins paths.
def __div__(self, rel):
""" fp.__div__(rel) == fp / rel == fp.joinpath(rel)
Join two path components, adding a separator character if
needed.
"""
return self.__class__(os.path.join(self, rel))
# Make the / operator work even when true division is enabled.
__truediv__ = __div__
def getcwd(cls):
""" Return the current working directory as a path object. """
return cls(_getcwd())
getcwd = classmethod(getcwd)
# --- Operations on path strings.
isabs = os.path.isabs
def abspath(self): return self.__class__(os.path.abspath(self))
def normcase(self): return self.__class__(os.path.normcase(self))
def normpath(self): return self.__class__(os.path.normpath(self))
def realpath(self): return self.__class__(os.path.realpath(self))
def expanduser(self): return self.__class__(os.path.expanduser(self))
def expandvars(self): return self.__class__(os.path.expandvars(self))
def dirname(self): return self.__class__(os.path.dirname(self))
basename = os.path.basename
def expand(self):
""" Clean up a filename by calling expandvars(),
expanduser(), and normpath() on it.
This is commonly everything needed to clean up a filename
read from a configuration file, for example.
"""
return self.expandvars().expanduser().normpath()
def _get_namebase(self):
base, ext = os.path.splitext(self.name)
return base
def _get_ext(self):
f, ext = os.path.splitext(_base(self))
return ext
def _get_drive(self):
drive, r = os.path.splitdrive(self)
return self.__class__(drive)
parent = property(
dirname, None, None,
""" This path's parent directory, as a new path object.
For example, path('/usr/local/lib/libpython.so').parent == path('/usr/local/lib')
""")
name = property(
basename, None, None,
""" The name of this file or directory without the full path.
For example, path('/usr/local/lib/libpython.so').name == 'libpython.so'
""")
namebase = property(
_get_namebase, None, None,
""" The same as path.name, but with one file extension stripped off.
For example, path('/home/guido/python.tar.gz').name == 'python.tar.gz',
but path('/home/guido/python.tar.gz').namebase == 'python.tar'
""")
ext = property(
_get_ext, None, None,
""" The file extension, for example '.py'. """)
drive = property(
_get_drive, None, None,
""" The drive specifier, for example 'C:'.
This is always empty on systems that don't use drive specifiers.
""")
def splitpath(self):
""" p.splitpath() -> Return (p.parent, p.name). """
parent, child = os.path.split(self)
return self.__class__(parent), child
def splitdrive(self):
""" p.splitdrive() -> Return (p.drive, <the rest of p>).
Split the drive specifier from this path. If there is
no drive specifier, p.drive is empty, so the return value
is simply (path(''), p). This is always the case on Unix.
"""
drive, rel = os.path.splitdrive(self)
return self.__class__(drive), rel
def splitext(self):
""" p.splitext() -> Return (p.stripext(), p.ext).
Split the filename extension from this path and return
the two parts. Either part may be empty.
The extension is everything from '.' to the end of the
last path segment. This has the property that if
(a, b) == p.splitext(), then a + b == p.
"""
filename, ext = os.path.splitext(self)
return self.__class__(filename), ext
def stripext(self):
""" p.stripext() -> Remove one file extension from the path.
For example, path('/home/guido/python.tar.gz').stripext()
returns path('/home/guido/python.tar').
"""
return self.splitext()[0]
if hasattr(os.path, 'splitunc'):
def splitunc(self):
unc, rest = os.path.splitunc(self)
return self.__class__(unc), rest
def _get_uncshare(self):
unc, r = os.path.splitunc(self)
return self.__class__(unc)
uncshare = property(
_get_uncshare, None, None,
""" The UNC mount point for this path.
This is empty for paths on local drives. """)
def joinpath(self, *args):
""" Join two or more path components, adding a separator
character (os.sep) if needed. Returns a new path
object.
"""
return self.__class__(os.path.join(self, *args))
def splitall(self):
r""" Return a list of the path components in this path.
The first item in the list will be a path. Its value will be
either os.curdir, os.pardir, empty, or the root directory of
this path (for example, '/' or 'C:\\'). The other items in
the list will be strings.
path.path.joinpath(*result) will yield the original path.
"""
parts = []
loc = self
while loc != os.curdir and loc != os.pardir:
prev = loc
loc, child = prev.splitpath()
if loc == prev:
break
parts.append(child)
parts.append(loc)
parts.reverse()
return parts
def relpath(self):
""" Return this path as a relative path,
based from the current working directory.
"""
cwd = self.__class__(os.getcwd())
return cwd.relpathto(self)
def relpathto(self, dest):
""" Return a relative path from self to dest.
If there is no relative path from self to dest, for example if
they reside on different drives in Windows, then this returns
dest.abspath().
"""
origin = self.abspath()
dest = self.__class__(dest).abspath()
orig_list = origin.normcase().splitall()
# Don't normcase dest! We want to preserve the case.
dest_list = dest.splitall()
if orig_list[0] != os.path.normcase(dest_list[0]):
# Can't get here from there.
return dest
# Find the location where the two paths start to differ.
i = 0
for start_seg, dest_seg in zip(orig_list, dest_list):
if start_seg != os.path.normcase(dest_seg):
break
i += 1
# Now i is the point where the two paths diverge.
# Need a certain number of "os.pardir"s to work up
# from the origin to the point of divergence.
segments = [os.pardir] * (len(orig_list) - i)
# Need to add the diverging part of dest_list.
segments += dest_list[i:]
if len(segments) == 0:
# If they happen to be identical, use os.curdir.
relpath = os.curdir
else:
relpath = os.path.join(*segments)
return self.__class__(relpath)
# --- Listing, searching, walking, and matching
def listdir(self, pattern=None):
""" D.listdir() -> List of items in this directory.
Use D.files() or D.dirs() instead if you want a listing
of just files or just subdirectories.
The elements of the list are path objects.
With the optional 'pattern' argument, this only lists
items whose names match the given pattern.
"""
names = os.listdir(self)
if pattern is not None:
names = fnmatch.filter(names, pattern)
return [self / child for child in names]
def dirs(self, pattern=None):
""" D.dirs() -> List of this directory's subdirectories.
The elements of the list are path objects.
This does not walk recursively into subdirectories
(but see path.walkdirs).
With the optional 'pattern' argument, this only lists
directories whose names match the given pattern. For
example, d.dirs('build-*').
"""
return [p for p in self.listdir(pattern) if p.isdir()]
def files(self, pattern=None):
""" D.files() -> List of the files in this directory.
The elements of the list are path objects.
This does not walk into subdirectories (see path.walkfiles).
With the optional 'pattern' argument, this only lists files
whose names match the given pattern. For example,
d.files('*.pyc').
"""
return [p for p in self.listdir(pattern) if p.isfile()]
def walk(self, pattern=None, errors='strict'):
""" D.walk() -> iterator over files and subdirs, recursively.
The iterator yields path objects naming each child item of
this directory and its descendants. This requires that
D.isdir().
This performs a depth-first traversal of the directory tree.
Each directory is returned just before all its children.
The errors= keyword argument controls behavior when an
error occurs. The default is 'strict', which causes an
exception. The other allowed values are 'warn', which
reports the error via warnings.warn(), and 'ignore'.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
childList = self.listdir()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in childList:
if pattern is None or child.fnmatch(pattern):
yield child
try:
isdir = child.isdir()
except Exception:
if errors == 'ignore':
isdir = False
elif errors == 'warn':
warnings.warn(
"Unable to access '%s': %s"
% (child, sys.exc_info()[1]),
TreeWalkWarning)
isdir = False
else:
raise
if isdir:
for item in child.walk(pattern, errors):
yield item
def walkdirs(self, pattern=None, errors='strict'):
""" D.walkdirs() -> iterator over subdirs, recursively.
With the optional 'pattern' argument, this yields only
directories whose names match the given pattern. For
example, mydir.walkdirs('*test') yields only directories
with names ending in 'test'.
The errors= keyword argument controls behavior when an
error occurs. The default is 'strict', which causes an
exception. The other allowed values are 'warn', which
reports the error via warnings.warn(), and 'ignore'.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
dirs = self.dirs()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in dirs:
if pattern is None or child.fnmatch(pattern):
yield child
for subsubdir in child.walkdirs(pattern, errors):
yield subsubdir
def walkfiles(self, pattern=None, errors='strict'):
""" D.walkfiles() -> iterator over files in D, recursively.
The optional argument, pattern, limits the results to files
with names that match the pattern. For example,
mydir.walkfiles('*.tmp') yields only files with the .tmp
extension.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
childList = self.listdir()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in childList:
try:
isfile = child.isfile()
isdir = not isfile and child.isdir()
except:
if errors == 'ignore':
continue
elif errors == 'warn':
warnings.warn(
"Unable to access '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
continue
else:
raise
if isfile:
if pattern is None or child.fnmatch(pattern):
yield child
elif isdir:
for f in child.walkfiles(pattern, errors):
yield f
def fnmatch(self, pattern):
""" Return True if self.name matches the given pattern.
pattern - A filename pattern with wildcards,
for example '*.py'.
"""
return fnmatch.fnmatch(self.name, pattern)
def glob(self, pattern):
""" Return a list of path objects that match the pattern.
pattern - a path relative to this directory, with wildcards.
For example, path('/users').glob('*/bin/*') returns a list
of all the files users have in their bin directories.
"""
cls = self.__class__
return [cls(s) for s in glob.glob(_base(self / pattern))]
# --- Reading or writing an entire file at once.
def open(self, mode='r'):
""" Open this file. Return a file object. """
return file(self, mode)
def bytes(self):
""" Open this file, read all bytes, return them as a string. """
f = self.open('rb')
try:
return f.read()
finally:
f.close()
def write_bytes(self, bytes, append=False):
""" Open this file and write the given bytes to it.
Default behavior is to overwrite any existing file.
Call p.write_bytes(bytes, append=True) to append instead.
"""
if append:
mode = 'ab'
else:
mode = 'wb'
f = self.open(mode)
try:
f.write(bytes)
finally:
f.close()
def text(self, encoding=None, errors='strict'):
r""" Open this file, read it in, return the content as a string.
This uses 'U' mode in Python 2.3 and later, so '\r\n' and '\r'
are automatically translated to '\n'.
Optional arguments:
encoding - The Unicode encoding (or character set) of
the file. If present, the content of the file is
decoded and returned as a unicode object; otherwise
it is returned as an 8-bit str.
errors - How to handle Unicode errors; see help(str.decode)
for the options. Default is 'strict'.
"""
if encoding is None:
# 8-bit
f = self.open(_textmode)
try:
return f.read()
finally:
f.close()
else:
# Unicode
f = codecs.open(self, 'r', encoding, errors)
# (Note - Can't use 'U' mode here, since codecs.open
# doesn't support 'U' mode, even in Python 2.3.)
try:
t = f.read()
finally:
f.close()
return (t.replace(u'\r\n', u'\n')
.replace(u'\r\x85', u'\n')
.replace(u'\r', u'\n')
.replace(u'\x85', u'\n')
.replace(u'\u2028', u'\n'))
def write_text(self, text, encoding=None, errors='strict', linesep=os.linesep, append=False):
r""" Write the given text to this file.
The default behavior is to overwrite any existing file;
to append instead, use the 'append=True' keyword argument.
There are two differences between path.write_text() and
path.write_bytes(): newline handling and Unicode handling.
See below.
Parameters:
- text - str/unicode - The text to be written.
- encoding - str - The Unicode encoding that will be used.
This is ignored if 'text' isn't a Unicode string.
- errors - str - How to handle Unicode encoding errors.
Default is 'strict'. See help(unicode.encode) for the
options. This is ignored if 'text' isn't a Unicode
string.
- linesep - keyword argument - str/unicode - The sequence of
characters to be used to mark end-of-line. The default is
os.linesep. You can also specify None; this means to
leave all newlines as they are in 'text'.
- append - keyword argument - bool - Specifies what to do if
the file already exists (True: append to the end of it;
False: overwrite it.) The default is False.
--- Newline handling.
write_text() converts all standard end-of-line sequences
('\n', '\r', and '\r\n') to your platform's default end-of-line
sequence (see os.linesep; on Windows, for example, the
end-of-line marker is '\r\n').
If you don't like your platform's default, you can override it
using the 'linesep=' keyword argument. If you specifically want
write_text() to preserve the newlines as-is, use 'linesep=None'.
This applies to Unicode text the same as to 8-bit text, except
there are three additional standard Unicode end-of-line sequences:
u'\x85', u'\r\x85', and u'\u2028'.
(This is slightly different from when you open a file for
writing with fopen(filename, "w") in C or file(filename, 'w')
in Python.)
--- Unicode
If 'text' isn't Unicode, then apart from newline handling, the
bytes are written verbatim to the file. The 'encoding' and
'errors' arguments are not used and must be omitted.
If 'text' is Unicode, it is first converted to bytes using the
specified 'encoding' (or the default encoding if 'encoding'
isn't specified). The 'errors' argument applies only to this
conversion.
"""
if isinstance(text, unicode):
if linesep is not None:
# Convert all standard end-of-line sequences to
# ordinary newline characters.
text = (text.replace(u'\r\n', u'\n')
.replace(u'\r\x85', u'\n')
.replace(u'\r', u'\n')
.replace(u'\x85', u'\n')
.replace(u'\u2028', u'\n'))
text = text.replace(u'\n', linesep)
if encoding is None:
encoding = sys.getdefaultencoding()
bytes = text.encode(encoding, errors)
else:
# It is an error to specify an encoding if 'text' is
# an 8-bit string.
assert encoding is None
if linesep is not None:
text = (text.replace('\r\n', '\n')
.replace('\r', '\n'))
bytes = text.replace('\n', linesep)
self.write_bytes(bytes, append)
def lines(self, encoding=None, errors='strict', retain=True):
r""" Open this file, read all lines, return them in a list.
Optional arguments:
encoding - The Unicode encoding (or character set) of
the file. The default is None, meaning the content
of the file is read as 8-bit characters and returned
as a list of (non-Unicode) str objects.
errors - How to handle Unicode errors; see help(str.decode)
for the options. Default is 'strict'
retain - If true, retain newline characters; but all newline
character combinations ('\r', '\n', '\r\n') are
translated to '\n'. If false, newline characters are
stripped off. Default is True.
This uses 'U' mode in Python 2.3 and later.
"""
if encoding is None and retain:
f = self.open(_textmode)
try:
return f.readlines()
finally:
f.close()
else:
return self.text(encoding, errors).splitlines(retain)
def write_lines(self, lines, encoding=None, errors='strict',
linesep=os.linesep, append=False):
r""" Write the given lines of text to this file.
By default this overwrites any existing file at this path.
This puts a platform-specific newline sequence on every line.
See 'linesep' below.
lines - A list of strings.
encoding - A Unicode encoding to use. This applies only if
'lines' contains any Unicode strings.
errors - How to handle errors in Unicode encoding. This
also applies only to Unicode strings.
linesep - The desired line-ending. This line-ending is
applied to every line. If a line already has any
standard line ending ('\r', '\n', '\r\n', u'\x85',
u'\r\x85', u'\u2028'), that will be stripped off and
this will be used instead. The default is os.linesep,
which is platform-dependent ('\r\n' on Windows, '\n' on
Unix, etc.) Specify None to write the lines as-is,
like file.writelines().
Use the keyword argument append=True to append lines to the
file. The default is to overwrite the file. Warning:
When you use this with Unicode data, if the encoding of the
existing data in the file is different from the encoding
you specify with the encoding= parameter, the result is
mixed-encoding data, which can really confuse someone trying
to read the file later.
"""
if append:
mode = 'ab'
else:
mode = 'wb'
f = self.open(mode)
try:
for line in lines:
isUnicode = isinstance(line, unicode)
if linesep is not None:
# Strip off any existing line-end and add the
# specified linesep string.
if isUnicode:
if line[-2:] in (u'\r\n', u'\x0d\x85'):
line = line[:-2]
elif line[-1:] in (u'\r', u'\n',
u'\x85', u'\u2028'):
line = line[:-1]
else:
if line[-2:] == '\r\n':
line = line[:-2]
elif line[-1:] in ('\r', '\n'):
line = line[:-1]
line += linesep
if isUnicode:
if encoding is None:
encoding = sys.getdefaultencoding()
line = line.encode(encoding, errors)
f.write(line)
finally:
f.close()
def read_md5(self):
""" Calculate the md5 hash for this file.
This reads through the entire file.
"""
f = self.open('rb')
try:
m = hashlib.md5()
while True:
d = f.read(8192)
if not d:
break
m.update(d)
finally:
f.close()
return m.digest()
# --- Methods for querying the filesystem.
exists = os.path.exists
isdir = os.path.isdir
isfile = os.path.isfile
islink = os.path.islink
ismount = os.path.ismount
if hasattr(os.path, 'samefile'):
samefile = os.path.samefile
getatime = os.path.getatime
atime = property(
getatime, None, None,
""" Last access time of the file. """)
getmtime = os.path.getmtime
mtime = property(
getmtime, None, None,
""" Last-modified time of the file. """)
if hasattr(os.path, 'getctime'):
getctime = os.path.getctime
ctime = property(
getctime, None, None,
""" Creation time of the file. """)
getsize = os.path.getsize
size = property(
getsize, None, None,
""" Size of the file, in bytes. """)
if hasattr(os, 'access'):
def access(self, mode):
""" Return true if current user has access to this path.
mode - One of the constants os.F_OK, os.R_OK, os.W_OK, os.X_OK
"""
return os.access(self, mode)
def stat(self):
""" Perform a stat() system call on this path. """
return os.stat(self)
def lstat(self):
""" Like path.stat(), but do not follow symbolic links. """
return os.lstat(self)
def get_owner(self):
r""" Return the name of the owner of this file or directory.
This follows symbolic links.
On Windows, this returns a name of the form ur'DOMAIN\User Name'.
On Windows, a group can own a file or directory.
"""
if os.name == 'nt':
if win32security is None:
raise Exception("path.owner requires win32all to be installed")
desc = win32security.GetFileSecurity(
self, win32security.OWNER_SECURITY_INFORMATION)
sid = desc.GetSecurityDescriptorOwner()
account, domain, typecode = win32security.LookupAccountSid(None, sid)
return domain + u'\\' + account
else:
if pwd is None:
raise NotImplementedError("path.owner is not implemented on this platform.")
st = self.stat()
return pwd.getpwuid(st.st_uid).pw_name
owner = property(
get_owner, None, None,
""" Name of the owner of this file or directory. """)
if hasattr(os, 'statvfs'):
def statvfs(self):
""" Perform a statvfs() system call on this path. """
return os.statvfs(self)
if hasattr(os, 'pathconf'):
def pathconf(self, name):
return os.pathconf(self, name)
# --- Modifying operations on files and directories
def utime(self, times):
""" Set the access and modified times of this file. """
os.utime(self, times)
def chmod(self, mode):
os.chmod(self, mode)
if hasattr(os, 'chown'):
def chown(self, uid, gid):
os.chown(self, uid, gid)
def rename(self, new):
os.rename(self, new)
def renames(self, new):
os.renames(self, new)
# --- Create/delete operations on directories
def mkdir(self, mode=0777):
os.mkdir(self, mode)
def makedirs(self, mode=0777):
os.makedirs(self, mode)
def rmdir(self):
os.rmdir(self)
def removedirs(self):
os.removedirs(self)
# --- Modifying operations on files
def touch(self):
""" Set the access/modified times of this file to the current time.
Create the file if it does not exist.
"""
fd = os.open(self, os.O_WRONLY | os.O_CREAT, 0666)
os.close(fd)
os.utime(self, None)
def remove(self):
os.remove(self)
def unlink(self):
os.unlink(self)
# --- Links
if hasattr(os, 'link'):
def link(self, newpath):
""" Create a hard link at 'newpath', pointing to this file. """
os.link(self, newpath)
if hasattr(os, 'symlink'):
def symlink(self, newlink):
""" Create a symbolic link at 'newlink', pointing here. """
os.symlink(self, newlink)
if hasattr(os, 'readlink'):
def readlink(self):
""" Return the path to which this symbolic link points.
The result may be an absolute or a relative path.
"""
return self.__class__(os.readlink(self))
def readlinkabs(self):
""" Return the path to which this symbolic link points.
The result is always an absolute path.
"""
p = self.readlink()
if p.isabs():
return p
else:
return (self.parent / p).abspath()
# --- High-level functions from shutil
copyfile = shutil.copyfile
copymode = shutil.copymode
copystat = shutil.copystat
copy = shutil.copy
copy2 = shutil.copy2
copytree = shutil.copytree
if hasattr(shutil, 'move'):
move = shutil.move
rmtree = shutil.rmtree
# --- Special stuff from os
if hasattr(os, 'chroot'):
def chroot(self):
os.chroot(self)
if hasattr(os, 'startfile'):
def startfile(self):
os.startfile(self)
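# --- path.py is deleted by this commit; a rough (illustrative) mapping of its
# conveniences onto the stdlib calls calibre now uses directly:
import os, shutil

p = os.path.join('/home/guido/bin', 'script.py')  # was: path('/home/guido/bin') / 'script.py'
base = os.path.splitext(os.path.basename(p))[0]   # was: p.namebase
parent = os.path.dirname(p)                       # was: p.parent
shutil.copyfile(p, p + '.bak')                    # was: p.copyfile(p + '.bak')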


@@ -1,121 +0,0 @@
import sys, glob, re
import mechanize
URL = 'http://translate.google.com/translate_t?text=%(text)s&langpair=en|%(lang)s&oe=UTF8'
def browser():
opener = mechanize.Browser()
opener.set_handle_refresh(True)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
return opener
class PoFile(object):
SANITIZE = re.compile(r'&|<[^<>]+>|\%')
STRING = re.compile(r'"(.*)"')
def __init__(self, po_file):
self.po_file = open(po_file, 'r+b')
self.browser = browser()
self.entries = []
self.read()
def sanitize_line(self, line):
return self.SANITIZE.sub(line)
def read(self):
translated_lines = []
self.po_file.seek(0)
ID = 0
STR = 1
WHR = 2
mode = None
where, msgid, msgstr, fuzzy = [], [], [], False
for line in self.po_file.readlines():
prev_mode = mode
if line.startswith('#:'):
mode = WHR
elif line.startswith('msgid'):
mode = ID
elif line.startswith('msgstr'):
mode = STR
elif line.startswith('#,'):
fuzzy = True
continue
elif line.startswith('#') or not line.strip():
mode = None
if mode != prev_mode:
if prev_mode == STR:
self.add_entry(where, fuzzy, msgid, msgstr)
where, msgid, msgstr, fuzzy = [], [], [], False
if mode == WHR:
where.append(line[2:].strip())
elif mode == ID:
msgid.append(self.get_string(line))
elif mode == STR:
msgstr.append(self.get_string(line))
elif mode == None:
self.add_line(line)
def get_string(self, line):
return self.STRING.search(line).group(1)
def add_line(self, line):
self.entries.append(line.strip())
def add_entry(self, where, fuzzy, msgid, msgstr):
self.entries.append(Entry(where, fuzzy, msgid, msgstr))
def __str__(self):
return '\n'.join([str(i) for i in self.entries]) + '\n'
class Entry(object):
def __init__(self, where, fuzzy, msgid, msgstr, encoding='utf-8'):
self.fuzzy = fuzzy
self.where = [i.decode(encoding) for i in where]
self.msgid = [i.decode(encoding) for i in msgid]
self.msgstr = [i.decode(encoding) for i in msgstr]
self.encoding = encoding
def __str__(self):
ans = []
for line in self.where:
ans.append('#: ' + line.encode(self.encoding))
if self.fuzzy:
ans.append('#, fuzzy')
first = True
for line in self.msgid:
prefix = 'msgid ' if first else ''
ans.append(prefix + '"%s"'%line.encode(self.encoding))
first = False
first = True
for line in self.msgstr:
prefix = 'msgstr ' if first else ''
ans.append(prefix + '"%s"'%line.encode(self.encoding))
first = False
return '\n'.join(ans)
def main():
po_files = glob.glob('*.po')
for po_file in po_files:
PoFile(po_file)
pass
if __name__ == '__main__':
pof = PoFile('de.po')
open('/tmp/de.po', 'wb').write(str(pof))
#sys.exit(main())


@@ -15,7 +15,10 @@ def available_translations():
    global _available_translations
    if _available_translations is None:
        stats = P('localization/stats.pickle')
-        stats = cPickle.load(open(stats, 'rb'))
+        if os.path.exists(stats):
+            stats = cPickle.load(open(stats, 'rb'))
+        else:
+            stats = {}
        _available_translations = [x for x in stats if stats[x] > 0.1]
    return _available_translations
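# --- With the guard above, a missing localization/stats.pickle now yields an
# empty translation list instead of an IOError at import time. For illustration
# only (the real file is generated elsewhere; the format is an assumption
# inferred from the stats[x] > 0.1 test): a mapping of language code to the
# fraction of strings translated.
import cPickle
cPickle.dump({'de': 0.94, 'fr': 0.87, 'eo': 0.05}, open('stats.pickle', 'wb'), -1)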


@@ -85,7 +85,7 @@ __all__ = [
'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno',
'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral',
'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables',
'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
'indentedBlock', 'originalTextFor',

@@ -425,7 +425,7 @@ class ParseResults(object):
                self[k] = v
                if isinstance(v[0],ParseResults):
                    v[0].__parent = wkref(self)

        self.__toklist += other.__toklist
        self.__accumNames.update( other.__accumNames )
        del other

@@ -3231,12 +3231,12 @@ def originalTextFor(expr, asString=True):
       restore the parsed fields of an HTML start tag into the raw tag text itself, or to
       revert separate tokens with intervening whitespace back to the original matching
       input text. Simpler to use than the parse action keepOriginalText, and does not
       require the inspect module to chase up the call stack. By default, returns a
       string containing the original parsed text.

       If the optional asString argument is passed as False, then the return value is a
       ParseResults containing any results names that were originally matched, and a
       single token containing the original matched text from the input string. So if
       the expression passed to originalTextFor contains expressions with defined
       results names, you must set asString to False if you want to preserve those
       results name values."""

@@ -3252,7 +3252,7 @@ def originalTextFor(expr, asString=True):
        del t["_original_end"]
    matchExpr.setParseAction(extractText)
    return matchExpr

# convenience constants for positional expressions
empty = Empty().setName("empty")
lineStart = LineStart().setName("lineStart")

@@ -3532,7 +3532,7 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
                            ).setParseAction(lambda t:t[0].strip()))
        else:
            if ignoreExpr is not None:
                content = (Combine(OneOrMore(~ignoreExpr +
                                ~Literal(opener) + ~Literal(closer) +
                                CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1))
                            ).setParseAction(lambda t:t[0].strip()))


@@ -20,6 +20,7 @@ class WriteXmlMixin:
    def to_xml(self, encoding = "iso-8859-1"):
        try:
            import cStringIO as StringIO
+            StringIO
        except ImportError:
            import StringIO
        f = StringIO.StringIO()

@@ -64,7 +65,7 @@ def _format_date(dt):
            "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
            dt.year, dt.hour, dt.minute, dt.second)

##
# A couple simple wrapper objects for the fields which
# take a simple value other than a string.

@@ -72,7 +73,7 @@ class IntElement:
    """implements the 'publish' API for integers

    Takes the tag name and the integer value to publish.

    (Could be used for anything which uses str() to be published
    to text for XML.)
    """

@@ -138,7 +139,7 @@ class Image:
        self.width = width
        self.height = height
        self.description = description

    def publish(self, handler):
        handler.startElement("image", self.element_attrs)

@@ -150,7 +151,7 @@ class Image:
        if isinstance(width, int):
            width = IntElement("width", width)
        _opt_element(handler, "width", width)

        height = self.height
        if isinstance(height, int):
            height = IntElement("height", height)

@@ -196,7 +197,7 @@ class TextInput:
        _element(handler, "name", self.name)
        _element(handler, "link", self.link)
        handler.endElement("textInput")

class Enclosure:
    """Publish an enclosure"""

@@ -255,7 +256,7 @@ class RSS2(WriteXmlMixin):
    Stores the channel attributes, with the "category" elements under
    ".categories" and the RSS items under ".items".
    """

    rss_attrs = {"version": "2.0"}
    element_attrs = {}

    def __init__(self,

@@ -269,7 +270,7 @@ class RSS2(WriteXmlMixin):
        webMaster = None,
        pubDate = None,       # a datetime, *in* *GMT*
        lastBuildDate = None, # a datetime

        categories = None,    # list of strings or Category
        generator = _generator_name,
        docs = "http://blogs.law.harvard.edu/tech/rss",

@@ -294,7 +295,7 @@ class RSS2(WriteXmlMixin):
        self.webMaster = webMaster
        self.pubDate = pubDate
        self.lastBuildDate = lastBuildDate

        if categories is None:
            categories = []
        self.categories = categories

@@ -320,7 +321,7 @@ class RSS2(WriteXmlMixin):
        _element(handler, "description", self.description)

        self.publish_extensions(handler)

        _opt_element(handler, "language", self.language)
        _opt_element(handler, "copyright", self.copyright)
        _opt_element(handler, "managingEditor", self.managingEditor)

@@ -374,8 +375,8 @@ class RSS2(WriteXmlMixin):
        # output after the three required fields.
        pass

class RSSItem(WriteXmlMixin):
    """Publish an RSS Item"""

    element_attrs = {}

@@ -391,7 +392,7 @@ class RSSItem(WriteXmlMixin):
        pubDate = None,  # a datetime
        source = None,   # a Source
        ):

        if title is None and description is None:
            raise TypeError(
                "must define at least one of 'title' or 'description'")

@@ -421,7 +422,7 @@ class RSSItem(WriteXmlMixin):
        if isinstance(category, basestring):
            category = Category(category)
        category.publish(handler)

        _opt_element(handler, "comments", self.comments)

        if self.enclosure is not None:
            self.enclosure.publish(handler)

@@ -434,7 +435,7 @@ class RSSItem(WriteXmlMixin):
        if self.source is not None:
            self.source.publish(handler)

        handler.endElement("item")

    def publish_extensions(self, handler):
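# --- Usage sketch for the RSS2/RSSItem classes above. Field values are
# placeholders; title/link/description/lastBuildDate are visible in this diff,
# while the items= keyword is an assumption based on the library's standard
# (PyRSS2Gen-style) API.
import datetime

rss = RSS2(
    title='Example feed',
    link='http://example.com/',
    description='Demo of the WriteXmlMixin shown above',
    lastBuildDate=datetime.datetime.utcnow(),
    items=[RSSItem(title='First post', link='http://example.com/posts/1',
                   pubDate=datetime.datetime.utcnow())])
print rss.to_xml('utf-8')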


@@ -57,13 +57,13 @@ recipe_modules = ['recipe_' + r for r in (
    'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
    'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga',
    'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem',
+    'the_new_republic',
)]

import re, imp, inspect, time, os

from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.path import path
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre import __appname__, english_sort

@@ -102,8 +102,8 @@ def compile_recipe(src):
    '''
    global _tdir, _crep
    if _tdir is None or not os.path.exists(_tdir):
-        _tdir = path(PersistentTemporaryDirectory('_recipes'))
+        _tdir = PersistentTemporaryDirectory('_recipes')
-    temp = _tdir/('recipe%d.py'%_crep)
+    temp = os.path.join(_tdir, 'recipe%d.py'%_crep)
    _crep += 1
    if not isinstance(src, unicode):
        match = re.search(r'coding[:=]\s*([-\w.]+)', src[:200])

@@ -118,8 +118,9 @@ def compile_recipe(src):
    src = src.replace('from libprs500', 'from calibre').encode('utf-8')
    f.write(src)
    f.close()
-    module = imp.find_module(temp.namebase, [temp.dirname()])
-    module = imp.load_module(temp.namebase, *module)
+    module = imp.find_module(os.path.splitext(os.path.basename(temp))[0],
+            [os.path.dirname(temp)])
+    module = imp.load_module(os.path.splitext(os.path.basename(temp))[0], *module)
    classes = inspect.getmembers(module,
        lambda x : inspect.isclass(x) and \
            issubclass(x, (BasicNewsRecipe,)) and \

@@ -148,6 +149,7 @@ _titles.sort(cmp=english_sort)
titles = _titles

def migrate_automatic_profile_to_automatic_recipe(profile):
+    BeautifulSoup
    oprofile = profile
    profile = compile_recipe(profile)
    if 'BasicUserProfile' not in profile.__name__:

@@ -165,3 +167,4 @@ class BasicUserRecipe%d(AutomaticNewsRecipe):
'''%(int(time.time()), repr(profile.title), profile.oldest_article,
        profile.max_articles_per_feed, profile.summary_length, repr(profile.feeds))
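# --- Self-contained sketch of the imp-based load pattern that replaces the
# calibre.path helpers in compile_recipe() above (the temp file name here is
# hypothetical):
import imp, os

temp = '/tmp/recipe0.py'
name = os.path.splitext(os.path.basename(temp))[0]   # 'recipe0', was temp.namebase
found = imp.find_module(name, [os.path.dirname(temp)])  # (file, pathname, description)
module = imp.load_module(name, *found)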


@@ -1,61 +1,61 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.hr
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag

class Cro24Sata(BasicNewsRecipe):
    title = '24 Sata - Hr'
    __author__ = 'Darko Miletic'
    description = "News Portal from Croatia"
    publisher = '24sata.hr'
    category = 'news, politics, Croatia'
    oldest_article = 2
    max_articles_per_feed = 100
    delay = 4
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'hr'
    lang = 'hr-HR'

    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'

    conversion_options = {
        'comment'        : description
        , 'tags'         : category
        , 'publisher'    : publisher
        , 'language'     : lang
        , 'pretty_print' : True
    }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]

    remove_tags = [
        dict(name=['object','link','embed'])
        ,dict(name='table', attrs={'class':'enumbox'})
    ]

    feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')]

    def preprocess_html(self, soup):
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def print_version(self, url):
        return url + '&action=ispis'


@ -1,68 +1,68 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
24sata.rs 24sata.rs
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
class Ser24Sata(BasicNewsRecipe): class Ser24Sata(BasicNewsRecipe):
title = '24 Sata - Sr' title = '24 Sata - Sr'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = '24 sata portal vesti iz Srbije' description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.' publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia' category = 'news, politics, entertainment, Serbia'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
encoding = 'utf-8' encoding = 'utf-8'
use_embedded_content = False use_embedded_content = False
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')] feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
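# Flatten table markup into plain divs and strip the presentational attributes listed below; tables render poorly on small e-ink screens.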
attribs = [ 'style','font','valign' attribs = [ 'style','font','valign'
,'colspan','width','height' ,'colspan','width','height'
,'rowspan','summary','align' ,'rowspan','summary','align'
,'cellspacing','cellpadding' ,'cellspacing','cellpadding'
,'frames','rules','border' ,'frames','rules','border'
] ]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div' item.name = 'div'
for attrib in attribs: for attrib in attribs:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return self.adeify_images(soup) return self.adeify_images(soup)
def print_version(self, url): def print_version(self, url):
article = url.partition('#')[0] article = url.partition('#')[0]
article_id = article.partition('id=')[2] article_id = article.partition('id=')[2]
return 'http://www.24sata.rs/_print.php?id=' + article_id return 'http://www.24sata.rs/_print.php?id=' + article_id


@@ -1,72 +1,72 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
elargentino.com elargentino.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class SieteDias(BasicNewsRecipe): class SieteDias(BasicNewsRecipe):
title = '7 dias' title = '7 dias'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Revista Argentina' description = 'Revista Argentina'
publisher = 'ElArgentino.com' publisher = 'ElArgentino.com'
category = 'news, politics, show, Argentina' category = 'news, politics, show, Argentina'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
language = 'es' language = 'es'
lang = 'es-AR' lang = 'es-AR'
direction = 'ltr' direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html' INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} ' extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
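# html2lrf_options/html2epub_options are the older per-converter way of passing metadata; the conversion_options dict used by other recipes in this commit supersedes them.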
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})] keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')] remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=125&Content-Type=text/xml&ChannelDesc=7%20D%C3%ADas')] feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=125&Content-Type=text/xml&ChannelDesc=7%20D%C3%ADas')]
def print_version(self, url): def print_version(self, url):
main, sep, article_part = url.partition('/nota-') main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-') article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return soup return soup
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'}) cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item: if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src']) clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600' cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url return cover_url
def image_url_processor(self, baseurl, url): def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=') base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&') img, sep2, rrest = rest.partition('&')
return base + sep + img return base + sep + img


@@ -1,59 +1,59 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.accountancyage.com www.accountancyage.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class AccountancyAge(BasicNewsRecipe): class AccountancyAge(BasicNewsRecipe):
title = 'Accountancy Age' title = 'Accountancy Age'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'business news' description = 'business news'
publisher = 'accountancyage.com' publisher = 'accountancyage.com'
category = 'news, politics, finances' category = 'news, politics, finances'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
simultaneous_downloads = 1 simultaneous_downloads = 1
encoding = 'utf-8' encoding = 'utf-8'
lang = 'en' lang = 'en'
language = 'en' language = 'en'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'bodycol'})] keep_only_tags = [dict(name='div', attrs={'class':'bodycol'})]
remove_tags = [dict(name=['embed','object'])] remove_tags = [dict(name=['embed','object'])]
remove_tags_after = dict(name='div', attrs={'id':'permalink'}) remove_tags_after = dict(name='div', attrs={'id':'permalink'})
remove_tags_before = dict(name='div', attrs={'class':'gap6'}) remove_tags_before = dict(name='div', attrs={'class':'gap6'})
feeds = [(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')] feeds = [(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')]
def print_version(self, url): def print_version(self, url):
rest, sep, miss = url.rpartition('/') rest, sep, miss = url.rpartition('/')
rr, ssep, artid = rest.rpartition('/') rr, ssep, artid = rest.rpartition('/')
return u'http://www.accountancyage.com/articles/print/' + artid return u'http://www.accountancyage.com/articles/print/' + artid
def get_article_url(self, article): def get_article_url(self, article):
return article.get('guid', None) return article.get('guid', None)
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return self.adeify_images(soup) return self.adeify_images(soup)


@@ -1,77 +1,77 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.adventuregamers.com www.adventuregamers.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class AdventureGamers(BasicNewsRecipe): class AdventureGamers(BasicNewsRecipe):
title = u'Adventure Gamers' title = u'Adventure Gamers'
language = 'en' language = 'en'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Adventure games portal' description = 'Adventure games portal'
publisher = 'Adventure Gamers' publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology' category = 'news, games, adventure, technology'
oldest_article = 10 oldest_article = 10
delay = 10 delay = 10
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
INDEX = u'http://www.adventuregamers.com' INDEX = u'http://www.adventuregamers.com'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'content_middle'}) dict(name='div', attrs={'class':'content_middle'})
] ]
remove_tags = [ remove_tags = [
dict(name=['object','link','embed','form']) dict(name=['object','link','embed','form'])
,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']}) ,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']})
] ]
remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})] remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')] feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
def get_article_url(self, article): def get_article_url(self, article):
return article.get('guid', None) return article.get('guid', None)
def append_page(self, soup, appendtag, position): def append_page(self, soup, appendtag, position):
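# Follow the 'next page' toolbar link recursively, stripping inline styles and splicing each page's body text into the parent so the article reads as one document.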
pager = soup.find('div',attrs={'class':'toolbar_fat_next'}) pager = soup.find('div',attrs={'class':'toolbar_fat_next'})
if pager: if pager:
nexturl = self.INDEX + pager.a['href'] nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl) soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'bodytext'}) texttag = soup2.find('div', attrs={'class':'bodytext'})
for it in texttag.findAll(style=True): for it in texttag.findAll(style=True):
del it['style'] del it['style']
newpos = len(texttag.contents) newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos) self.append_page(soup2,texttag,newpos)
texttag.extract() texttag.extract()
appendtag.insert(position,texttag) appendtag.insert(position,texttag)
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
self.append_page(soup, soup.body, 3) self.append_page(soup, soup.body, 3)
pager = soup.find('div',attrs={'class':'toolbar_fat'}) pager = soup.find('div',attrs={'class':'toolbar_fat'})
if pager: if pager:
pager.extract() pager.extract()
return soup return soup


@@ -1,62 +1,61 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
ambito.com ambito.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Ambito(BasicNewsRecipe): class Ambito(BasicNewsRecipe):
title = 'Ambito.com' title = 'Ambito.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'Ambito.com' publisher = 'Ambito.com'
category = 'news, politics, Argentina' category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
cover_url = 'http://www.ambito.com/img/logo_.jpg' cover_url = 'http://www.ambito.com/img/logo_.jpg'
remove_javascript = True remove_javascript = True
use_embedded_content = False use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'align':'justify'})] keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link'])] remove_tags = [dict(name=['object','link'])]
feeds = [ feeds = [
(u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' ) (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' )
,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' ) ,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' )
,(u'Politica' , u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica' ) ,(u'Politica' , u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica' )
,(u'Informacion General' , u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General') ,(u'Informacion General' , u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General')
,(u'Agro' , u'http://www.ambito.com/rss/noticias.asp?S=Agro' ) ,(u'Agro' , u'http://www.ambito.com/rss/noticias.asp?S=Agro' )
,(u'Internacionales' , u'http://www.ambito.com/rss/noticias.asp?S=Internacionales' ) ,(u'Internacionales' , u'http://www.ambito.com/rss/noticias.asp?S=Internacionales' )
,(u'Deportes' , u'http://www.ambito.com/rss/noticias.asp?S=Deportes' ) ,(u'Deportes' , u'http://www.ambito.com/rss/noticias.asp?S=Deportes' )
,(u'Espectaculos' , u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos' ) ,(u'Espectaculos' , u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos' )
,(u'Tecnologia' , u'http://www.ambito.com/rss/noticias.asp?S=Tecnologia' ) ,(u'Tecnologia' , u'http://www.ambito.com/rss/noticias.asp?S=Tecnologia' )
,(u'Salud' , u'http://www.ambito.com/rss/noticias.asp?S=Salud' ) ,(u'Salud' , u'http://www.ambito.com/rss/noticias.asp?S=Salud' )
,(u'Ambito Nacional' , u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional' ) ,(u'Ambito Nacional' , u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional' )
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?') return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?')
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>' mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return soup return soup
language = 'es' language = 'es'


@@ -1,55 +1,55 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
spectator.org spectator.org
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe): class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator' title = 'The American Spectator'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = 'en' language = 'en'
description = 'News from USA' description = 'News from USA'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
INDEX = 'http://spectator.org' INDEX = 'http://spectator.org'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , 'news, politics, USA' , '--category' , 'news, politics, USA'
, '--publisher' , title , '--publisher' , title
] ]
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'post inner'}) dict(name='div', attrs={'class':'post inner'})
,dict(name='div', attrs={'class':'author-bio'}) ,dict(name='div', attrs={'class':'author-bio'})
] ]
remove_tags = [ remove_tags = [
dict(name='object') dict(name='object')
,dict(name='div', attrs={'class':'col3' }) ,dict(name='div', attrs={'class':'col3' })
,dict(name='div', attrs={'class':'post-options' }) ,dict(name='div', attrs={'class':'post-options' })
,dict(name='p' , attrs={'class':'letter-editor'}) ,dict(name='p' , attrs={'class':'letter-editor'})
,dict(name='div', attrs={'class':'social' }) ,dict(name='div', attrs={'class':'social' })
] ]
feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')] feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
link_item = soup.find('a',attrs={'class':'cover'}) link_item = soup.find('a',attrs={'class':'cover'})
if link_item: if link_item:
soup2 = self.index_to_soup(link_item['href']) soup2 = self.index_to_soup(link_item['href'])
link_item2 = soup2.find('div',attrs={'class':'post inner issues'}) link_item2 = soup2.find('div',attrs={'class':'post inner issues'})
cover_url = self.INDEX + link_item2.img['src'] cover_url = self.INDEX + link_item2.img['src']
return cover_url return cover_url
def print_version(self, url): def print_version(self, url):
return url + '/print' return url + '/print'


@@ -1,62 +1,62 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
axxon.com.ar axxon.com.ar
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class Axxon_news(BasicNewsRecipe): class Axxon_news(BasicNewsRecipe):
title = 'Axxon noticias' title = 'Axxon noticias'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Axxon, Ciencia Ficcion en Bits' description = 'Axxon, Ciencia Ficcion en Bits'
publisher = 'Axxon' publisher = 'Axxon'
category = 'news, SF, Argentina, science, movies' category = 'news, SF, Argentina, science, movies'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = False no_stylesheets = False
use_embedded_content = False use_embedded_content = False
language = 'es' language = 'es'
lang = 'es-AR' lang = 'es-AR'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
keep_only_tags = [dict(name='div', attrs={'class':'post'})] keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [dict(name=['object','link','iframe','embed'])] remove_tags = [dict(name=['object','link','iframe','embed'])]
feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')] feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')]
remove_attributes = ['style','width','height','font','border','align'] remove_attributes = ['style','width','height','font','border','align']
def adeify_images2(self, soup): def adeify_images2(self, soup):
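# Strip layout attributes from every image and, for images wrapped in links, rename the wrapping <a> to <p> and add a <br/> after the image; a site-specific variant of BasicNewsRecipe.adeify_images.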
for item in soup.findAll('img'): for item in soup.findAll('img'):
for attrib in ['height','width','border','align','style']: for attrib in ['height','width','border','align','style']:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
oldParent = item.parent oldParent = item.parent
if oldParent.name == 'a': if oldParent.name == 'a':
oldParent.name = 'p' oldParent.name = 'p'
myIndex = oldParent.contents.index(item) myIndex = oldParent.contents.index(item)
brtag = Tag(soup,'br') brtag = Tag(soup,'br')
oldParent.insert(myIndex+1,brtag) oldParent.insert(myIndex+1,brtag)
return soup return soup
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.html.insert(0,mlang) soup.html.insert(0,mlang)
return self.adeify_images2(soup) return self.adeify_images2(soup)


@@ -1,65 +1,65 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.azstarnet.com www.azstarnet.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Azstarnet(BasicNewsRecipe): class Azstarnet(BasicNewsRecipe):
title = 'Arizona Daily Star' title = 'Arizona Daily Star'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'news from Arizona' description = 'news from Arizona'
language = 'en' language = 'en'
publisher = 'azstarnet.com' publisher = 'azstarnet.com'
category = 'news, politics, Arizona, USA' category = 'news, politics, Arizona, USA'
delay = 1 delay = 1
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
needs_subscription = True needs_subscription = True
remove_javascript = True remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
def get_browser(self): def get_browser(self):
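# The Arizona Daily Star requires a subscription; log the shared browser in with the user's credentials before any articles are fetched.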
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
br.open('http://azstarnet.com/registration/retro.php') br.open('http://azstarnet.com/registration/retro.php')
br.select_form(nr=1) br.select_form(nr=1)
br['email'] = self.username br['email'] = self.username
br['pass' ] = self.password br['pass' ] = self.password
br.submit() br.submit()
return br return br
keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})] keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [ remove_tags = [
dict(name=['object','link','iframe','base','img']) dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'class':'bannerinstory'}) ,dict(name='div',attrs={'class':'bannerinstory'})
] ]
feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')] feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['dir' ] = 'ltr' soup.html['dir' ] = 'ltr'
soup.html['lang'] = 'en-US' soup.html['lang'] = 'en-US'
mtag = '\n<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n' mtag = '\n<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return soup return soup


@@ -1,69 +1,69 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
b92.net b92.net
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe): class B92(BasicNewsRecipe):
title = 'B92' title = 'B92'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Dnevne vesti iz Srbije i sveta' description = 'Dnevne vesti iz Srbije i sveta'
publisher = 'B92' publisher = 'B92'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1250' encoding = 'cp1250'
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='table', attrs={'class':'maindocument'})] keep_only_tags = [dict(name='table', attrs={'class':'maindocument'})]
remove_tags = [ remove_tags = [
dict(name='ul', attrs={'class':'comment-nav'}) dict(name='ul', attrs={'class':'comment-nav'})
,dict(name=['embed','link','base'] ) ,dict(name=['embed','link','base'] )
,dict(name='div', attrs={'class':'udokum'} ) ,dict(name='div', attrs={'class':'udokum'} )
] ]
feeds = [ feeds = [
(u'Vesti', u'http://www.b92.net/info/rss/vesti.xml') (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml')
,(u'Biz' , u'http://www.b92.net/info/rss/biz.xml' ) ,(u'Biz' , u'http://www.b92.net/info/rss/biz.xml' )
] ]
def print_version(self, url): def print_version(self, url):
return url + '&version=print' return url + '&version=print'
def preprocess_html(self, soup): def preprocess_html(self, soup):
del soup.body['onload'] del soup.body['onload']
for item in soup.findAll('font'): for item in soup.findAll('font'):
item.name='div' item.name='div'
if item.has_key('size'): if item.has_key('size'):
del item['size'] del item['size']
attribs = [ 'style','font','valign' attribs = [ 'style','font','valign'
,'colspan','width','height' ,'colspan','width','height'
,'rowspan','summary','align' ,'rowspan','summary','align'
,'cellspacing','cellpadding' ,'cellspacing','cellpadding'
,'frames','rules','border' ,'frames','rules','border'
] ]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div' item.name = 'div'
for attrib in attribs: for attrib in attribs:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
return soup return soup


@@ -1,93 +1,93 @@
## ##
## web2lrf profile to download articles from Barrons.com ## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and ## can download subscriber-only content if username and
## password are supplied. ## password are supplied.
## ##
''' '''
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Barrons(BasicNewsRecipe): class Barrons(BasicNewsRecipe):
title = 'Barron\'s' title = 'Barron\'s'
max_articles_per_feed = 50 max_articles_per_feed = 50
needs_subscription = True needs_subscription = True
language = 'en' language = 'en'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
description = 'Weekly publication for investors from the publisher of the Wall Street Journal' description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
timefmt = ' [%a, %b %d, %Y]' timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False use_embedded_content = False
no_stylesheets = False no_stylesheets = False
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*'] match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
conversion_options = {'linearize_tables': True} conversion_options = {'linearize_tables': True}
##delay = 1 ##delay = 1
## Don't grab articles more than 7 days old ## Don't grab articles more than 7 days old
oldest_article = 7 oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
## Remove anything before the body of the article. ## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article. ## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'), (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove any reprint info from the body of the article. ## Remove any reprint info from the body of the article.
(r'<hr size.*?<p', lambda match : '<p'), (r'<hr size.*?<p', lambda match : '<p'),
## Remove anything after the end of the article. ## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'), (r'<!-- article end.*?</body>', lambda match : '</body>'),
] ]
] ]
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login') br.open('http://commerce.barrons.com/auth/login')
br.select_form(name='login_form') br.select_form(name='login_form')
br['user'] = self.username br['user'] = self.username
br['password'] = self.password br['password'] = self.password
br.submit() br.submit()
return br return br
## Use the print version of a page when available. ## Use the print version of a page when available.
def print_version(self, url): def print_version(self, url):
return url.replace('/article/', '/article_print/') return url.replace('/article/', '/article_print/')
## Comment out the feeds you don't want retrieved. ## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
def get_feeds(self): def get_feeds(self):
return [ return [
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'), ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'), ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'), ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'), ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'), ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'), ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
] ]
## Logout of website ## Logout of website
## NOT CURRENTLY WORKING ## NOT CURRENTLY WORKING
# def cleanup(self): # def cleanup(self):
# try: # try:
# self.browser.set_debug_responses(True) # self.browser.set_debug_responses(True)
# import sys, logging # import sys, logging
# logger = logging.getLogger("mechanize") # logger = logging.getLogger("mechanize")
# logger.addHandler(logging.StreamHandler(sys.stdout)) # logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.INFO) # logger.setLevel(logging.INFO)
# res = self.browser.open('http://online.barrons.com/logout') # res = self.browser.open('http://online.barrons.com/logout')
# except: # except:
# import traceback # import traceback
# traceback.print_exc() # traceback.print_exc()


@@ -1,35 +1,35 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Huan Komrade T <huantnh at gmail.com>' __copyright__ = '2009, Huan Komrade T <huantnh at gmail.com>'
''' '''
bbc.co.uk bbc.co.uk
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class BBCVietnamese(BasicNewsRecipe): class BBCVietnamese(BasicNewsRecipe):
title = u'BBC Vietnamese' title = u'BBC Vietnamese'
__author__ = 'Huan Komrade T' __author__ = 'Huan Komrade T'
description = 'Vietnam news and current affairs from the British Broadcasting Corporation' description = 'Vietnam news and current affairs from the British Broadcasting Corporation'
no_stylesheets = True no_stylesheets = True
language = 'vi' language = 'vi'
encoding = 'utf-8' encoding = 'utf-8'
recursions = 0 recursions = 0
remove_tags = [dict(name='div', attrs={'class':'footer'})] remove_tags = [dict(name='div', attrs={'class':'footer'})]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
feeds = [ feeds = [
('Index', 'http://www.bbc.co.uk/vietnamese/index.xml'), ('Index', 'http://www.bbc.co.uk/vietnamese/index.xml'),
('Vietnam', 'http://www.bbc.co.uk/vietnamese/vietnam/index.xml'), ('Vietnam', 'http://www.bbc.co.uk/vietnamese/vietnam/index.xml'),
('Business', 'http://www.bbc.co.uk/vietnamese/business/index.xml'), ('Business', 'http://www.bbc.co.uk/vietnamese/business/index.xml'),
('Culture', 'http://www.bbc.co.uk/vietnamese/culture/index.xml'), ('Culture', 'http://www.bbc.co.uk/vietnamese/culture/index.xml'),
('Football', 'http://www.bbc.co.uk/vietnamese/football/index.xml'), ('Football', 'http://www.bbc.co.uk/vietnamese/football/index.xml'),
('Forum', 'http://www.bbc.co.uk/vietnamese/forum/index.xml'), ('Forum', 'http://www.bbc.co.uk/vietnamese/forum/index.xml'),
('In Depth', 'http://www.bbc.co.uk/vietnamese/indepth/index.xml'), ('In Depth', 'http://www.bbc.co.uk/vietnamese/indepth/index.xml'),
] ]
def print_version(self, url): def print_version(self, url):
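# Swap in the /lg/ ('low graphics') edition of each article, which is lighter-weight and converts more cleanly.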
return url.replace('http://www.bbc.co.uk/vietnamese/', 'http://www.bbc.co.uk/vietnamese/lg/') return url.replace('http://www.bbc.co.uk/vietnamese/', 'http://www.bbc.co.uk/vietnamese/lg/')


@@ -1,51 +1,51 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
beta.rs beta.rs
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
class Danas(BasicNewsRecipe): class Danas(BasicNewsRecipe):
title = 'BETA' title = 'BETA'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Novinska Agencija' description = 'Novinska Agencija'
publisher = 'Beta' publisher = 'Beta'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = False no_stylesheets = False
use_embedded_content = True use_embedded_content = True
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
direction = 'ltr' direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [ feeds = [
(u'Vesti dana', u'http://www.beta.rs/rssvd.asp') (u'Vesti dana', u'http://www.beta.rs/rssvd.asp')
,(u'Ekonomija' , u'http://www.beta.rs/rssek.asp') ,(u'Ekonomija' , u'http://www.beta.rs/rssek.asp')
,(u'Sport' , u'http://www.beta.rs/rsssp.asp') ,(u'Sport' , u'http://www.beta.rs/rsssp.asp')
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return self.adeify_images(soup) return self.adeify_images(soup)


@@ -1,38 +1,37 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
beta.rs beta.rs
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe): class Danas(BasicNewsRecipe):
title = 'BETA - English' title = 'BETA - English'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Serbian news agency' description = 'Serbian news agency'
publisher = 'Beta' publisher = 'Beta'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = False no_stylesheets = False
use_embedded_content = True use_embedded_content = True
language = 'en' language = 'en'
lang = 'en' lang = 'en'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
feeds = [(u'News', u'http://www.beta.rs/rssen.asp')] feeds = [(u'News', u'http://www.beta.rs/rssen.asp')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
return self.adeify_images(soup) return self.adeify_images(soup)


@@ -1,66 +1,65 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
blic.rs blic.rs
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Blic(BasicNewsRecipe): class Blic(BasicNewsRecipe):
title = 'Blic' title = 'Blic'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
publisher = 'RINGIER d.o.o.' publisher = 'RINGIER d.o.o.'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
delay = 1 delay = 1
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} ' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'single_news'})] keep_only_tags = [dict(name='div', attrs={'class':'single_news'})]
feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')] feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')]
remove_tags = [dict(name=['object','link'])] remove_tags = [dict(name=['object','link'])]
def print_version(self, url): def print_version(self, url):
rest_url = url.partition('?')[2] rest_url = url.partition('?')[2]
return u'http://www.blic.rs/_print.php?' + rest_url return u'http://www.blic.rs/_print.php?' + rest_url
def preprocess_html(self, soup): def preprocess_html(self, soup):
attribs = [ 'style','font','valign' attribs = [ 'style','font','valign'
,'colspan','width','height' ,'colspan','width','height'
,'rowspan','summary','align' ,'rowspan','summary','align'
,'cellspacing','cellpadding' ,'cellspacing','cellpadding'
,'frames','rules','border' ,'frames','rules','border'
] ]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div' item.name = 'div'
for attrib in attribs: for attrib in attribs:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
return self.adeify_images(soup) return self.adeify_images(soup)
def get_article_url(self, article): def get_article_url(self, article):
raw = article.get('link', None) raw = article.get('link', None)
return raw.replace('.co.yu','.rs') return raw.replace('.co.yu','.rs')


@@ -1,95 +1,95 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
borba.rs borba.rs
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe): class Borba(BasicNewsRecipe):
title = 'Borba Online' title = 'Borba Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online' description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba' publisher = 'IP Novine Borba'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
encoding = 'utf-8' encoding = 'utf-8'
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg' cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/' INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} ' extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})] keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'}) remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [ remove_tags = [
dict(name=['object','link','iframe','base','img']) dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'}) ,dict(name='div',attrs={'id':'written_comments_title'})
] ]
feeds = [ feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/') (u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' ) ,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' ) ,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' ) ,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' ) ,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' ) ,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' ) ,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' ) ,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/') ,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
attribs = [ 'style','font','valign' attribs = [ 'style','font','valign'
,'colspan','width','height' ,'colspan','width','height'
,'rowspan','summary','align' ,'rowspan','summary','align'
,'cellspacing','cellpadding' ,'cellspacing','cellpadding'
,'frames','rules','border' ,'frames','rules','border'
] ]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div' item.name = 'div'
for attrib in attribs: for attrib in attribs:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
return soup return soup
def parse_index(self): def parse_index(self):
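# The feed URLs above are plain section pages, not RSS, so build the article list by scraping each one for links with class 'contentpagetitle'.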
totalfeeds = [] totalfeeds = []
lfeeds = self.get_feeds() lfeeds = self.get_feeds()
for feedobj in lfeeds: for feedobj in lfeeds:
feedtitle, feedurl = feedobj feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = [] articles = []
soup = self.index_to_soup(feedurl) soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}): for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href'] url = item['href']
title = self.tag_to_string(item) title = self.tag_to_string(item)
articles.append({ articles.append({
'title' :title 'title' :title
,'date' :'' ,'date' :''
,'url' :url ,'url' :url
,'description':'' ,'description':''
}) })
totalfeeds.append((feedtitle, articles)) totalfeeds.append((feedtitle, articles))
return totalfeeds return totalfeeds


@@ -1,72 +1,72 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
elargentino.com elargentino.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class BsAsEconomico(BasicNewsRecipe): class BsAsEconomico(BasicNewsRecipe):
title = 'Buenos Aires Economico' title = 'Buenos Aires Economico'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Revista Argentina' description = 'Revista Argentina'
publisher = 'ElArgentino.com' publisher = 'ElArgentino.com'
category = 'news, politics, economy, Argentina' category = 'news, politics, economy, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
language = 'es' language = 'es'
lang = 'es-AR' lang = 'es-AR'
direction = 'ltr' direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/121/Buenos-Aires-Economico.html' INDEX = 'http://www.elargentino.com/medios/121/Buenos-Aires-Economico.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} ' extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})] keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')] remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=121&Content-Type=text/xml&ChannelDesc=Buenos%20Aires%20Econ%C3%B3mico')] feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=121&Content-Type=text/xml&ChannelDesc=Buenos%20Aires%20Econ%C3%B3mico')]
def print_version(self, url): def print_version(self, url):
main, sep, article_part = url.partition('/nota-') main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-') article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return soup return soup
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'}) cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item: if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src']) clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600' cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url return cover_url
def image_url_processor(self, baseurl, url): def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=') base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&') img, sep2, rrest = rest.partition('&')
return base + sep + img return base + sep + img

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
chicagobreakingnews.com chicagobreakingnews.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ChicagoBreakingNews(BasicNewsRecipe): class ChicagoBreakingNews(BasicNewsRecipe):
title = 'Chicago Breaking News' title = 'Chicago Breaking News'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Breaking News from Chicago' description = 'Breaking News from Chicago'
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = True use_embedded_content = True
publisher = 'Chicago Breaking News' publisher = 'Chicago Breaking News'
category = 'news, politics, USA, Chicago' category = 'news, politics, USA, Chicago'
encoding = 'utf8' encoding = 'utf8'
language = 'en' language = 'en'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Breaking news', u'http://feeds2.feedburner.com/ChicagoBreakingNews/')] feeds = [(u'Breaking news', u'http://feeds2.feedburner.com/ChicagoBreakingNews/')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
links = soup.findAll('a') links = soup.findAll('a')
for item in soup.findAll('a'): for item in soup.findAll('a'):
if item['href'].find('http://feedads.googleadservices.com') > -1: if item['href'].find('http://feedads.googleadservices.com') > -1:
item.extract() item.extract()
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
for item in soup.findAll(color=True): for item in soup.findAll(color=True):
del item['color'] del item['color']
for item in soup.findAll(size=True): for item in soup.findAll(size=True):
del item['size'] del item['size']
return soup return soup

View File

@@ -3,11 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re
from urlparse import urlparse, urlunparse
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from threading import RLock
class ChicagoTribune(BasicNewsRecipe): class ChicagoTribune(BasicNewsRecipe):

View File

@@ -1,73 +1,73 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
clarin.com clarin.com
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class Clarin(BasicNewsRecipe): class Clarin(BasicNewsRecipe):
title = 'Clarin' title = 'Clarin'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina y mundo' description = 'Noticias de Argentina y mundo'
publisher = 'Grupo Clarin' publisher = 'Grupo Clarin'
category = 'news, politics, Argentina' category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg') cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg')
remove_javascript = True remove_javascript = True
encoding = 'cp1252' encoding = 'cp1252'
language = 'es' language = 'es'
lang = 'es-AR' lang = 'es-AR'
direction = 'ltr' direction = 'ltr'
extra_css = ' .Txt{ font-family: sans-serif } .Volan{ font-family: sans-serif; font-size: x-small} .Pie{ font-family: sans-serif; font-size: x-small} .Copete{font-family: sans-serif; font-size: large} .Hora{font-family: sans-serif; font-size: large} .Autor{font-family: sans-serif; font-size: small} ' extra_css = ' .Txt{ font-family: sans-serif } .Volan{ font-family: sans-serif; font-size: x-small} .Pie{ font-family: sans-serif; font-size: x-small} .Copete{font-family: sans-serif; font-size: large} .Hora{font-family: sans-serif; font-size: large} .Autor{font-family: sans-serif; font-size: small} '
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
remove_tags = [ remove_tags = [
dict(name='a' , attrs={'class':'Imp' }) dict(name='a' , attrs={'class':'Imp' })
,dict(name='div' , attrs={'class':'Perma' }) ,dict(name='div' , attrs={'class':'Perma' })
,dict(name='h1' , text='Imprimir' ) ,dict(name='h1' , text='Imprimir' )
] ]
feeds = [ feeds = [
(u'Ultimo Momento', u'http://www.clarin.com/diario/hoy/um/sumariorss.xml') (u'Ultimo Momento', u'http://www.clarin.com/diario/hoy/um/sumariorss.xml')
,(u'El Pais' , u'http://www.clarin.com/diario/hoy/elpais.xml' ) ,(u'El Pais' , u'http://www.clarin.com/diario/hoy/elpais.xml' )
,(u'Opinion' , u'http://www.clarin.com/diario/hoy/opinion.xml' ) ,(u'Opinion' , u'http://www.clarin.com/diario/hoy/opinion.xml' )
,(u'El Mundo' , u'http://www.clarin.com/diario/hoy/elmundo.xml' ) ,(u'El Mundo' , u'http://www.clarin.com/diario/hoy/elmundo.xml' )
,(u'Sociedad' , u'http://www.clarin.com/diario/hoy/sociedad.xml' ) ,(u'Sociedad' , u'http://www.clarin.com/diario/hoy/sociedad.xml' )
,(u'La Ciudad' , u'http://www.clarin.com/diario/hoy/laciudad.xml' ) ,(u'La Ciudad' , u'http://www.clarin.com/diario/hoy/laciudad.xml' )
,(u'Policiales' , u'http://www.clarin.com/diario/hoy/policiales.xml' ) ,(u'Policiales' , u'http://www.clarin.com/diario/hoy/policiales.xml' )
,(u'Deportes' , u'http://www.clarin.com/diario/hoy/deportes.xml' ) ,(u'Deportes' , u'http://www.clarin.com/diario/hoy/deportes.xml' )
] ]
def print_version(self, url): def print_version(self, url):
rest = url.partition('-0')[-1] rest = url.partition('-0')[-1]
lmain = rest.partition('.')[0] lmain = rest.partition('.')[0]
lurl = u'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain lurl = u'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain
return lurl return lurl
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
return soup return soup

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
climateprogress.org climateprogress.org
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
class ClimateProgress(BasicNewsRecipe): class ClimateProgress(BasicNewsRecipe):
title = 'Climate Progress' title = 'Climate Progress'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = "An insider's view of climate science, politics and solutions" description = "An insider's view of climate science, politics and solutions"
publisher = 'Climate Progress' publisher = 'Climate Progress'
category = 'news, ecology, climate, blog' category = 'news, ecology, climate, blog'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = True use_embedded_content = True
encoding = 'utf-8' encoding = 'utf-8'
language = 'en' language = 'en'
lang = 'en-US' lang = 'en-US'
direction = 'ltr' direction = 'ltr'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds.feedburner.com/climateprogress/lCrX')] feeds = [(u'Posts', u'http://feeds.feedburner.com/climateprogress/lCrX')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return self.adeify_images(soup) return self.adeify_images(soup)

View File

@@ -1,41 +1,41 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.codinghorror.com/blog/ www.codinghorror.com/blog/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CodingHorror(BasicNewsRecipe): class CodingHorror(BasicNewsRecipe):
title = 'Coding Horror' title = 'Coding Horror'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'programming and human factors - Jeff Atwood' description = 'programming and human factors - Jeff Atwood'
category = 'blog, programming' category = 'blog, programming'
publisher = 'Jeff Atwood' publisher = 'Jeff Atwood'
language = 'en' language = 'en'
author = 'Jeff Atwood' author = 'Jeff Atwood'
oldest_article = 30 oldest_article = 30
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = True use_embedded_content = True
encoding = 'cp1252' encoding = 'cp1252'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
, '--author' , author , '--author' , author
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nauthors="' + author + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nauthors="' + author + '"'
remove_tags = [ remove_tags = [
dict(name=['object','link']) dict(name=['object','link'])
,dict(name='div',attrs={'class':'feedflare'}) ,dict(name='div',attrs={'class':'feedflare'})
] ]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/codinghorror' )] feeds = [(u'Articles', u'http://feeds2.feedburner.com/codinghorror' )]

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.corriere.it/english www.corriere.it/english
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_en(BasicNewsRecipe): class Corriere_en(BasicNewsRecipe):
title = 'Corriere della Sera in English' title = 'Corriere della Sera in English'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Milan and Italy' description = 'News from Milan and Italy'
oldest_article = 15 oldest_article = 15
publisher = 'Corriere della Sera' publisher = 'Corriere della Sera'
category = 'news, politics, Italy' category = 'news, politics, Italy'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True remove_javascript = True
language = 'en' language = 'en'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [ remove_tags = [
dict(name=['base','object','link','embed','img']) dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'}) ,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'}) ,dict(name='ul', attrs={'class':'toolbar'})
] ]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')] feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')]

View File

@@ -1,56 +1,56 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.corriere.it www.corriere.it
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_it(BasicNewsRecipe): class Corriere_it(BasicNewsRecipe):
title = 'Corriere della Sera' title = 'Corriere della Sera'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Milan and Italy' description = 'News from Milan and Italy'
oldest_article = 7 oldest_article = 7
publisher = 'Corriere della Sera' publisher = 'Corriere della Sera'
category = 'news, politics, Italy' category = 'news, politics, Italy'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True remove_javascript = True
language = 'it' language = 'it'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--category', category , '--category', category
, '--publisher', publisher , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [ remove_tags = [
dict(name=['base','object','link','embed','img']) dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'}) ,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'}) ,dict(name='ul', attrs={'class':'toolbar'})
] ]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [ feeds = [
(u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' ) (u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' )
,(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' ) ,(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' )
,(u'Economia' , u'http://www.corriere.it/rss/economia.xml' ) ,(u'Economia' , u'http://www.corriere.it/rss/economia.xml' )
,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml') ,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml')
,(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' ) ,(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' )
,(u'Politica' , u'http://www.corriere.it/rss/politica.xml' ) ,(u'Politica' , u'http://www.corriere.it/rss/politica.xml' )
,(u'Salute' , u'http://www.corriere.it/rss/salute.xml' ) ,(u'Salute' , u'http://www.corriere.it/rss/salute.xml' )
,(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' ) ,(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' )
,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml') ,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml')
,(u'Sport' , u'http://www.corriere.it/rss/sport.xml' ) ,(u'Sport' , u'http://www.corriere.it/rss/sport.xml' )
] ]

View File

@@ -7,7 +7,6 @@ Courrier International
''' '''
import re import re
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CourrierInternational(BasicNewsRecipe): class CourrierInternational(BasicNewsRecipe):
@@ -21,12 +20,12 @@ class CourrierInternational(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
html2lrf_options = ['--base-font-size', '10'] html2lrf_options = ['--base-font-size', '10']
feeds = [ feeds = [
# Some articles requiring subscription fail on download. # Some articles requiring subscription fail on download.
('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'), ('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'),
] ]
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
[ [
#Handle Depeches #Handle Depeches
@@ -35,8 +34,8 @@ class CourrierInternational(BasicNewsRecipe):
(r'.*<td [^>]*>(Courrier international.*?) <td width="10"><img src="/img/espaceur.gif"></td>.*', lambda match : '<html><body><table><tr><td>'+match.group(1)+'</body></html>'), (r'.*<td [^>]*>(Courrier international.*?) <td width="10"><img src="/img/espaceur.gif"></td>.*', lambda match : '<html><body><table><tr><td>'+match.group(1)+'</body></html>'),
] ]
] ]
def print_version(self, url): def print_version(self, url):
return re.sub('/[a-zA-Z]+\.asp','/imprimer.asp' ,url) return re.sub('/[a-zA-Z]+\.asp','/imprimer.asp' ,url)

View File

@@ -1,62 +1,62 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
criticadigital.com criticadigital.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CriticaDigital(BasicNewsRecipe): class CriticaDigital(BasicNewsRecipe):
title = 'Critica de la Argentina' title = 'Critica de la Argentina'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina' description = 'Noticias de Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
language = 'es' language = 'es'
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , 'news, Argentina' , '--category' , 'news, Argentina'
, '--publisher' , title , '--publisher' , title
] ]
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'bloqueTitulosNoticia'}) dict(name='div', attrs={'class':'bloqueTitulosNoticia'})
,dict(name='div', attrs={'id':'c453-1' }) ,dict(name='div', attrs={'id':'c453-1' })
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'box300' }) dict(name='div', attrs={'class':'box300' })
,dict(name='div', style=True ) ,dict(name='div', style=True )
,dict(name='div', attrs={'class':'titcomentario'}) ,dict(name='div', attrs={'class':'titcomentario'})
,dict(name='div', attrs={'class':'comentario' }) ,dict(name='div', attrs={'class':'comentario' })
,dict(name='div', attrs={'class':'paginador' }) ,dict(name='div', attrs={'class':'paginador' })
] ]
feeds = [ feeds = [
(u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' ) (u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' )
,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' ) ,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' )
,(u'Deportes', u'http://www.criticadigital.com/herramientas/rss.php?ch=deportes' ) ,(u'Deportes', u'http://www.criticadigital.com/herramientas/rss.php?ch=deportes' )
,(u'Espectaculos', u'http://www.criticadigital.com/herramientas/rss.php?ch=espectaculos') ,(u'Espectaculos', u'http://www.criticadigital.com/herramientas/rss.php?ch=espectaculos')
,(u'Mundo', u'http://www.criticadigital.com/herramientas/rss.php?ch=mundo' ) ,(u'Mundo', u'http://www.criticadigital.com/herramientas/rss.php?ch=mundo' )
,(u'Policiales', u'http://www.criticadigital.com/herramientas/rss.php?ch=policiales' ) ,(u'Policiales', u'http://www.criticadigital.com/herramientas/rss.php?ch=policiales' )
,(u'Sociedad', u'http://www.criticadigital.com/herramientas/rss.php?ch=sociedad' ) ,(u'Sociedad', u'http://www.criticadigital.com/herramientas/rss.php?ch=sociedad' )
,(u'Salud', u'http://www.criticadigital.com/herramientas/rss.php?ch=salud' ) ,(u'Salud', u'http://www.criticadigital.com/herramientas/rss.php?ch=salud' )
,(u'Tecnologia', u'http://www.criticadigital.com/herramientas/rss.php?ch=tecnologia' ) ,(u'Tecnologia', u'http://www.criticadigital.com/herramientas/rss.php?ch=tecnologia' )
,(u'Santa Fe', u'http://www.criticadigital.com/herramientas/rss.php?ch=santa_fe' ) ,(u'Santa Fe', u'http://www.criticadigital.com/herramientas/rss.php?ch=santa_fe' )
] ]
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
index = 'http://www.criticadigital.com/impresa/' index = 'http://www.criticadigital.com/impresa/'
soup = self.index_to_soup(index) soup = self.index_to_soup(index)
link_item = soup.find('div',attrs={'class':'tapa'}) link_item = soup.find('div',attrs={'class':'tapa'})
if link_item: if link_item:
cover_url = index + link_item.img['src'] cover_url = index + link_item.img['src']
return cover_url return cover_url

View File

@@ -1,45 +1,44 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
newyorker.com newyorker.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class CubaDebate(BasicNewsRecipe): class CubaDebate(BasicNewsRecipe):
title = 'CubaDebate' title = 'CubaDebate'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Contra el Terorismo Mediatico' description = 'Contra el Terorismo Mediatico'
oldest_article = 15 oldest_article = 15
language = 'es' language = 'es'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
publisher = 'Cubadebate' publisher = 'Cubadebate'
category = 'news, politics, Cuba' category = 'news, politics, Cuba'
encoding = 'utf-8' encoding = 'utf-8'
extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} ' extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} '
conversion_options = { conversion_options = {
'comments' : description 'comments' : description
,'tags' : category ,'tags' : category
,'language' : 'es' ,'language' : 'es'
,'publisher' : publisher ,'publisher' : publisher
,'pretty_print': True ,'pretty_print': True
} }
keep_only_tags = [dict(name='div', attrs={'id':'Outline'})] keep_only_tags = [dict(name='div', attrs={'id':'Outline'})]
remove_tags_after = dict(name='div',attrs={'id':'BlogContent'}) remove_tags_after = dict(name='div',attrs={'id':'BlogContent'})
remove_tags = [dict(name='link')] remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')] feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')]
def print_version(self, url): def print_version(self, url):
return url + 'print/' return url + 'print/'
def preprocess_html(self, soup): def preprocess_html(self, soup):
return self.adeify_images(soup) return self.adeify_images(soup)

View File

@@ -1,34 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class TheDailyMail(BasicNewsRecipe): class TheDailyMail(BasicNewsRecipe):
title = u'The Daily Mail' title = u'The Daily Mail'
oldest_article = 2 oldest_article = 2
language = 'en' language = 'en'
author = 'RufusA' author = 'RufusA'
simultaneous_downloads= 1 simultaneous_downloads= 1
max_articles_per_feed = 50 max_articles_per_feed = 50
extra_css = 'h1 {text-align: left;}' extra_css = 'h1 {text-align: left;}'
remove_tags = [ dict(name='ul', attrs={'class':'article-icons-links'}) ] remove_tags = [ dict(name='ul', attrs={'class':'article-icons-links'}) ]
remove_tags_after = dict(name='h3', attrs={'class':'social-links-title'}) remove_tags_after = dict(name='h3', attrs={'class':'social-links-title'})
remove_tags_before = dict(name='div', attrs={'id':'content'}) remove_tags_before = dict(name='div', attrs={'id':'content'})
no_stylesheets = True no_stylesheets = True
feeds = [ feeds = [
(u'Home', u'http://www.dailymail.co.uk/home/index.rss'), (u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
(u'News', u'http://www.dailymail.co.uk/news/index.rss'), (u'News', u'http://www.dailymail.co.uk/news/index.rss'),
(u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'), (u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
(u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'), (u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
(u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'), (u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
(u'Health', u'http://www.dailymail.co.uk/health/index.rss'), (u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
(u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'), (u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
(u'Money', u'http://www.dailymail.co.uk/money/index.rss'), (u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
(u'Property', u'http://www.dailymail.co.uk/property/index.rss'), (u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
(u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'), (u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
(u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')] (u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')]
def print_version(self, url): def print_version(self, url):
main = url.partition('?')[0] main = url.partition('?')[0]
return main + '?printingPage=true' return main + '?printingPage=true'

View File

@@ -1,62 +1,62 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
danas.rs danas.rs
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.ebooks.BeautifulSoup import Tag
class Danas(BasicNewsRecipe): class Danas(BasicNewsRecipe):
title = 'Danas' title = 'Danas'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Vesti' description = 'Vesti'
publisher = 'Danas d.o.o.' publisher = 'Danas d.o.o.'
category = 'news, politics, Serbia' category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = False no_stylesheets = False
use_embedded_content = False use_embedded_content = False
language = 'sr' language = 'sr'
lang = 'sr-Latn-RS' lang = 'sr-Latn-RS'
direction = 'ltr' direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
, 'tags' : category , 'tags' : category
, 'publisher' : publisher , 'publisher' : publisher
, 'language' : lang , 'language' : lang
, 'pretty_print' : True , 'pretty_print' : True
} }
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'left'})] keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']}) dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'id':'comments'}) ,dict(name='div', attrs={'id':'comments'})
,dict(name=['object','link']) ,dict(name=['object','link'])
] ]
feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')] feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')]
def preprocess_html(self, soup): def preprocess_html(self, soup):
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
attribs = [ 'style','font','valign' attribs = [ 'style','font','valign'
,'colspan','width','height' ,'colspan','width','height'
,'rowspan','summary','align' ,'rowspan','summary','align'
,'cellspacing','cellpadding' ,'cellspacing','cellpadding'
,'frames','rules','border' ,'frames','rules','border'
] ]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']): for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div' item.name = 'div'
for attrib in attribs: for attrib in attribs:
if item.has_key(attrib): if item.has_key(attrib):
del item[attrib] del item[attrib]
return soup return soup

View File

@@ -1,76 +1,76 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.nieuwsblad.be www.nieuwsblad.be
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class DeGentenaarOnline(BasicNewsRecipe): class DeGentenaarOnline(BasicNewsRecipe):
title = 'De Gentenaar Online' title = 'De Gentenaar Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch' description = 'News from Belgium in Dutch'
publisher = 'De Gentenaar' publisher = 'De Gentenaar'
category = 'news, politics, Belgium' category = 'news, politics, Belgium'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
language = 'nl' language = 'nl'
lang = 'nl-BE' lang = 'nl-BE'
direction = 'ltr' direction = 'ltr'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})] keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
remove_tags = [dict(name=['embed','object'])] remove_tags = [dict(name=['embed','object'])]
feeds = [ feeds = [
(u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws' ) (u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws' )
,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland' ) ,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland' )
,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland' ) ,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' ) ,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' ) ,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Algemeen' , u'http://feeds.nieuwsblad.be/life/algemeen' ) ,(u'Algemeen' , u'http://feeds.nieuwsblad.be/life/algemeen' )
,(u'Film' , u'http://feeds.nieuwsblad.be/life/film' ) ,(u'Film' , u'http://feeds.nieuwsblad.be/life/film' )
,(u'Boek' , u'http://feeds.nieuwsblad.be/life/boeken' ) ,(u'Boek' , u'http://feeds.nieuwsblad.be/life/boeken' )
,(u'Muziek' , u'http://feeds.nieuwsblad.be/life/muziek' ) ,(u'Muziek' , u'http://feeds.nieuwsblad.be/life/muziek' )
,(u'Podium' , u'http://feeds.nieuwsblad.be/life/podium' ) ,(u'Podium' , u'http://feeds.nieuwsblad.be/life/podium' )
,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv' ) ,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv' )
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID') return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')
def get_article_url(self, article): def get_article_url(self, article):
return article.get('guid', None) return article.get('guid', None)
def preprocess_html(self, soup): def preprocess_html(self, soup):
del soup.body['onload'] del soup.body['onload']
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
for item in soup.findAll('span'): for item in soup.findAll('span'):
item.name='div' item.name='div'
if item.has_key('id') and item['id'] == 'lblArticleTitle': if item.has_key('id') and item['id'] == 'lblArticleTitle':
item.name='h3' item.name='h3'
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return soup return soup

View File

@@ -1,69 +1,69 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>' __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper ''' ''' http://www.derstandard.at - Austrian Newspaper '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe): class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard' title = u'derStandard'
__author__ = 'Gerhard Aigner' __author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich' description = u'Nachrichten aus Österreich'
publisher ='derStandard.at' publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria' category = 'news, politics, nachrichten, Austria'
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
lang = 'de-AT' lang = 'de-AT'
no_stylesheets = True no_stylesheets = True
encoding = 'utf-8' encoding = 'utf-8'
language = 'de' language = 'de'
recursions = 0 recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
(u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'), (u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
(u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'), (u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
(u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'), (u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
(u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'), (u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
(u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'), (u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('?id=', 'txt/?id=') return url.replace('?id=', 'txt/?id=')
def get_article_url(self, article): def get_article_url(self, article):
'''if the article links to a index page (ressort) or a picture gallery '''if the article links to a index page (ressort) or a picture gallery
(ansichtssache), don't add it''' (ansichtssache), don't add it'''
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None return None
return article.link return article.link
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">' mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
return soup return soup

View File

@ -1,72 +1,72 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
elargentino.com elargentino.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class Diagonales(BasicNewsRecipe): class Diagonales(BasicNewsRecipe):
title = 'Diagonales' title = 'Diagonales'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El nuevo diario de La Plata' description = 'El nuevo diario de La Plata'
publisher = 'ElArgentino.com' publisher = 'ElArgentino.com'
category = 'news, politics, Argentina, La Plata' category = 'news, politics, Argentina, La Plata'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf-8' encoding = 'utf-8'
language = 'es' language = 'es'
lang = 'es-AR' lang = 'es-AR'
direction = 'ltr' direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html' INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} ' extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})] keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')] remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')] feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')]
def print_version(self, url): def print_version(self, url):
main, sep, article_part = url.partition('/nota-') main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-') article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang) soup.head.insert(0,mlang)
soup.head.insert(1,mcharset) soup.head.insert(1,mcharset)
return soup return soup
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'}) cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item: if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src']) clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600' cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url return cover_url
def image_url_processor(self, baseurl, url): def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=') base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&') img, sep2, rrest = rest.partition('&')
return base + sep + img return base + sep + img

View File

@@ -1,73 +1,73 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>' __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper ''' ''' http://www.diepresse.at - Austrian Newspaper '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe): class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse' title = u'diePresse'
__author__ = 'Gerhard Aigner' __author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.' description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com' publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria' category = 'news, politics, nachrichten, Austria'
use_embedded_content = False use_embedded_content = False
remove_empty_feeds = True remove_empty_feeds = True
lang = 'de-AT' lang = 'de-AT'
no_stylesheets = True no_stylesheets = True
encoding = 'ISO-8859-1' encoding = 'ISO-8859-1'
language = 'de' language = 'de'
recursions = 0 recursions = 0
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment' , description
, '--category' , category , '--category' , category
, '--publisher', publisher , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''), (re.compile(r'Textversion', re.DOTALL), lambda match: ''),
] ]
remove_tags = [dict(name='hr'), remove_tags = [dict(name='hr'),
dict(name='br'), dict(name='br'),
dict(name='small'), dict(name='small'),
dict(name='img'), dict(name='img'),
dict(name='div', attrs={'class':'textnavi'}), dict(name='div', attrs={'class':'textnavi'}),
dict(name='h1', attrs={'class':'titel'}), dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}), dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})] dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'), feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'), (u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'), (u'Europa', u'http://diepresse.com/rss/EU'),
(u'Panorama', u'http://diepresse.com/rss/Panorama'), (u'Panorama', u'http://diepresse.com/rss/Panorama'),
(u'Sport', u'http://diepresse.com/rss/Sport'), (u'Sport', u'http://diepresse.com/rss/Sport'),
(u'Kultur', u'http://diepresse.com/rss/Kultur'), (u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'), (u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'), (u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Wissenschaft', u'http://diepresse.com/rss/Science'), (u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'), (u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'), (u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'), (u'Recht', u'http://diepresse.com/rss/Recht'),
(u'Spectrum', u'http://diepresse.com/rss/Spectrum'), (u'Spectrum', u'http://diepresse.com/rss/Spectrum'),
(u'Meinung', u'http://diepresse.com/rss/Meinung')] (u'Meinung', u'http://diepresse.com/rss/Meinung')]
def print_version(self, url): def print_version(self, url):
return url.replace('home','text/home') return url.replace('home','text/home')
def preprocess_html(self, soup): def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
return soup return soup

View File

@@ -1,69 +1,69 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevniavaz.ba
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag

class DnevniAvaz(BasicNewsRecipe):
    title = 'Dnevni Avaz'
    __author__ = 'Darko Miletic'
    description = 'Latest news from Bosnia'
    publisher = 'Dnevni Avaz'
    category = 'news, politics, Bosnia and Herzegovina'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_javascript = True
    cover_url = 'http://www.dnevniavaz.ba/img/logo.gif'
    lang = 'bs-BA'
    language = 'bs'
    direction = 'ltr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : lang
                        , 'pretty_print' : True
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    keep_only_tags = [dict(name='div', attrs={'id':['fullarticle-title','fullarticle-leading','fullarticle-date','fullarticle-text','articleauthor']})]
    remove_tags = [dict(name=['object','link','base'])]

    feeds = [
              (u'Najnovije'     , u'http://www.dnevniavaz.ba/rss/novo')
             ,(u'Najpopularnije', u'http://www.dnevniavaz.ba/rss/popularno')
            ]

    def replace_tagname(self, soup, tagname, tagid, newtagname):
        headtag = soup.find(tagname, attrs={'id':tagid})
        if headtag:
            headtag.name = newtagname
        return

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        self.replace_tagname(soup,'div','fullarticle-title'  ,'h1')
        self.replace_tagname(soup,'div','fullarticle-leading','h3')
        self.replace_tagname(soup,'div','fullarticle-date'   ,'h5')
        return self.adeify_images(soup)


@@ -1,75 +1,75 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevnik.hr
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag

class DnevnikCro(BasicNewsRecipe):
    title = 'Dnevnik - Hr'
    __author__ = 'Darko Miletic'
    description = "Vijesti iz Hrvatske"
    publisher = 'Dnevnik.hr'
    category = 'news, politics, Croatia'
    oldest_article = 2
    max_articles_per_feed = 100
    delay = 4
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'hr'
    lang = 'hr-HR'
    direction = 'ltr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : lang
                        , 'pretty_print' : True
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
    remove_tags = [
                    dict(name=['object','link','embed'])
                   ,dict(name='div', attrs={'class':'menu'})
                   ,dict(name='div', attrs={'id':'video'})
                  ]
    remove_tags_after = dict(name='div', attrs={'id':'content'})

    feeds = [(u'Vijesti', u'http://rss.dnevnik.hr/index.rss')]

    def preprocess_html(self, soup):
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        attribs = [ 'style','font','valign'
                   ,'colspan','width','height'
                   ,'rowspan','summary','align'
                   ,'cellspacing','cellpadding'
                   ,'frames','rules','border'
                  ]
        for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
            item.name = 'div'
            for attrib in attribs:
                if item.has_key(attrib):
                    del item[attrib]
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return self.adeify_images(soup)


@@ -1,59 +1,59 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
e-novine.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag

class E_novine(BasicNewsRecipe):
    title = 'E-Novine'
    __author__ = 'Darko Miletic'
    description = 'News from Serbia'
    publisher = 'E-novine'
    category = 'news, politics, Balcans'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1250'
    use_embedded_content = False
    language = 'sr'
    lang = 'sr'
    extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : lang
                        , 'pretty_print' : True
                        }

    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
    keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
    remove_tags = [dict(name=['object','link','embed','iframe'])]

    feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml')]

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        soup.head.insert(0,mlang)
        for item in soup.findAll(style=True):
            del item['style']
        ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
        if ftag:
            it = ftag.div
            it.extract()
            ftag.div.extract()
            ftag.insert(0,it)
        return soup


@@ -1,32 +1,32 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
EcoGeek.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class EcoGeek(BasicNewsRecipe):
    title = 'EcoGeek'
    __author__ = 'Darko Miletic'
    description = 'EcoGeek - Technology for the Environment Blog Feed'
    publisher = 'EcoGeek'
    language = 'en'
    category = 'news, ecology, blog'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    feeds = [(u'Posts', u'http://feeds2.feedburner.com/EcoGeek')]


@@ -1,62 +1,61 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
emol.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElMercurio(BasicNewsRecipe):
    title = 'El Mercurio online'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'
    publisher = 'El Mercurio'
    category = 'news, politics, Chile'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
    remove_javascript = True
    use_embedded_content = False

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    keep_only_tags = [
                       dict(name='div', attrs={'class':'despliegue-txt_750px'})
                      ,dict(name='div', attrs={'id':'div_cuerpo_participa'})
                     ]

    remove_tags = [
                    dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'})
                   ,dict(name='div', attrs={'id':['div_centro_dn_opc','div_cabezera','div_secciones','div_contenidos','div_pie','nav']})
                  ]

    feeds = [
              (u'Noticias de ultima hora', u'http://www.emol.com/rss20/rss.asp?canal=0')
             ,(u'Nacional', u'http://www.emol.com/rss20/rss.asp?canal=1')
             ,(u'Mundo', u'http://www.emol.com/rss20/rss.asp?canal=2')
             ,(u'Deportes', u'http://www.emol.com/rss20/rss.asp?canal=4')
             ,(u'Magazine', u'http://www.emol.com/rss20/rss.asp?canal=6')
             ,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5')
             ,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7')
            ]

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    language = 'es'


@@ -1,66 +1,66 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
eluniversal.com.mx
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElUniversal(BasicNewsRecipe):
    title = 'El Universal'
    __author__ = 'Darko Miletic'
    description = 'News from Mexico'
    oldest_article = 1
    max_articles_per_feed = 100
    publisher = 'El Universal'
    category = 'news, politics, Mexico'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    remove_javascript = True
    language = 'es'

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'

    remove_tags = [dict(name='link')]

    feeds = [
              (u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml')
             ,(u'Mundo'            , u'http://www.eluniversal.com.mx/rss/mundo.xml')
             ,(u'Mexico'           , u'http://www.eluniversal.com.mx/rss/mexico.xml')
             ,(u'Estados'          , u'http://www.eluniversal.com.mx/rss/estados.xml')
             ,(u'Finanzas'         , u'http://www.eluniversal.com.mx/rss/finanzas.xml')
             ,(u'Deportes'         , u'http://www.eluniversal.com.mx/rss/deportes.xml')
             ,(u'Espectaculos'     , u'http://www.eluniversal.com.mx/rss/espectaculos.xml')
             ,(u'Cultura'          , u'http://www.eluniversal.com.mx/rss/cultura.xml')
             ,(u'Ciencia'          , u'http://www.eluniversal.com.mx/rss/ciencia.xml')
             ,(u'Computacion'      , u'http://www.eluniversal.com.mx/rss/computo.xml')
             ,(u'Sociedad'         , u'http://www.eluniversal.com.mx/rss/sociedad.xml')
            ]

    def print_version(self, url):
        return url.replace('/notas/','/notas/vi_')

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Language" content="es-MX"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(font=True):
            del item['font']
        for item in soup.findAll(face=True):
            del item['face']
        for item in soup.findAll(helvetica=True):
            del item['helvetica']
        return soup


@@ -1,62 +1,62 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElArgentino(BasicNewsRecipe):
    title = 'ElArgentino.com'
    __author__ = 'Darko Miletic'
    description = 'Informacion Libre las 24 horas'
    publisher = 'ElArgentino.com'
    category = 'news, politics, Argentina'
    oldest_article = 2
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
    language = 'es'

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    remove_tags = [
                    dict(name='div', attrs={'id':'noprint'})
                   ,dict(name='div', attrs={'class':'encabezadoImprimir'})
                   ,dict(name='a'  , attrs={'target':'_blank'})
                  ]

    feeds = [
              (u'Portada'     , u'http://www.elargentino.com/Highlights.aspx?Content-Type=text/xml&ChannelDesc=Home')
             ,(u'Pais'        , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=112&Content-Type=text/xml&ChannelDesc=Pa%C3%ADs')
             ,(u'Economia'    , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=107&Content-Type=text/xml&ChannelDesc=Econom%C3%ADa')
             ,(u'Mundo'       , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=113&Content-Type=text/xml&ChannelDesc=Mundo')
             ,(u'Tecnologia'  , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=118&Content-Type=text/xml&ChannelDesc=Tecnolog%C3%ADa')
             ,(u'Espectaculos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=114&Content-Type=text/xml&ChannelDesc=Espect%C3%A1culos')
             ,(u'Deportes'    , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=106&Content-Type=text/xml&ChannelDesc=Deportes')
             ,(u'Sociedad'    , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=109&Content-Type=text/xml&ChannelDesc=Sociedad')
             ,(u'Entrevistas' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=115&Content-Type=text/xml&ChannelDesc=Entrevistas')
            ]

    def print_version(self, url):
        main, sep, article_part = url.partition('/nota-')
        article_id, rsep, rrest = article_part.partition('-')
        return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
        soup.head.insert(0,mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup


@@ -1,72 +1,72 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
cronista.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElCronista(BasicNewsRecipe):
    title = 'El Cronista'
    __author__ = 'Darko Miletic'
    description = 'Noticias de Argentina'
    oldest_article = 2
    language = 'es'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , 'news, Argentina'
                        , '--publisher', title
                        ]

    keep_only_tags = [
                       dict(name='table', attrs={'width':'100%'})
                      ,dict(name='h1'   , attrs={'class':'Arialgris16normal'})
                     ]

    remove_tags = [dict(name='a', attrs={'class':'Arialazul12'})]

    feeds = [
              (u'Economia'                , u'http://www.cronista.com/adjuntos/8/rss/Economia_EI.xml')
             ,(u'Negocios'                , u'http://www.cronista.com/adjuntos/8/rss/negocios_EI.xml')
             ,(u'Ultimo momento'          , u'http://www.cronista.com/adjuntos/8/rss/ultimo_momento.xml')
             ,(u'Finanzas y Mercados'     , u'http://www.cronista.com/adjuntos/8/rss/Finanzas_Mercados_EI.xml')
             ,(u'Financial Times'         , u'http://www.cronista.com/adjuntos/8/rss/FT_EI.xml')
             ,(u'Opinion edicion impresa' , u'http://www.cronista.com/adjuntos/8/rss/opinion_edicion_impresa.xml')
             ,(u'Socialmente Responsables', u'http://www.cronista.com/adjuntos/8/rss/Socialmente_Responsables.xml')
             ,(u'Asuntos Legales'         , u'http://www.cronista.com/adjuntos/8/rss/asuntoslegales.xml')
             ,(u'IT Business'             , u'http://www.cronista.com/adjuntos/8/rss/itbusiness.xml')
             ,(u'Management y RR.HH.'     , u'http://www.cronista.com/adjuntos/8/rss/management.xml')
             ,(u'Inversiones Personales'  , u'http://www.cronista.com/adjuntos/8/rss/inversionespersonales.xml')
            ]

    def print_version(self, url):
        main, sep, rest = url.partition('.com/notas/')
        article_id, lsep, rrest = rest.partition('-')
        return 'http://www.cronista.com/interior/index.php?p=imprimir_nota&idNota=' + article_id

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0,mtag)
        soup.head.base.extract()
        htext = soup.find('h1', attrs={'class':'Arialgris16normal'})
        htext.name = 'p'
        soup.prettify()
        return soup

    def get_cover_url(self):
        cover_url = None
        index = 'http://www.cronista.com/contenidos/'
        soup = self.index_to_soup(index + 'ee.html')
        link_item = soup.find('a', attrs={'href':"javascript:Close()"})
        if link_item:
            cover_url = index + link_item.img['src']
        return cover_url


@@ -1,61 +1,60 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElMundo(BasicNewsRecipe):
    title = 'El Mundo'
    __author__ = 'Darko Miletic'
    description = 'News from Spain'
    publisher = 'El Mundo'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'iso8859_15'
    cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif'
    remove_javascript = True

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    keep_only_tags = [
                       dict(name='div', attrs={'id':['bloqueprincipal','noticia']})
                      ,dict(name='div', attrs={'class':['contenido_noticia_01']})
                     ]

    remove_tags = [
                    dict(name='div', attrs={'class':['herramientas','publicidad_google']})
                   ,dict(name='div', attrs={'id':'modulo_multimedia'})
                   ,dict(name='ul' , attrs={'class':'herramientas'})
                   ,dict(name=['object','link'])
                  ]

    feeds = [
              (u'Portada'          , u'http://rss.elmundo.es/rss/descarga.htm?data2=4')
             ,(u'Espana'           , u'http://rss.elmundo.es/rss/descarga.htm?data2=8')
             ,(u'Internacional'    , u'http://rss.elmundo.es/rss/descarga.htm?data2=9')
             ,(u'Cultura'          , u'http://rss.elmundo.es/rss/descarga.htm?data2=6')
             ,(u'Ciencia/Ecologia' , u'http://rss.elmundo.es/rss/descarga.htm?data2=5')
             ,(u'Comunicacion'     , u'http://rss.elmundo.es/rss/descarga.htm?data2=26')
             ,(u'Television'       , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
            ]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    language = 'es'


@@ -1,56 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.cat
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class ElPeriodico_cat(BasicNewsRecipe):
    title = 'El Periodico de Catalunya'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Catalunya'
    publisher = 'elperiodico.cat'
    category = 'news, politics, Spain, Catalunya'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    encoding = 'cp1252'
    language = 'ca'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    feeds = [(u"Tota l'edició", u'http://www.elperiodico.cat/rss.asp?id=46')]

    keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]

    remove_tags = [
                    dict(name=['object','link','script'])
                   ,dict(name='ul' , attrs={'class':'herramientasDeNoticia'})
                   ,dict(name='div', attrs={'id':'inferiores'})
                  ]

    def print_version(self, url):
        return url.replace('/default.asp?','/print.asp?')

    def preprocess_html(self, soup):
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup


@@ -1,56 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class ElPeriodico_esp(BasicNewsRecipe):
    title = 'El Periodico de Catalunya'
    __author__ = 'Darko Miletic'
    description = 'Noticias desde Catalunya'
    publisher = 'elperiodico.com'
    category = 'news, politics, Spain, Catalunya'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    encoding = 'cp1252'
    language = 'es'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    feeds = [(u"Toda la edición", u'http://www.elperiodico.com/rss.asp?id=46')]

    keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]

    remove_tags = [
                    dict(name=['object','link','script'])
                   ,dict(name='ul' , attrs={'class':'herramientasDeNoticia'})
                   ,dict(name='div', attrs={'id':'inferiores'})
                  ]

    def print_version(self, url):
        return url.replace('/default.asp?','/print.asp?')

    def preprocess_html(self, soup):
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup


@@ -1,53 +1,53 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tiempo.hn
'''

from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag

class ElTiempoHn(BasicNewsRecipe):
    title = 'El Tiempo - Honduras'
    __author__ = 'Darko Miletic'
    description = 'Noticias de Honduras y mundo'
    publisher = 'El Tiempo'
    category = 'news, politics, Honduras'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    language = 'es'
    lang = 'es-HN'
    direction = 'ltr'

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} img {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em}"'

    remove_tags = [dict(name=['form','object','embed','base'])]
    keep_only_tags = [dict(name='td', attrs={'id':'mainbodycont'})]

    feeds = [(u'Noticias', u'http://www.tiempo.hn/index.php?format=feed&type=rss')]

    def preprocess_html(self, soup):
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return self.adeify_images(soup)


@@ -1,32 +1,31 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
engadget.com
'''

-import string,re
from calibre.web.feeds.news import BasicNewsRecipe

class Engadget(BasicNewsRecipe):
    title = u'Engadget'
    __author__ = 'Darko Miletic'
    description = 'Tech news'
    language = 'en'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    keep_only_tags = [dict(name='div', attrs={'class':'post'})]

    remove_tags = [
                    dict(name='object')
                   ,dict(name='div', attrs={'class':'postmeta'})
                   ,dict(name='div', attrs={'class':'quigoads'})
                  ]

    feeds = [(u'Posts', u'http://www.engadget.com/rss.xml')]


@@ -1,63 +1,63 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.esquire.com
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Esquire(BasicNewsRecipe):
    title = 'Esquire'
    __author__ = 'Darko Miletic'
    description = 'Esquire: Man at His Best'
    publisher = 'Hearst Communications, Inc.'
    category = 'magazine, men, women we love, style, the guide, sex, screen'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1250'
    use_embedded_content = False
    language = 'en'
    lang = 'en-US'
    cover_url = strftime('http://www.esquire.com/cm/esquire/cover-images/%Y_') + strftime('%m').strip('0') + '.jpg'

    conversion_options = {
                          'comment' : description
                        , 'tags' : category
                        , 'publisher' : publisher
                        , 'language' : lang
                        , 'pretty_print' : True
                        }

    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
    remove_tags = [dict(name=['object','link','embed','iframe'])]

    feeds = [
              (u'Style'    , u'http://www.esquire.com/style/rss/')
             ,(u'Women'    , u'http://www.esquire.com/women/rss/')
             ,(u'Features' , u'http://www.esquire.com/features/rss/')
             ,(u'Fiction'  , u'http://www.esquire.com/fiction/rss/')
             ,(u'Frontpage', u'http://www.esquire.com/rss/')
            ]

    def print_version(self, url):
        rest = url.rpartition('?')[0]
        article = rest.rpartition('/')[2]
        return 'http://www.esquire.com/print-this/' + article

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        soup.head.insert(0,mlang)
        for item in soup.findAll(style=True):
            del item['style']
        return soup


@@ -1,58 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
exiledonline.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Exiled(BasicNewsRecipe):
    title = 'Exiled Online'
    __author__ = 'Darko Miletic'
    description = "Mankind's only alternative since 1997 - Formerly known as The eXile"
    publisher = 'Exiled Online'
    category = 'news, politics, international'
    oldest_article = 15
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    remove_javascript = True
    language = 'en'
    cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'

    html2lrf_options = [
                          '--comment'       , description
                        , '--base-font-size', '10'
                        , '--category'      , category
                        , '--publisher'     , publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    keep_only_tags = [dict(name='div', attrs={'id':'main'})]

    remove_tags = [
                    dict(name=['object','link'])
                   ,dict(name='div', attrs={'class':'info'})
                   ,dict(name='div', attrs={'id':['comments','navig']})
                  ]

    feeds = [(u'Articles', u'http://exiledonline.com/feed/')]

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
        soup.head.insert(0,mtag)
        return soup

    def get_article_url(self, article):
        raw = article.get('link', None)
        final = raw + 'all/1/'
        return final


@@ -1,59 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.expansion.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Expansion(BasicNewsRecipe):
    title = 'Diario Expansion'
    __author__ = 'Darko Miletic'
    description = 'Lider de informacion de mercados, economica y politica'
    publisher = 'expansion.com'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    encoding = 'iso-8859-15'
    language = 'es'
    direction = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    feeds = [
              (u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
             ,(u'Temas del dia'   , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
            ]

    keep_only_tags = [dict(name='div', attrs={'id':'principal'})]

    remove_tags = [
                    dict(name=['object','link','script'])
                   ,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
                  ]

    remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]

    def preprocess_html(self, soup):
        soup.html['dir' ] = self.direction
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup


@@ -1,55 +1,55 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.fastcompany.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class FastCompany(BasicNewsRecipe):
    title = 'Fast Company'
    __author__ = 'Darko Miletic'
    description = 'Where ideas and people meet'
    publisher = 'fastcompany.com'
    category = 'news, technology, gadgets, games'
    oldest_article = 15
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = True
    simultaneous_downloads = 1
    encoding = 'utf-8'
    lang = 'en'
    language = 'en'

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    remove_tags = [dict(name=['embed','object']), dict(name='div', attrs={'class':'feedflare'})]

    feeds = [(u'All News', u'http://feeds.feedburner.com/fastcompany/headlines')]

    def get_article_url(self, article):
        return article.get('guid', None)

    def preprocess_html(self, soup):
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        for item in soup.findAll('a'):
            sp = item['href'].find('http://feedads.g.doubleclick.net/')
            if sp != -1:
                item.extract()
        return self.adeify_images(soup)


@@ -1,51 +1,51 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
'''
Profile to download FAZ.net
'''

from calibre.web.feeds.news import BasicNewsRecipe

class FazNet(BasicNewsRecipe):
    title = 'FAZ NET'
    __author__ = 'Kovid Goyal, Darko Miletic'
    description = 'Frankfurter Allgemeine Zeitung'
    publisher = 'FAZ Electronic Media GmbH'
    category = 'news, politics, Germany'
    use_embedded_content = False
    language = 'de'
    max_articles_per_feed = 30
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True

    html2lrf_options = [
                          '--comment', description
                        , '--category', category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    keep_only_tags = [dict(name='div', attrs={'class':'Article'})]

    remove_tags = [
                    dict(name=['object','link','embed','base'])
                   ,dict(name='div', attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo']})
                  ]

    feeds = [('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml')]

    def print_version(self, url):
        article, sep, rest = url.partition('?')
        return article.replace('.html', '~Afor~Eprint.html')

    def preprocess_html(self, soup):
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        soup.head.insert(0,mtag)
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        return soup

Some files were not shown because too many files have changed in this diff.