Implement a check setup command that uses PyFlakes to check for various errors

Kovid Goyal 2009-09-07 19:03:52 -06:00
parent 792c6b0b22
commit f9ff180347
206 changed files with 12460 additions and 13498 deletions
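The core of the new command is a direct call into the PyFlakes checker API used by the added setup/check.py (pyflakes.ast plus pyflakes.checker). Stripped of calibre's caching and editor integration, it reduces to roughly the sketch below; pyflakes_messages and the command-line driver are illustrative names, not part of the commit, and the builtins keyword assumes the pyflakes build used here (stock pyflakes of this era may not accept it):

import sys
from operator import attrgetter
from pyflakes import checker, ast

def pyflakes_messages(filename, builtins=['_', 'dynamic_property']):
    source = open(filename, 'rb').read()
    tree = ast.parse(source, filename)   # raises on invalid syntax
    w = checker.Checker(tree, filename, builtins=builtins)
    w.messages.sort(key=attrgetter('lineno'))
    return w.messages

if __name__ == '__main__':
    for msg in pyflakes_messages(sys.argv[1]):
        # each message carries lineno, message and message_args
        print '%d: %s' % (msg.lineno, msg.message % msg.message_args)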

View File

@ -1,5 +1,5 @@
*_ui.py
moc_*.cpp
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/manual/.build/

View File

@ -6,7 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, optparse
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
@ -70,7 +69,7 @@ def main(args=sys.argv):
command.clean()
return 0
-    if opts.clean_all():
+    if opts.clean_all:
for cmd in commands.__all__:
prints('Cleaning', cmd)
getattr(commands, cmd).clean()

setup/check.py (new file, 75 lines)
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import sys, os, cPickle, subprocess
from operator import attrgetter

from setup import Command

def check_for_python_errors(filename, builtins):
    from pyflakes import checker, ast
    contents = open(filename, 'rb').read()
    try:
        tree = ast.parse(contents, filename)
    except:
        import traceback
        traceback.print_exc()
        try:
            value = sys.exc_info()[1]
            lineno, offset, line = value[1][1:]
        except IndexError:
            lineno, offset, line = 1, 0, ''
        if line.endswith("\n"):
            line = line[:-1]
        return [SyntaxError(filename, lineno, offset, str(value))]
    else:
        w = checker.Checker(tree, filename, builtins = builtins)
        w.messages.sort(key = attrgetter('lineno'))
        return w.messages

class Check(Command):

    BUILTINS = ['_', '__', 'dynamic_property', 'I', 'P']
    CACHE = '.check-cache.pickle'

    def run(self, opts):
        cache = {}
        if os.path.exists(self.CACHE):
            cache = cPickle.load(open(self.CACHE, 'rb'))
        for x in os.walk(self.j(self.SRC, 'calibre')):
            for f in x[-1]:
                f = self.j(x[0], f)
                mtime = os.stat(f).st_mtime
                if f.endswith('.py') and cache.get(f, 0) != mtime and \
                        self.b(f) not in ('ptempfile.py', 'feedparser.py',
                            'pyparsing.py', 'markdown.py') and 'genshi' not in f and \
                        'prs500/driver.py' not in f:
                    self.info('\tChecking', f)
                    w = check_for_python_errors(f, self.BUILTINS)
                    if w:
                        self.report_errors(w)
                        cPickle.dump(cache, open(self.CACHE, 'wb'), -1)
                        subprocess.call(['gvim', '-f', f])
                        raise SystemExit(1)
                    cache[f] = mtime
        cPickle.dump(cache, open(self.CACHE, 'wb'), -1)

    def report_errors(self, errors):
        for err in errors:
            if isinstance(err, SyntaxError):
                print '\t\tSyntax Error'
            else:
                col = getattr(err, 'col', 0) if getattr(err, 'col', 0) else 0
                lineno = err.lineno if err.lineno else 0
                self.info('\t\t%d:%d:'%(lineno, col),
                        err.message%err.message_args)

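Two implementation notes on the file above. Failures are interactive: on the first file with problems, run() saves the cache, opens the file in gvim, and exits with status 1, so re-running the command (presumably as python setup.py check, given the main() dispatcher shown earlier) resumes with the already-clean files skipped. The skipping itself is a plain mtime-keyed pickle cache; a standalone sketch with hypothetical inputs, not calibre code:

import os, cPickle

CACHE = '.check-cache.pickle'
cache = cPickle.load(open(CACHE, 'rb')) if os.path.exists(CACHE) else {}

for path in ('a.py', 'b.py'):          # illustrative file list
    mtime = os.stat(path).st_mtime
    if cache.get(path, 0) != mtime:    # changed since the last clean run
        # ... run pyflakes on path; on errors dump the cache and abort ...
        cache[path] = mtime            # only clean files are recorded

cPickle.dump(cache, open(CACHE, 'wb'), -1)   # -1: highest pickle protocol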
View File

@ -11,6 +11,7 @@ __all__ = [
'build',
'gui',
'develop',
'check',
]
@ -29,6 +30,8 @@ develop = Develop()
from setup.gui import GUI
gui = GUI()
from setup.check import Check
check = Check()
commands = {}
for x in __all__:

View File

@ -78,9 +78,10 @@ class GUI(Command):
dat = pat.sub(sub, dat)
if form.endswith('viewer%smain.ui'%os.sep):
-            self.inf('\t\tPromoting WebView')
+            self.info('\t\tPromoting WebView')
dat = dat.replace('self.view = QtWebKit.QWebView(', 'self.view = DocumentView(')
dat += '\n\nfrom calibre.gui2.viewer.documentview import DocumentView'
dat += '\nQtWebKit'
open(compiled_form, 'wb').write(dat)

View File

@ -21,6 +21,11 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
filesystem_encoding
import mechanize
if False:
winutil, winutilerror, __appname__, islinux, __version__
fcntl, win32event, isfrozen, __author__, terminal_controller
winerror, win32api
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')
mimetypes.add_type('application/xhtml+xml', '.xhtml')
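The if False: block added above is this commit's idiom for silencing PyFlakes: the listed names are imported for re-export or side effects, and mentioning them inside a branch that never runs counts as a use without executing anything. The same trick appears elsewhere in the commit as bare tuples such as OPF_MAP, HTML_MAP. A self-contained illustration with stdlib names (purely hypothetical, not from the commit):

from os.path import join, split   # imported only for re-export

if False:
    # never executed; exists only so pyflakes sees the names as used
    join, split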

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
- from constants import eStart, eError, eItsMe
+ from constants import eStart
class CodingStateMachine:
def __init__(self, sm):

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
- import constants, sys
+ import constants
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
@ -75,5 +75,5 @@ class EscCharSetProber(CharSetProber):
self._mState = constants.eFoundIt
self._mDetectedCharset = codingSM.get_coding_state_machine()
return self.get_state()

View File

@ -14,19 +14,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
- import constants, sys
+ import constants
from charsetgroupprober import CharSetGroupProber
from sbcharsetprober import SingleByteCharSetProber
from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model

View File

@ -13,19 +13,19 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
- import constants, sys
+ import constants
from constants import eStart, eError, eItsMe
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine

View File

@ -8,11 +8,10 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
- import os, re
- from itertools import count, chain
- from calibre.ebooks.oeb.base import XHTML, XHTML_NS
+ import re
+ from itertools import count
+ from calibre.ebooks.oeb.base import XHTML_NS
from calibre.ebooks.oeb.base import OEBBook
from lxml import etree, html
from lxml.etree import XPath
NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
@ -55,5 +54,5 @@ def add_page_map(opfpath, opts):
id = elem.attrib['id'] = idgen.next()
href = '#'.join((item.href, id))
oeb.pages.add(name, href)
- writer = DirWriter(version='2.0', page_map=True)
+ writer = None#DirWriter(version='2.0', page_map=True)
writer.dump(oeb, opfpath)

View File

@ -6,7 +6,6 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
- import sys
from calibre import plugins
_lzx, _error = plugins['lzx']

View File

@ -7,3 +7,5 @@ Microsoft LIT tag and attribute tables.
from calibre.ebooks.lit.maps.opf import MAP as OPF_MAP
from calibre.ebooks.lit.maps.html import MAP as HTML_MAP
OPF_MAP, HTML_MAP

View File

@ -1,14 +1,14 @@
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
- import sys, os
- from calibre import iswindows
+ import os
from calibre.ptempfile import PersistentTemporaryFile
try:
from PIL import ImageFont
ImageFont
except ImportError:
import ImageFont
'''
Default fonts used in the PRS500
'''
@ -48,11 +48,11 @@ def get_font_path(name):
# then, try calibre shipped ones
try:
try:
font_mod = __import__('calibre.ebooks.lrf.fonts.prs500', {}, {},
[fname], -1)
getattr(font_mod, fname)
except (ImportError, AttributeError):
font_mod = __import__('calibre.ebooks.lrf.fonts.liberation', {}, {},
[LIBERATION_FONT_MAP[name]], -1)
p = PersistentTemporaryFile('.ttf', 'font_')
p.write(getattr(font_mod, fname).font_data)
@ -61,7 +61,7 @@ def get_font_path(name):
return p.name
except ImportError:
pass
# finally, try system default ones
if SYSTEM_FONT_MAP.has_key(name) and os.access(SYSTEM_FONT_MAP[name], os.R_OK):
return SYSTEM_FONT_MAP[name]
@ -71,7 +71,7 @@ def get_font_path(name):
def get_font(name, size, encoding='unic'):
'''
Get an ImageFont object by name.
@param size: Font height in pixels. To convert from pts:
sz in pixels = (dpi/72) * size in pts
@param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'

View File

@ -94,7 +94,7 @@ NAME_MAP = {
u'springgreen': u'#00FF7F',
u'violet': u'#EE82EE',
u'yellowgreen': u'#9ACD32'
}
hex_pat = re.compile('#(\d{2})(\d{2})(\d{2})')
rgb_pat = re.compile('rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)
@ -109,5 +109,5 @@ def lrs_color(html_color):
if hcol in NAME_MAP:
return NAME_MAP[hcol].replace('#', '0x00')
return '0x00000000'

View File

@ -10,13 +10,13 @@ from calibre.ebooks.lrf.lrfparser import LRFDocument
from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.lrf.objects import PageAttr, BlockAttr, TextAttr
from calibre.ebooks.lrf.pylrs.pylrs import TextStyle
class BlockStyle(object):
def __init__(self, ba):
self.ba = ba
def __str__(self):
ans = '.'+str(self.ba.id)+' {\n'
if hasattr(self.ba, 'sidemargin'):
@ -37,10 +37,10 @@ class BlockStyle(object):
ans += '\tbackground-color: %s;\n'%(self.ba.bgcolor.to_html())
#TODO: Fixed size blocks
return ans + '}\n'
class LRFConverter(object):
def __init__(self, document, opts, logger):
self.lrf = document
self.opts = opts
@ -48,15 +48,15 @@ class LRFConverter(object):
self.logger = logger
logger.info('Parsing LRF...')
self.lrf.parse()
self.create_metadata()
self.create_styles()
def create_metadata(self):
self.logger.info('Reading metadata...')
mi = get_metadata(self.lrf)
self.opf = OPFCreator(self.output_dir, mi)
def create_page_styles(self):
self.page_css = ''
for obj in self.lrf.objects.values():
@ -65,21 +65,21 @@ class LRFConverter(object):
self.page_css = selector + ' {\n'
# TODO: Headers and footers
self.page_css += '}\n'
def create_block_styles(self):
self.block_css = ''
for obj in self.lrf.objects.values():
if isinstance(obj, BlockAttr):
self.block_css += str(BlockStyle(obj))
def create_text_styles(self):
self.text_css = ''
for obj in self.lrf.objects.values():
if isinstance(obj, TextAttr):
self.text_css += str(TextStyle(obj))
print self.text_css
def create_styles(self):
self.logger.info('Creating CSS stylesheet...')
self.create_page_styles()
@ -104,9 +104,9 @@ def process_file(lrfpath, opts, logger=None):
raise ConversionError(opts.out + ' is not a directory')
if not os.path.exists(opts.out):
os.makedirs(opts.out)
document = LRFDocument(open(lrfpath, 'rb'))
conv = LRFConverter(document, opts, logger)
def main(args=sys.argv):
@ -116,7 +116,7 @@ def main(args=sys.argv):
parser.print_help()
return 1
process_file(args[1], opts)
return 0

View File

@ -11,23 +11,23 @@ def ceil(num):
return int(math.ceil(num))
def print_xml(elem):
from calibre.ebooks.lrf.pylrs.pylrs import ElementWriter
elem = elem.toElement('utf8')
ew = ElementWriter(elem, sourceEncoding='utf8')
ew.write(sys.stdout)
print
def cattrs(base, extra):
new = base.copy()
new.update(extra)
return new
def tokens(tb):
'''
Return the next token. A token is :
1. A string
a block of text that has the same style
'''
def process_element(x, attrs):
if isinstance(x, CR):
yield 2, None
@ -49,22 +49,22 @@ def tokens(tb):
for y in x.contents:
for z in process_element(y, attrs):
yield z
for i in tb.contents:
if isinstance(i, CR):
yield 1, None
elif isinstance(i, Paragraph):
for j in i.contents:
attrs = {}
if hasattr(j, 'attrs'):
attrs = j.attrs
for k in process_element(j, attrs):
yield k
class Cell(object):
def __init__(self, conv, tag, css):
self.conv = conv
self.tag = tag
@ -89,7 +89,7 @@ class Cell(object):
self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1
except:
pass
pp = conv.current_page
conv.book.allow_new_page = False
conv.current_page = conv.book.create_page()
@ -99,7 +99,7 @@ class Cell(object):
if isinstance(item, TextBlock):
self.text_blocks.append(item)
conv.current_page = pp
conv.book.allow_new_page = True
if not self.text_blocks:
tb = conv.book.create_text_block()
tb.Paragraph(' ')
@ -107,7 +107,7 @@ class Cell(object):
for tb in self.text_blocks:
tb.parent = None
tb.objId = 0
# Needed as we have to eventually change this BlockStyle's width and
# height attributes. This blockstyle may be shared with other
# elements, so doing that causes havoc.
tb.blockStyle = conv.book.create_block_style()
@ -117,17 +117,17 @@ class Cell(object):
if ts.attrs['align'] == 'foot':
if isinstance(tb.contents[-1], Paragraph):
tb.contents[-1].append(' ')
def pts_to_pixels(self, pts):
pts = int(pts)
return ceil((float(self.conv.profile.dpi)/72.)*(pts/10.))
def minimum_width(self):
return max([self.minimum_tb_width(tb) for tb in self.text_blocks])
def minimum_tb_width(self, tb):
ts = tb.textStyle.attrs
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
@ -135,7 +135,7 @@ class Cell(object):
mwidth = 0
for token, attrs in tokens(tb):
font = default_font
if isinstance(token, int): # Handle para and line breaks
continue
if isinstance(token, Plot):
return self.pts_to_pixels(token.xsize)
@ -151,24 +151,24 @@ class Cell(object):
if width > mwidth:
mwidth = width
return parindent + mwidth + 2
def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
ts = tb.textStyle.attrs
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
parindent = self.pts_to_pixels(ts['parindent'])
top, bottom, left, right = 0, 0, parindent, parindent
def add_word(width, height, left, right, top, bottom, ls, ws):
if left + width > maxwidth:
left = width + ws
top += ls
bottom = top+ls if top+ls > bottom else bottom
else:
left += (width + ws)
right = left if left > right else right
bottom = top+ls if top+ls > bottom else bottom
return left, right, top, bottom
for token, attrs in tokens(tb):
if attrs == None:
attrs = {}
@ -196,17 +196,17 @@ class Cell(object):
width, height = font.getsize(word)
left, right, top, bottom = add_word(width, height, left, right, top, bottom, ls, ws)
return right+3+max(parindent, 10), bottom
def text_block_preferred_width(self, tb, debug=False):
return self.text_block_size(tb, sys.maxint, debug=debug)[0]
def preferred_width(self, debug=False):
return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))
def height(self, width):
return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])
class Row(object):
def __init__(self, conv, row, css, colpad):
@ -221,15 +221,15 @@ class Row(object):
name = a['name'] if a.has_key('name') else a['id'] if a.has_key('id') else None
if name is not None:
self.targets.append(name.replace('#', ''))
def number_of_cells(self):
'''Number of cells in this row. Respects colspan'''
ans = 0
for cell in self.cells:
ans += cell.colspan
return ans
def height(self, widths):
i, heights = 0, []
for cell in self.cells:
@ -239,11 +239,11 @@ class Row(object):
if not heights:
return 0
return max(heights)
def cell_from_index(self, col):
i = -1
cell = None
for cell in self.cells:
for k in range(0, cell.colspan):
if i == col:
break
@ -251,30 +251,30 @@ class Row(object):
if i == col:
break
return cell
def minimum_width(self, col):
cell = self.cell_from_index(col)
if not cell:
return 0
return cell.minimum_width()
def preferred_width(self, col):
cell = self.cell_from_index(col)
if not cell:
return 0
return 0 if cell.colspan > 1 else cell.preferred_width()
def width_percent(self, col):
cell = self.cell_from_index(col)
if not cell:
return -1
return -1 if cell.colspan > 1 else cell.pwidth
def cell_iterator(self):
for c in self.cells:
yield c
class Table(object):
def __init__(self, conv, table, css, rowpad=10, colpad=10):
self.rows = []
@ -283,31 +283,31 @@ class Table(object):
self.colpad = colpad
rows = table.findAll('tr')
conv.in_table = True
for row in rows:
rcss = conv.tag_css(row, css)[0]
self.rows.append(Row(conv, row, rcss, colpad))
conv.in_table = False
def number_of_columns(self):
max = 0
for row in self.rows:
max = row.number_of_cells() if row.number_of_cells() > max else max
return max
def number_or_rows(self):
return len(self.rows)
def height(self, maxwidth):
''' Return row heights + self.rowpad'''
widths = self.get_widths(maxwidth)
return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad
def minimum_width(self, col):
return max([row.minimum_width(col) for row in self.rows])
def width_percent(self, col):
return max([row.width_percent(col) for row in self.rows])
def get_widths(self, maxwidth):
'''
Return widths of columns + self.colpad
@ -320,29 +320,29 @@ class Table(object):
try:
cellwidths[r] = self.rows[r].preferred_width(c)
except IndexError:
continue
widths[c] = max(cellwidths)
min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
for i in xrange(len(widths)):
wp = self.width_percent(i)
if wp >= 0.:
widths[i] = max(min_widths[i], ceil((wp/100.) * (maxwidth - (cols-1)*self.colpad)))
itercount = 0
while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
for i in range(cols):
widths[i] = ceil((95./100.)*widths[i]) if \
ceil((95./100.)*widths[i]) >= min_widths[i] else widths[i]
itercount += 1
return [i+self.colpad for i in widths]
def blocks(self, maxwidth, maxheight):
rows, cols = self.number_or_rows(), self.number_of_columns()
cellmatrix = [[None for c in range(cols)] for r in range(rows)]
rowpos = [0 for i in range(rows)]
for r in range(rows):
nc = self.rows[r].cell_iterator()
@ -358,14 +358,14 @@ class Table(object):
break
except StopIteration: # No more cells in this row
continue
widths = self.get_widths(maxwidth)
heights = [row.height(widths) for row in self.rows]
xpos = [sum(widths[:i]) for i in range(cols)]
delta = maxwidth - sum(widths)
if delta < 0:
delta = 0
for r in range(len(cellmatrix)):
yield None, 0, heights[r], 0, self.rows[r].targets
@ -377,13 +377,13 @@ class Table(object):
sypos = 0
for tb in cell.text_blocks:
tb.blockStyle = self.conv.book.create_block_style(
blockwidth=width,
blockheight=cell.text_block_size(tb, width)[1],
blockrule='horz-fixed')
yield tb, xpos[c], sypos, delta, None
sypos += tb.blockStyle.attrs['blockheight']

View File

@ -1,81 +1,81 @@
""" elements.py -- replacements and helpers for ElementTree """
class ElementWriter(object):
def __init__(self, e, header=False, sourceEncoding="ascii",
spaceBeforeClose=True, outputEncodingName="UTF-16"):
self.header = header
self.e = e
self.sourceEncoding=sourceEncoding
self.spaceBeforeClose = spaceBeforeClose
self.outputEncodingName = outputEncodingName
def _encodeCdata(self, rawText):
if type(rawText) is str:
rawText = rawText.decode(self.sourceEncoding)
text = rawText.replace("&", "&amp;")
text = text.replace("<", "&lt;")
text = text.replace(">", "&gt;")
return text
def _writeAttribute(self, f, name, value):
f.write(u' %s="' % unicode(name))
if not isinstance(value, basestring):
value = unicode(value)
value = self._encodeCdata(value)
value = value.replace('"', '&quot;')
f.write(value)
f.write(u'"')
def _writeText(self, f, rawText):
text = self._encodeCdata(rawText)
f.write(text)
def _write(self, f, e):
f.write(u'<' + unicode(e.tag))
attributes = e.items()
attributes.sort()
for name, value in attributes:
self._writeAttribute(f, name, value)
if e.text is not None or len(e) > 0:
f.write(u'>')
if e.text:
self._writeText(f, e.text)
for e2 in e:
self._write(f, e2)
f.write(u'</%s>' % e.tag)
else:
if self.spaceBeforeClose:
f.write(' ')
f.write(u'/>')
if e.tail is not None:
self._writeText(f, e.tail)
def toString(self):
class x:
pass
buffer = []
x.write = buffer.append
self.write(x)
return u''.join(buffer)
def write(self, f):
if self.header:
f.write(u'<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)
self._write(f, self.e)
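The class above serializes standard ElementTree-style elements; for instance (a hypothetical snippet, not part of the commit):

import sys
from xml.etree.ElementTree import Element

e = Element('p', {'class': 'intro'})
e.text = 'hello'
# toString() collects the unicode output of write() into one string
print ElementWriter(e).toString()                # -> <p class="intro">hello</p>
ElementWriter(e, header=True).write(sys.stdout)  # prepends the XML declaration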
""" elements.py -- replacements and helpers for ElementTree """
class ElementWriter(object):
def __init__(self, e, header=False, sourceEncoding="ascii",
spaceBeforeClose=True, outputEncodingName="UTF-16"):
self.header = header
self.e = e
self.sourceEncoding=sourceEncoding
self.spaceBeforeClose = spaceBeforeClose
self.outputEncodingName = outputEncodingName
def _encodeCdata(self, rawText):
if type(rawText) is str:
rawText = rawText.decode(self.sourceEncoding)
text = rawText.replace("&", "&amp;")
text = text.replace("<", "&lt;")
text = text.replace(">", "&gt;")
return text
def _writeAttribute(self, f, name, value):
f.write(u' %s="' % unicode(name))
if not isinstance(value, basestring):
value = unicode(value)
value = self._encodeCdata(value)
value = value.replace('"', '&quot;')
f.write(value)
f.write(u'"')
def _writeText(self, f, rawText):
text = self._encodeCdata(rawText)
f.write(text)
def _write(self, f, e):
f.write(u'<' + unicode(e.tag))
attributes = e.items()
attributes.sort()
for name, value in attributes:
self._writeAttribute(f, name, value)
if e.text is not None or len(e) > 0:
f.write(u'>')
if e.text:
self._writeText(f, e.text)
for e2 in e:
self._write(f, e2)
f.write(u'</%s>' % e.tag)
else:
if self.spaceBeforeClose:
f.write(' ')
f.write(u'/>')
if e.tail is not None:
self._writeText(f, e.tail)
def toString(self):
class x:
pass
buffer = []
x.write = buffer.append
self.write(x)
return u''.join(buffer)
def write(self, f):
if self.header:
f.write(u'<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)
self._write(f, self.e)

File diff suppressed because it is too large

View File

@ -1,43 +1,43 @@
def _optimize(tagList, tagName, conversion):
# copy the tag of interest plus any text
newTagList = []
for tag in tagList:
if tag.name == tagName or tag.name == "rawtext":
newTagList.append(tag)
# now, eliminate any duplicates (leaving the last one)
for i, newTag in enumerate(newTagList[:-1]):
if newTag.name == tagName and newTagList[i+1].name == tagName:
tagList.remove(newTag)
# eliminate redundant settings to same value across text strings
newTagList = []
for tag in tagList:
if tag.name == tagName:
newTagList.append(tag)
for i, newTag in enumerate(newTagList[:-1]):
value = conversion(newTag.parameter)
nextValue = conversion(newTagList[i+1].parameter)
if value == nextValue:
tagList.remove(newTagList[i+1])
# eliminate any setting that don't have text after them
while len(tagList) > 0 and tagList[-1].name == tagName:
del tagList[-1]
def tagListOptimizer(tagList):
# this function eliminates redundant or unnecessary tags
# it scans a list of tags, looking for text settings that are
# changed before any text is output
# for example,
# fontsize=100, fontsize=200, text, fontsize=100, fontsize=200
# should be:
# fontsize=200 text
oldSize = len(tagList)
_optimize(tagList, "fontsize", int)
_optimize(tagList, "fontweight", int)
return oldSize - len(tagList)
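To make the comments above concrete, here is what the optimizer does to the run they describe (a hypothetical sketch; the real LRF Tag objects just need name and parameter attributes):

class FakeTag(object):
    # stand-in for pylrs Tag objects, which carry .name and .parameter
    def __init__(self, name, parameter=None):
        self.name, self.parameter = name, parameter

tags = [FakeTag('fontsize', '100'), FakeTag('fontsize', '200'),
        FakeTag('rawtext'), FakeTag('fontsize', '100'),
        FakeTag('fontsize', '200')]
print tagListOptimizer(tags)   # -> 3 tags eliminated
# tags is now [fontsize=200, rawtext]: only the last setting before the
# text survives, and trailing settings with no text after them are dropped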

File diff suppressed because it is too large

View File

@ -2,4 +2,6 @@
# Initialize extensions
from calibre.ebooks.markdown import mdx_footnotes
from calibre.ebooks.markdown import mdx_tables
from calibre.ebooks.markdown import mdx_toc
mdx_footnotes, mdx_tables, mdx_toc

View File

@ -8,8 +8,6 @@ My markdown extensions for adding:
Table of Contents (aka toc)
"""
- import os
- import sys
import re
import markdown
@ -18,7 +16,7 @@ DEFAULT_TITLE = None
def extract_alphanumeric(in_str=None):
"""take alpha-numeric (7bit ascii) and return as a string
"""
# I'm sure this is really inefficient and
# could be done with a lambda/map()
#x.strip().title().replace(' ', "")
out_str=[]
@ -42,7 +40,7 @@ class TocExtension (markdown.Extension):
toc is returned in a div tag with class='toc'
toc is either:
appended to end of document
OR
replaces first string occurence of "///Table of Contents Goes Here///"
"""
@ -75,7 +73,7 @@ class TocExtension (markdown.Extension):
"""
Creates Table Of Contents based on headers.
@returns: toc as a single as a dom element
in a <div> tag with class='toc'
"""
@ -85,9 +83,9 @@ class TocExtension (markdown.Extension):
if element.type=='element':
if headers_compiled_re.match(element.nodeName):
return True
headers_doc_list = doc.find(findHeadersFn)
# Insert anchor tags into dom
generated_anchor_id=0
headers_list=[]
@ -99,19 +97,19 @@ class TocExtension (markdown.Extension):
if heading_type == self.auto_toc_heading_type:
min_header_size_found=min(min_header_size_found,
heading_type)
html_anchor_name= (extract_alphanumeric(heading_title)
+'__MD_autoTOC_%d' % (generated_anchor_id))
# insert anchor tag inside header tags
html_anchor = doc.createElement("a")
html_anchor.setAttribute('name', html_anchor_name)
element.appendChild(html_anchor)
headers_list.append( (heading_type, heading_title,
html_anchor_name) )
generated_anchor_id = generated_anchor_id + 1
# create dom for toc
if headers_list != []:
# Create list
@ -125,9 +123,9 @@ class TocExtension (markdown.Extension):
toc_doc_link.appendChild(toc_doc_text)
toc_doc_entry.appendChild(toc_doc_link)
toc_doc_list.appendChild(toc_doc_entry)
# Put list into div
div = doc.createElement("div")
div.setAttribute('class', 'toc')
if self.TOC_TITLE:
@ -149,7 +147,7 @@ class TocPostprocessor (markdown.Postprocessor):
def run(self, doc):
tocPlaceholder = self.toc.findTocPlaceholder(doc)
tocDiv = self.toc.createTocDiv(doc)
if tocDiv:
if tocPlaceholder :

View File

@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Ashish Kulkarni <kulkarni.ashish@gmail.com>'
'''Read meta information from IMP files'''
- import sys, os
+ import sys
from calibre.ebooks.metadata import MetaInformation, string_to_authors
@ -17,7 +17,7 @@ def get_metadata(stream):
if stream.read(10) not in MAGIC:
print >>sys.stderr, u'Couldn\'t read IMP header from file'
return mi
def cString(skip=0):
result = ''
while 1:
@ -30,7 +30,7 @@ def get_metadata(stream):
stream.read(38) # skip past some uninteresting headers
_, category, title, author = cString(), cString(), cString(1), cString(2)
if title:
mi.title = title
if author:

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
Read metadata from LRX files
'''
- import sys, struct
+ import struct
from zlib import decompress
from lxml import etree
@ -33,7 +33,7 @@ def short_be(buf):
def get_metadata(f):
read = lambda at, amount: _read(f, at, amount)
f.seek(0)
buf = f.read(12)
if buf[4:] == 'ftypLRX2':
offset = 0
while True:
@ -74,9 +74,9 @@ def get_metadata(f):
mi.tags = [x.text for x in bi.findall('Category')]
mi.language = root.find('DocInfo').find('Language').text
return mi
elif buf[4:8] == 'LRX':
raise ValueError('Librie LRX format not supported')
else:
raise ValueError('Not a LRX file')

View File

@ -17,7 +17,7 @@
#
# Contributor(s):
#
- import zipfile, sys, re
+ import zipfile, re
import xml.sax.saxutils
from cStringIO import StringIO
@ -46,7 +46,7 @@ fields = {
}
def normalize(str):
"""
"""
The normalize-space function returns the argument string with whitespace
normalized by stripping leading and trailing whitespace and replacing
sequences of whitespace characters by a single space.
@ -125,7 +125,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
else:
texttag = self._tag
self.seenfields[texttag] = self.data()
if field in self.deletefields:
self.output.dowrite = True
else:
@ -140,7 +140,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
def data(self):
return normalize(''.join(self._data))
def get_metadata(stream):
zin = zipfile.ZipFile(stream, 'r')
odfs = odfmetaparser()
@ -161,6 +161,6 @@ def get_metadata(stream):
mi.language = data['language']
if data.get('keywords', ''):
mi.tags = data['keywords'].split(',')
return mi

View File

@ -3,8 +3,8 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from zipfile import ZipFile
from cStringIO import StringIO
def get_metadata(stream):
@ -20,5 +20,5 @@ def get_metadata(stream):
stream = StringIO(zf.read(f))
return get_metadata(stream, stream_type)
raise ValueError('No ebook found in ZIP archive')

View File

@ -3,7 +3,6 @@
'''
Writer content to palmdoc pdb file.
'''
- import os
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'

View File

@ -4,7 +4,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
- import os
class zTXTError(Exception):
pass

View File

@ -12,8 +12,6 @@ Decrypt content of PDF.
import os, sys
from optparse import OptionGroup, Option
- from calibre.ebooks.metadata.meta import metadata_from_formats
- from calibre.ebooks.metadata import authors_to_string
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
@ -36,8 +34,8 @@ OPTIONS = set([
class DecryptionError(Exception):
def __init__(self, pdf_path):
- self.value = 'Unable to decrypt file `%s`.' % value
+ self.value = 'Unable to decrypt file `%s`.' % pdf_path
def __str__(self):
return repr(self.value)
@ -62,20 +60,20 @@ def add_options(parser):
group = OptionGroup(parser, _('Decrypt Options:'), _('Options to control the transformation of pdf'))
parser.add_option_group(group)
add_option = group.add_option
for rec in OPTIONS:
option_recommendation_to_cli_option(add_option, rec)
def decrypt(pdf_path, out_path, password):
pdf = PdfFileReader(open(os.path.abspath(pdf_path), 'rb'))
if pdf.decrypt(str(password)) == 0:
raise DecryptionError(pdf_path)
title = pdf.documentInfo.title if pdf.documentInfo.title else _('Unknown')
author = pdf.documentInfo.author if pdf.documentInfo.author else _('Unknown')
out_pdf = PdfFileWriter(title=title, author=author)
for page in pdf.pages:
out_pdf.addPage(page)
@ -86,23 +84,23 @@ def main(args=sys.argv, name=''):
log = Log()
parser = option_parser(name)
add_options(parser)
opts, args = parser.parse_args(args)
args = args[1:]
if len(args) < 2:
print 'Error: A PDF file and decryption password is required.\n'
print_help(parser, log)
return 1
if not is_valid_pdf(args[0]):
print 'Error: Could not read file `%s`.' % args[0]
return 1
if not is_encrypted(args[0]):
print 'Error: file `%s` is not encrypted.' % args[0]
return 1
try:
decrypt(args[0], opts.output, args[1])
except DecryptionError, e:

View File

@ -17,6 +17,8 @@ from calibre.utils.logging import Log
from calibre.constants import preferred_encoding
from calibre.customize.conversion import OptionRecommendation
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
+ from calibre.ebooks.metadata import authors_to_string
+ from calibre.ebooks.metadata.meta import metadata_from_formats
from pyPdf import PdfFileWriter, PdfFileReader
@ -52,7 +54,7 @@ def add_options(parser):
group = OptionGroup(parser, _('Encrypt Options:'), _('Options to control the transformation of pdf'))
parser.add_option_group(group)
add_option = group.add_option
for rec in OPTIONS:
option_recommendation_to_cli_option(add_option, rec)
@ -78,23 +80,23 @@ def main(args=sys.argv, name=''):
log = Log()
parser = option_parser(name)
add_options(parser)
opts, args = parser.parse_args(args)
args = args[1:]
if len(args) < 2:
print 'Error: A PDF file and decryption password is required.\n'
print_help(parser, log)
return 1
if not is_valid_pdf(args[0]):
print 'Error: Could not read file `%s`.' % args[0]
return 1
if is_encrypted(args[0]):
print 'Error: file `%s` is already encrypted.' % args[0]
return 1
mi = metadata_from_formats([args[0]])
encrypt(args[0], opts.output, args[1], mi)

View File

@ -11,25 +11,25 @@ Verify PDF files.
import os
- from pyPdf import PdfFileWriter, PdfFileReader
+ from pyPdf import PdfFileReader
def is_valid_pdf(pdf_path):
'''
Returns True if the pdf file is valid.
'''
try:
with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
pdf = PdfFileReader(pdf_file)
except:
return False
return True
def is_valid_pdfs(pdf_paths):
'''
Returns a list of invalid pdf files.
'''
invalid = []
for pdf_path in pdf_paths:
if not is_valid_pdf(pdf_path):

View File

@ -4,7 +4,6 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
- import os
import struct
import zlib

View File

@ -15,7 +15,7 @@
# #
# #
#########################################################################
- import sys, os, shutil
+ import os, shutil
class Copy:
"""Copy each changed file to a directory for debugging purposes"""
@ -66,6 +66,6 @@ class Copy:
"""
write_file = os.path.join(Copy.__dir,new_file)
shutil.copyfile(file, write_file)
def rename(self, source, dest):
shutil.copyfile(source, dest)

View File

@ -1,5 +1,4 @@
- import sys
from calibre.ebooks import rtf2xml
class ParseOptions:
"""
Requires:

View File

@ -16,7 +16,6 @@
# #
#########################################################################
import sys, os, codecs
from calibre.ebooks import rtf2xml
class Output:
"""
Output file

View File

@ -15,8 +15,6 @@
# #
# #
#########################################################################
- import sys,os
- from calibre.ebooks import rtf2xml
class OverrideTable:
"""
Parse a line of text to make the override table. Return a string

View File

@ -7,21 +7,19 @@ from calibre.gui2 import file_icon_provider
from calibre.gui2.dialogs.choose_format_ui import Ui_ChooseFormatDialog
class ChooseFormatDialog(QDialog, Ui_ChooseFormatDialog):
def __init__(self, window, msg, formats):
QDialog.__init__(self, window)
Ui_ChooseFormatDialog.__init__(self)
self.setupUi(self)
self.connect(self.formats, SIGNAL('activated(QModelIndex)'), lambda i: self.accept())
self.msg.setText(msg)
for format in formats:
self.formats.addItem(QListWidgetItem(file_icon_provider().icon_from_ext(format.lower()),
format.upper()))
self._formats = formats
self.formats.setCurrentRow(0)
def format(self):
return self._formats[self.formats.currentRow()]

View File

@ -5,7 +5,7 @@ from PyQt4.QtGui import QDialog
from calibre.gui2.dialogs.conversion_error_ui import Ui_ConversionErrorDialog
class ConversionErrorDialog(QDialog, Ui_ConversionErrorDialog):
def __init__(self, window, title, html, show=False):
QDialog.__init__(self, window)
Ui_ConversionErrorDialog.__init__(self)
@ -14,7 +14,7 @@ class ConversionErrorDialog(QDialog, Ui_ConversionErrorDialog):
self.set_message(html)
if show:
self.show()
def set_message(self, html):
self.text.setHtml('<html><body>%s</body></html'%(html,))

View File

@ -5,20 +5,20 @@ from PyQt4.QtGui import QGraphicsView
from PyQt4.QtCore import QSize
class BookView(QGraphicsView):
MINIMUM_SIZE = QSize(400, 500)
def __init__(self, *args):
QGraphicsView.__init__(self, *args)
self.preferred_size = self.MINIMUM_SIZE
def minimumSizeHint(self):
return self.MINIMUM_SIZE
def sizeHint(self):
return self.preferred_size
def resize_for(self, width, height):
self.preferred_size = QSize(width, height)

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os, math, re
from PyQt4.Qt import QWidget, QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
- QByteArray, QColor, QWheelEvent, QPoint, QImage, QRegion, \
+ QByteArray, QColor, QPoint, QImage, QRegion, \
QFont, QObject, QApplication, pyqtSignature
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings

View File

@ -4,17 +4,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
- import os, sys, traceback, urlparse
+ import os, sys, urlparse
from BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.oeb.iterator import EbookIterator
from calibre.ptempfile import TemporaryDirectory
- from PyQt4 import QtCore
from PyQt4.Qt import QUrl, QEventLoop, SIGNAL, QObject, QApplication, Qt, \
QPrinter, QPrintPreviewDialog, QPrintDialog, QDialog, QMetaObject, Q_ARG
+ from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebView
PRINTCSS = 'body{width:100%;margin:0;padding:0;font-family:Arial;color:#000;background:none;font-size:12pt;text-align:left;}h1,h2,h3,h4,h5,h6{font-family:Helvetica;}h1{font-size:19pt;}h2{font-size:17pt;}h3{font-size:15pt;}h4,h5,h6{font-size:12pt;}pre,code,samp{font:10ptCourier,monospace;white-space:pre-wrap;page-break-inside:avoid;}blockquote{margin:1.3em;padding:1em;font-size:10pt;}hr{background-color:#ccc;}aimg{border:none;}a:link,a:visited{background:transparent;font-weight:700;text-decoration:underline;color:#333;}a:link:after,a{color:#000;}table{margin:1px;text-align:left;}th{border-bottom:1pxsolid#333;font-weight:bold;}td{border-bottom:1pxsolid#333;}th,td{padding:4px10px4px0;}tfoot{font-style:italic;}caption{background:#fff;margin-bottom:2em;text-align:left;}thead{display:table-header-group;}tr{page-break-inside:avoid;}#header,.header,#footer,.footer,#navbar,.navbar,#navigation,.navigation,#rightSideBar,.rightSideBar,#leftSideBar,.leftSideBar{display:none;}'
@ -31,18 +28,18 @@ class Printing(QObject):
self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_preview)
else:
self.connect(self.view, SIGNAL('loadFinished(bool)'), self.print_book)
self.process_content(spine)
def process_content(self, spine):
content = ''
for path in spine:
raw = self.raw_content(path)
content += self.parsed_content(raw, path)
refined_content = self.refine_content(content)
base = os.path.splitdrive(spine[0])[0]
base = base if base != '' else '/'
@ -52,7 +49,7 @@ class Printing(QObject):
@QtCore.pyqtSignature('load_content(QString, QString)')
def load_content(self, content, base):
self.view.setHtml(content, QUrl(base))
def raw_content(self, path):
return open(path, 'rb').read().decode(path.encoding)
@ -64,11 +61,11 @@ class Printing(QObject):
styles = dom_tree.findAll('style')
for s in styles:
s.extract()
scripts = dom_tree.findAll('script')
for s in scripts:
s.extract()
# Convert all relative links to absolute paths.
links = dom_tree.findAll(src=True)
for s in links:
@ -85,40 +82,40 @@ class Printing(QObject):
# Adds the print css.
def refine_content(self, content):
dom_tree = BeautifulSoup('<html><head></head><body>%s</body></html>' % content)
css = dom_tree.findAll('link')
for c in css:
c.extract()
print_css = Tag(BeautifulSoup(), 'style', [('type', 'text/css'), ('title', 'override_css')])
print_css.insert(0, PRINTCSS)
dom_tree.findAll('head')[0].insert(0, print_css)
return unicode(dom_tree)
def print_preview(self, ok):
printer = QPrinter(QPrinter.HighResolution)
printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch)
previewDialog = QPrintPreviewDialog(printer)
self.connect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_)
previewDialog.exec_()
self.disconnect(previewDialog, SIGNAL('paintRequested(QPrinter *)'), self.view.print_)
self.loop.quit()
def print_book(self, ok):
printer = QPrinter(QPrinter.HighResolution)
printer.setPageMargins(1, 1, 1, 1, QPrinter.Inch)
printDialog = QPrintDialog(printer)
printDialog.setWindowTitle(_("Print eBook"))
printDialog.exec_()
if printDialog.result() == QDialog.Accepted:
self.view.print_(printer)
self.loop.quit()
def main():

View File

@ -18,7 +18,7 @@ sys.path.append(os.path.abspath('../../../'))
sys.path.append(os.path.abspath('.'))
from calibre import __appname__, __version__
import custom
custom
# General configuration
# ---------------------

View File

@ -1,970 +0,0 @@
""" path.py - An object representing a path to a file or directory.
Example:
from path import path
d = path('/home/guido/bin')
for f in d.files('*.py'):
f.chmod(0755)
This module requires Python 2.2 or later.
URL: http://www.jorendorff.com/articles/python/path
Author: Jason Orendorff <jason.orendorff\x40gmail\x2ecom> (and others - see the url!)
Date: 9 Mar 2007
"""
# TODO
# - Tree-walking functions don't avoid symlink loops. Matt Harrison
# sent me a patch for this.
# - Bug in write_text(). It doesn't support Universal newline mode.
# - Better error message in listdir() when self isn't a
# directory. (On Windows, the error message really sucks.)
# - Make sure everything has a good docstring.
# - Add methods for regex find and replace.
# - guess_content_type() method?
# - Perhaps support arguments to touch().
from __future__ import generators
import sys, warnings, os, fnmatch, glob, shutil, codecs, hashlib
__version__ = '2.2'
__all__ = ['path']
# Platform-specific support for path.owner
if os.name == 'nt':
try:
import win32security
except ImportError:
win32security = None
else:
try:
import pwd
except ImportError:
pwd = None
# Pre-2.3 support. Are unicode filenames supported?
_base = str
_getcwd = os.getcwd
try:
if os.path.supports_unicode_filenames:
_base = unicode
_getcwd = os.getcwdu
except AttributeError:
pass
# Pre-2.3 workaround for booleans
try:
True, False
except NameError:
True, False = 1, 0
# Pre-2.3 workaround for basestring.
try:
basestring
except NameError:
basestring = (str, unicode)
# Universal newline support
_textmode = 'r'
if hasattr(file, 'newlines'):
_textmode = 'U'
class TreeWalkWarning(Warning):
pass
class path(_base):
""" Represents a filesystem path.
For documentation on individual methods, consult their
counterparts in os.path.
"""
# --- Special Python methods.
def __repr__(self):
return 'path(%s)' % _base.__repr__(self)
# Adding a path and a string yields a path.
def __add__(self, more):
try:
resultStr = _base.__add__(self, more)
except TypeError: #Python bug
resultStr = NotImplemented
if resultStr is NotImplemented:
return resultStr
return self.__class__(resultStr)
def __radd__(self, other):
if isinstance(other, basestring):
return self.__class__(other.__add__(self))
else:
return NotImplemented
# The / operator joins paths.
def __div__(self, rel):
""" fp.__div__(rel) == fp / rel == fp.joinpath(rel)
Join two path components, adding a separator character if
needed.
"""
return self.__class__(os.path.join(self, rel))
# Make the / operator work even when true division is enabled.
__truediv__ = __div__
def getcwd(cls):
""" Return the current working directory as a path object. """
return cls(_getcwd())
getcwd = classmethod(getcwd)
# --- Operations on path strings.
isabs = os.path.isabs
def abspath(self): return self.__class__(os.path.abspath(self))
def normcase(self): return self.__class__(os.path.normcase(self))
def normpath(self): return self.__class__(os.path.normpath(self))
def realpath(self): return self.__class__(os.path.realpath(self))
def expanduser(self): return self.__class__(os.path.expanduser(self))
def expandvars(self): return self.__class__(os.path.expandvars(self))
def dirname(self): return self.__class__(os.path.dirname(self))
basename = os.path.basename
def expand(self):
""" Clean up a filename by calling expandvars(),
expanduser(), and normpath() on it.
This is commonly everything needed to clean up a filename
read from a configuration file, for example.
"""
return self.expandvars().expanduser().normpath()
def _get_namebase(self):
base, ext = os.path.splitext(self.name)
return base
def _get_ext(self):
f, ext = os.path.splitext(_base(self))
return ext
def _get_drive(self):
drive, r = os.path.splitdrive(self)
return self.__class__(drive)
parent = property(
dirname, None, None,
""" This path's parent directory, as a new path object.
For example, path('/usr/local/lib/libpython.so').parent == path('/usr/local/lib')
""")
name = property(
basename, None, None,
""" The name of this file or directory without the full path.
For example, path('/usr/local/lib/libpython.so').name == 'libpython.so'
""")
namebase = property(
_get_namebase, None, None,
""" The same as path.name, but with one file extension stripped off.
For example, path('/home/guido/python.tar.gz').name == 'python.tar.gz',
but path('/home/guido/python.tar.gz').namebase == 'python.tar'
""")
ext = property(
_get_ext, None, None,
""" The file extension, for example '.py'. """)
drive = property(
_get_drive, None, None,
""" The drive specifier, for example 'C:'.
This is always empty on systems that don't use drive specifiers.
""")
def splitpath(self):
""" p.splitpath() -> Return (p.parent, p.name). """
parent, child = os.path.split(self)
return self.__class__(parent), child
def splitdrive(self):
""" p.splitdrive() -> Return (p.drive, <the rest of p>).
Split the drive specifier from this path. If there is
no drive specifier, p.drive is empty, so the return value
is simply (path(''), p). This is always the case on Unix.
"""
drive, rel = os.path.splitdrive(self)
return self.__class__(drive), rel
def splitext(self):
""" p.splitext() -> Return (p.stripext(), p.ext).
Split the filename extension from this path and return
the two parts. Either part may be empty.
The extension is everything from '.' to the end of the
last path segment. This has the property that if
(a, b) == p.splitext(), then a + b == p.
"""
filename, ext = os.path.splitext(self)
return self.__class__(filename), ext
def stripext(self):
""" p.stripext() -> Remove one file extension from the path.
For example, path('/home/guido/python.tar.gz').stripext()
returns path('/home/guido/python.tar').
"""
return self.splitext()[0]
if hasattr(os.path, 'splitunc'):
def splitunc(self):
unc, rest = os.path.splitunc(self)
return self.__class__(unc), rest
def _get_uncshare(self):
unc, r = os.path.splitunc(self)
return self.__class__(unc)
uncshare = property(
_get_uncshare, None, None,
""" The UNC mount point for this path.
This is empty for paths on local drives. """)
def joinpath(self, *args):
""" Join two or more path components, adding a separator
character (os.sep) if needed. Returns a new path
object.
"""
return self.__class__(os.path.join(self, *args))
def splitall(self):
r""" Return a list of the path components in this path.
The first item in the list will be a path. Its value will be
either os.curdir, os.pardir, empty, or the root directory of
this path (for example, '/' or 'C:\\'). The other items in
the list will be strings.
path.path.joinpath(*result) will yield the original path.
"""
parts = []
loc = self
while loc != os.curdir and loc != os.pardir:
prev = loc
loc, child = prev.splitpath()
if loc == prev:
break
parts.append(child)
parts.append(loc)
parts.reverse()
return parts
def relpath(self):
""" Return this path as a relative path,
based from the current working directory.
"""
cwd = self.__class__(os.getcwd())
return cwd.relpathto(self)
def relpathto(self, dest):
""" Return a relative path from self to dest.
If there is no relative path from self to dest, for example if
they reside on different drives in Windows, then this returns
dest.abspath().
"""
origin = self.abspath()
dest = self.__class__(dest).abspath()
orig_list = origin.normcase().splitall()
# Don't normcase dest! We want to preserve the case.
dest_list = dest.splitall()
if orig_list[0] != os.path.normcase(dest_list[0]):
# Can't get here from there.
return dest
# Find the location where the two paths start to differ.
i = 0
for start_seg, dest_seg in zip(orig_list, dest_list):
if start_seg != os.path.normcase(dest_seg):
break
i += 1
# Now i is the point where the two paths diverge.
# Need a certain number of "os.pardir"s to work up
# from the origin to the point of divergence.
segments = [os.pardir] * (len(orig_list) - i)
# Need to add the diverging part of dest_list.
segments += dest_list[i:]
if len(segments) == 0:
# If they happen to be identical, use os.curdir.
relpath = os.curdir
else:
relpath = os.path.join(*segments)
return self.__class__(relpath)
# --- Listing, searching, walking, and matching
def listdir(self, pattern=None):
""" D.listdir() -> List of items in this directory.
Use D.files() or D.dirs() instead if you want a listing
of just files or just subdirectories.
The elements of the list are path objects.
With the optional 'pattern' argument, this only lists
items whose names match the given pattern.
"""
names = os.listdir(self)
if pattern is not None:
names = fnmatch.filter(names, pattern)
return [self / child for child in names]
def dirs(self, pattern=None):
""" D.dirs() -> List of this directory's subdirectories.
The elements of the list are path objects.
This does not walk recursively into subdirectories
(but see path.walkdirs).
With the optional 'pattern' argument, this only lists
directories whose names match the given pattern. For
example, d.dirs('build-*').
"""
return [p for p in self.listdir(pattern) if p.isdir()]
def files(self, pattern=None):
""" D.files() -> List of the files in this directory.
The elements of the list are path objects.
This does not walk into subdirectories (see path.walkfiles).
With the optional 'pattern' argument, this only lists files
whose names match the given pattern. For example,
d.files('*.pyc').
"""
return [p for p in self.listdir(pattern) if p.isfile()]
def walk(self, pattern=None, errors='strict'):
""" D.walk() -> iterator over files and subdirs, recursively.
The iterator yields path objects naming each child item of
this directory and its descendants. This requires that
D.isdir().
This performs a depth-first traversal of the directory tree.
Each directory is returned just before all its children.
The errors= keyword argument controls behavior when an
error occurs. The default is 'strict', which causes an
exception. The other allowed values are 'warn', which
reports the error via warnings.warn(), and 'ignore'.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
childList = self.listdir()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in childList:
if pattern is None or child.fnmatch(pattern):
yield child
try:
isdir = child.isdir()
except Exception:
if errors == 'ignore':
isdir = False
elif errors == 'warn':
warnings.warn(
"Unable to access '%s': %s"
% (child, sys.exc_info()[1]),
TreeWalkWarning)
isdir = False
else:
raise
if isdir:
for item in child.walk(pattern, errors):
yield item
def walkdirs(self, pattern=None, errors='strict'):
""" D.walkdirs() -> iterator over subdirs, recursively.
With the optional 'pattern' argument, this yields only
directories whose names match the given pattern. For
example, mydir.walkdirs('*test') yields only directories
with names ending in 'test'.
The errors= keyword argument controls behavior when an
error occurs. The default is 'strict', which causes an
exception. The other allowed values are 'warn', which
reports the error via warnings.warn(), and 'ignore'.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
dirs = self.dirs()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in dirs:
if pattern is None or child.fnmatch(pattern):
yield child
for subsubdir in child.walkdirs(pattern, errors):
yield subsubdir
def walkfiles(self, pattern=None, errors='strict'):
""" D.walkfiles() -> iterator over files in D, recursively.
The optional argument, pattern, limits the results to files
with names that match the pattern. For example,
mydir.walkfiles('*.tmp') yields only files with the .tmp
extension.
"""
if errors not in ('strict', 'warn', 'ignore'):
raise ValueError("invalid errors parameter")
try:
childList = self.listdir()
except Exception:
if errors == 'ignore':
return
elif errors == 'warn':
warnings.warn(
"Unable to list directory '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
return
else:
raise
for child in childList:
try:
isfile = child.isfile()
isdir = not isfile and child.isdir()
except:
if errors == 'ignore':
continue
elif errors == 'warn':
warnings.warn(
"Unable to access '%s': %s"
% (self, sys.exc_info()[1]),
TreeWalkWarning)
continue
else:
raise
if isfile:
if pattern is None or child.fnmatch(pattern):
yield child
elif isdir:
for f in child.walkfiles(pattern, errors):
yield f
def fnmatch(self, pattern):
""" Return True if self.name matches the given pattern.
pattern - A filename pattern with wildcards,
for example '*.py'.
"""
return fnmatch.fnmatch(self.name, pattern)
def glob(self, pattern):
""" Return a list of path objects that match the pattern.
pattern - a path relative to this directory, with wildcards.
For example, path('/users').glob('*/bin/*') returns a list
of all the files users have in their bin directories.
"""
cls = self.__class__
return [cls(s) for s in glob.glob(_base(self / pattern))]
# --- Reading or writing an entire file at once.
def open(self, mode='r'):
""" Open this file. Return a file object. """
return file(self, mode)
def bytes(self):
""" Open this file, read all bytes, return them as a string. """
f = self.open('rb')
try:
return f.read()
finally:
f.close()
def write_bytes(self, bytes, append=False):
""" Open this file and write the given bytes to it.
Default behavior is to overwrite any existing file.
Call p.write_bytes(bytes, append=True) to append instead.
"""
if append:
mode = 'ab'
else:
mode = 'wb'
f = self.open(mode)
try:
f.write(bytes)
finally:
f.close()
def text(self, encoding=None, errors='strict'):
r""" Open this file, read it in, return the content as a string.
This uses 'U' mode in Python 2.3 and later, so '\r\n' and '\r'
are automatically translated to '\n'.
Optional arguments:
encoding - The Unicode encoding (or character set) of
the file. If present, the content of the file is
decoded and returned as a unicode object; otherwise
it is returned as an 8-bit str.
errors - How to handle Unicode errors; see help(str.decode)
for the options. Default is 'strict'.
"""
if encoding is None:
# 8-bit
f = self.open(_textmode)
try:
return f.read()
finally:
f.close()
else:
# Unicode
f = codecs.open(self, 'r', encoding, errors)
# (Note - Can't use 'U' mode here, since codecs.open
# doesn't support 'U' mode, even in Python 2.3.)
try:
t = f.read()
finally:
f.close()
return (t.replace(u'\r\n', u'\n')
.replace(u'\r\x85', u'\n')
.replace(u'\r', u'\n')
.replace(u'\x85', u'\n')
.replace(u'\u2028', u'\n'))
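# Sketch: text() decodes and normalizes line endings in one step
# ('notes.txt' and the encoding are illustrative):
#
#     content = path('notes.txt').text(encoding='utf-8')
#     assert u'\r' not in content   # every ending collapsed to '\n'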
def write_text(self, text, encoding=None, errors='strict', linesep=os.linesep, append=False):
r""" Write the given text to this file.
The default behavior is to overwrite any existing file;
to append instead, use the 'append=True' keyword argument.
There are two differences between path.write_text() and
path.write_bytes(): newline handling and Unicode handling.
See below.
Parameters:
- text - str/unicode - The text to be written.
- encoding - str - The Unicode encoding that will be used.
This is ignored if 'text' isn't a Unicode string.
- errors - str - How to handle Unicode encoding errors.
Default is 'strict'. See help(unicode.encode) for the
options. This is ignored if 'text' isn't a Unicode
string.
- linesep - keyword argument - str/unicode - The sequence of
characters to be used to mark end-of-line. The default is
os.linesep. You can also specify None; this means to
leave all newlines as they are in 'text'.
- append - keyword argument - bool - Specifies what to do if
the file already exists (True: append to the end of it;
False: overwrite it.) The default is False.
--- Newline handling.
write_text() converts all standard end-of-line sequences
('\n', '\r', and '\r\n') to your platform's default end-of-line
sequence (see os.linesep; on Windows, for example, the
end-of-line marker is '\r\n').
If you don't like your platform's default, you can override it
using the 'linesep=' keyword argument. If you specifically want
write_text() to preserve the newlines as-is, use 'linesep=None'.
This applies to Unicode text the same as to 8-bit text, except
there are three additional standard Unicode end-of-line sequences:
u'\x85', u'\r\x85', and u'\u2028'.
(This is slightly different from when you open a file for
writing with fopen(filename, "w") in C or file(filename, 'w')
in Python.)
--- Unicode
If 'text' isn't Unicode, then apart from newline handling, the
bytes are written verbatim to the file. The 'encoding' and
'errors' arguments are not used and must be omitted.
If 'text' is Unicode, it is first converted to bytes using the
specified 'encoding' (or the default encoding if 'encoding'
isn't specified). The 'errors' argument applies only to this
conversion.
"""
if isinstance(text, unicode):
if linesep is not None:
# Convert all standard end-of-line sequences to
# ordinary newline characters.
text = (text.replace(u'\r\n', u'\n')
.replace(u'\r\x85', u'\n')
.replace(u'\r', u'\n')
.replace(u'\x85', u'\n')
.replace(u'\u2028', u'\n'))
text = text.replace(u'\n', linesep)
if encoding is None:
encoding = sys.getdefaultencoding()
bytes = text.encode(encoding, errors)
else:
# It is an error to specify an encoding if 'text' is
# an 8-bit string.
assert encoding is None
if linesep is not None:
text = (text.replace('\r\n', '\n')
.replace('\r', '\n'))
bytes = text.replace('\n', linesep)
self.write_bytes(bytes, append)
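# Sketch: round-trip a unicode string, forcing Windows line endings
# on output (the file name is illustrative):
#
#     p = path('out.txt')
#     p.write_text(u'one\ntwo\n', encoding='utf-8', linesep='\r\n')
#     p.bytes()   # -> 'one\r\ntwo\r\n'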
def lines(self, encoding=None, errors='strict', retain=True):
r""" Open this file, read all lines, return them in a list.
Optional arguments:
encoding - The Unicode encoding (or character set) of
the file. The default is None, meaning the content
of the file is read as 8-bit characters and returned
as a list of (non-Unicode) str objects.
errors - How to handle Unicode errors; see help(str.decode)
for the options. Default is 'strict'
retain - If true, retain newline characters; but all newline
character combinations ('\r', '\n', '\r\n') are
translated to '\n'. If false, newline characters are
stripped off. Default is True.
This uses 'U' mode in Python 2.3 and later.
"""
if encoding is None and retain:
f = self.open(_textmode)
try:
return f.readlines()
finally:
f.close()
else:
return self.text(encoding, errors).splitlines(retain)
def write_lines(self, lines, encoding=None, errors='strict',
linesep=os.linesep, append=False):
r""" Write the given lines of text to this file.
By default this overwrites any existing file at this path.
This puts a platform-specific newline sequence on every line.
See 'linesep' below.
lines - A list of strings.
encoding - A Unicode encoding to use. This applies only if
'lines' contains any Unicode strings.
errors - How to handle errors in Unicode encoding. This
also applies only to Unicode strings.
linesep - The desired line-ending. This line-ending is
applied to every line. If a line already has any
standard line ending ('\r', '\n', '\r\n', u'\x85',
u'\r\x85', u'\u2028'), that will be stripped off and
this will be used instead. The default is os.linesep,
which is platform-dependent ('\r\n' on Windows, '\n' on
Unix, etc.) Specify None to write the lines as-is,
like file.writelines().
Use the keyword argument append=True to append lines to the
file. The default is to overwrite the file. Warning:
When you use this with Unicode data, if the encoding of the
existing data in the file is different from the encoding
you specify with the encoding= parameter, the result is
mixed-encoding data, which can really confuse someone trying
to read the file later.
"""
if append:
mode = 'ab'
else:
mode = 'wb'
f = self.open(mode)
try:
for line in lines:
isUnicode = isinstance(line, unicode)
if linesep is not None:
# Strip off any existing line-end and add the
# specified linesep string.
if isUnicode:
if line[-2:] in (u'\r\n', u'\x0d\x85'):
line = line[:-2]
elif line[-1:] in (u'\r', u'\n',
u'\x85', u'\u2028'):
line = line[:-1]
else:
if line[-2:] == '\r\n':
line = line[:-2]
elif line[-1:] in ('\r', '\n'):
line = line[:-1]
line += linesep
if isUnicode:
if encoding is None:
encoding = sys.getdefaultencoding()
line = line.encode(encoding, errors)
f.write(line)
finally:
f.close()
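# Sketch: lines() and write_lines() as a pair -- read, filter,
# rewrite ('app.log' and the DEBUG filter are illustrative):
#
#     p = path('app.log')
#     kept = [l for l in p.lines() if not l.startswith('DEBUG')]
#     p.write_lines(kept)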
def read_md5(self):
""" Calculate the md5 hash for this file.
This reads through the entire file.
"""
f = self.open('rb')
try:
m = hashlib.md5()
while True:
d = f.read(8192)
if not d:
break
m.update(d)
finally:
f.close()
return m.digest()
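# Sketch: read_md5() returns the raw 16-byte digest; hex-encode it
# for display (the file name is illustrative):
#
#     import binascii
#     print binascii.hexlify(path('setup.py').read_md5())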
# --- Methods for querying the filesystem.
exists = os.path.exists
isdir = os.path.isdir
isfile = os.path.isfile
islink = os.path.islink
ismount = os.path.ismount
if hasattr(os.path, 'samefile'):
samefile = os.path.samefile
getatime = os.path.getatime
atime = property(
getatime, None, None,
""" Last access time of the file. """)
getmtime = os.path.getmtime
mtime = property(
getmtime, None, None,
""" Last-modified time of the file. """)
if hasattr(os.path, 'getctime'):
getctime = os.path.getctime
ctime = property(
getctime, None, None,
""" Creation time of the file. """)
getsize = os.path.getsize
size = property(
getsize, None, None,
""" Size of the file, in bytes. """)
if hasattr(os, 'access'):
def access(self, mode):
""" Return true if current user has access to this path.
mode - One of the constants os.F_OK, os.R_OK, os.W_OK, os.X_OK
"""
return os.access(self, mode)
def stat(self):
""" Perform a stat() system call on this path. """
return os.stat(self)
def lstat(self):
""" Like path.stat(), but do not follow symbolic links. """
return os.lstat(self)
def get_owner(self):
r""" Return the name of the owner of this file or directory.
This follows symbolic links.
On Windows, this returns a name of the form ur'DOMAIN\User Name'.
On Windows, a group can own a file or directory.
"""
if os.name == 'nt':
if win32security is None:
raise Exception("path.owner requires win32all to be installed")
desc = win32security.GetFileSecurity(
self, win32security.OWNER_SECURITY_INFORMATION)
sid = desc.GetSecurityDescriptorOwner()
account, domain, typecode = win32security.LookupAccountSid(None, sid)
return domain + u'\\' + account
else:
if pwd is None:
raise NotImplementedError("path.owner is not implemented on this platform.")
st = self.stat()
return pwd.getpwuid(st.st_uid).pw_name
owner = property(
get_owner, None, None,
""" Name of the owner of this file or directory. """)
if hasattr(os, 'statvfs'):
def statvfs(self):
""" Perform a statvfs() system call on this path. """
return os.statvfs(self)
if hasattr(os, 'pathconf'):
def pathconf(self, name):
return os.pathconf(self, name)
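# Sketch: the stat-backed properties together (path illustrative):
#
#     p = path('/var/log/syslog')
#     if p.exists() and p.isfile():
#         print p.size, p.mtime   # bytes, seconds since the epoch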
# --- Modifying operations on files and directories
def utime(self, times):
""" Set the access and modified times of this file. """
os.utime(self, times)
def chmod(self, mode):
os.chmod(self, mode)
if hasattr(os, 'chown'):
def chown(self, uid, gid):
os.chown(self, uid, gid)
def rename(self, new):
os.rename(self, new)
def renames(self, new):
os.renames(self, new)
# --- Create/delete operations on directories
def mkdir(self, mode=0777):
os.mkdir(self, mode)
def makedirs(self, mode=0777):
os.makedirs(self, mode)
def rmdir(self):
os.rmdir(self)
def removedirs(self):
os.removedirs(self)
# --- Modifying operations on files
def touch(self):
""" Set the access/modified times of this file to the current time.
Create the file if it does not exist.
"""
fd = os.open(self, os.O_WRONLY | os.O_CREAT, 0666)
os.close(fd)
os.utime(self, None)
def remove(self):
os.remove(self)
def unlink(self):
os.unlink(self)
# --- Links
if hasattr(os, 'link'):
def link(self, newpath):
""" Create a hard link at 'newpath', pointing to this file. """
os.link(self, newpath)
if hasattr(os, 'symlink'):
def symlink(self, newlink):
""" Create a symbolic link at 'newlink', pointing here. """
os.symlink(self, newlink)
if hasattr(os, 'readlink'):
def readlink(self):
""" Return the path to which this symbolic link points.
The result may be an absolute or a relative path.
"""
return self.__class__(os.readlink(self))
def readlinkabs(self):
""" Return the path to which this symbolic link points.
The result is always an absolute path.
"""
p = self.readlink()
if p.isabs():
return p
else:
return (self.parent / p).abspath()
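# Sketch (POSIX only): create a symlink and resolve it back to an
# absolute target ('release-1.0' and 'current' are illustrative):
#
#     path('release-1.0').symlink('current')   # current -> release-1.0
#     print path('current').readlinkabs()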
# --- High-level functions from shutil
copyfile = shutil.copyfile
copymode = shutil.copymode
copystat = shutil.copystat
copy = shutil.copy
copy2 = shutil.copy2
copytree = shutil.copytree
if hasattr(shutil, 'move'):
move = shutil.move
rmtree = shutil.rmtree
# --- Special stuff from os
if hasattr(os, 'chroot'):
def chroot(self):
os.chroot(self)
if hasattr(os, 'startfile'):
def startfile(self):
os.startfile(self)

View File

@ -1,121 +0,0 @@
import sys, glob, re
import mechanize
URL = 'http://translate.google.com/translate_t?text=%(text)s&langpair=en|%(lang)s&oe=UTF8'
def browser():
opener = mechanize.Browser()
opener.set_handle_refresh(True)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
return opener
class PoFile(object):
SANITIZE = re.compile(r'&|<[^<>]+>|\%')
STRING = re.compile(r'"(.*)"')
def __init__(self, po_file):
self.po_file = open(po_file, 'r+b')
self.browser = browser()
self.entries = []
self.read()
def sanitize_line(self, line):
return self.SANITIZE.sub('', line)  # sub() requires a replacement argument
def read(self):
translated_lines = []
self.po_file.seek(0)
ID = 0
STR = 1
WHR = 2
mode = None
where, msgid, msgstr, fuzzy = [], [], [], False
for line in self.po_file.readlines():
prev_mode = mode
if line.startswith('#:'):
mode = WHR
elif line.startswith('msgid'):
mode = ID
elif line.startswith('msgstr'):
mode = STR
elif line.startswith('#,'):
fuzzy = True
continue
elif line.startswith('#') or not line.strip():
mode = None
if mode != prev_mode:
if prev_mode == STR:
self.add_entry(where, fuzzy, msgid, msgstr)
where, msgid, msgstr, fuzzy = [], [], [], False
if mode == WHR:
where.append(line[2:].strip())
elif mode == ID:
msgid.append(self.get_string(line))
elif mode == STR:
msgstr.append(self.get_string(line))
elif mode is None:
self.add_line(line)
# after the loop: flush the final entry, which would otherwise be
# lost when the file ends while still inside a msgstr block
if mode == STR:
self.add_entry(where, fuzzy, msgid, msgstr)
def get_string(self, line):
return self.STRING.search(line).group(1)
def add_line(self, line):
self.entries.append(line.strip())
def add_entry(self, where, fuzzy, msgid, msgstr):
self.entries.append(Entry(where, fuzzy, msgid, msgstr))
def __str__(self):
return '\n'.join([str(i) for i in self.entries]) + '\n'
class Entry(object):
def __init__(self, where, fuzzy, msgid, msgstr, encoding='utf-8'):
self.fuzzy = fuzzy
self.where = [i.decode(encoding) for i in where]
self.msgid = [i.decode(encoding) for i in msgid]
self.msgstr = [i.decode(encoding) for i in msgstr]
self.encoding = encoding
def __str__(self):
ans = []
for line in self.where:
ans.append('#: ' + line.encode(self.encoding))
if self.fuzzy:
ans.append('#, fuzzy')
first = True
for line in self.msgid:
prefix = 'msgid ' if first else ''
ans.append(prefix + '"%s"'%line.encode(self.encoding))
first = False
first = True
for line in self.msgstr:
prefix = 'msgstr ' if first else ''
ans.append(prefix + '"%s"'%line.encode(self.encoding))
first = False
return '\n'.join(ans)
def main():
po_files = glob.glob('*.po')
for po_file in po_files:
PoFile(po_file)
pass
if __name__ == '__main__':
pof = PoFile('de.po')
open('/tmp/de.po', 'wb').write(str(pof))
#sys.exit(main())
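# For reference, the shape of a .po entry the parser above walks:
# '#:' location lines (WHR mode), an optional '#, fuzzy' flag, then
# the msgid/msgstr string blocks (the entry below is illustrative):
#
#   #: src/calibre/gui2/ui.py:42
#   #, fuzzy
#   msgid "Preferences"
#   msgstr "Einstellungen"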

View File

@ -15,7 +15,10 @@ def available_translations():
global _available_translations
if _available_translations is None:
stats = P('localization/stats.pickle')
stats = cPickle.load(open(stats, 'rb'))
if os.path.exists(stats):
stats = cPickle.load(open(stats, 'rb'))
else:
stats = {}
_available_translations = [x for x in stats if stats[x] > 0.1]
return _available_translations

View File

@ -85,7 +85,7 @@ __all__ = [
'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno',
'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral',
'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables',
'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
'indentedBlock', 'originalTextFor',
@ -425,7 +425,7 @@ class ParseResults(object):
self[k] = v
if isinstance(v[0],ParseResults):
v[0].__parent = wkref(self)
self.__toklist += other.__toklist
self.__accumNames.update( other.__accumNames )
del other
@ -3231,12 +3231,12 @@ def originalTextFor(expr, asString=True):
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
revert separate tokens with intervening whitespace back to the original matching
input text. Simpler to use than the parse action keepOriginalText, and does not
require the inspect module to chase up the call stack. By default, returns a
string containing the original parsed text.
If the optional asString argument is passed as False, then the return value is a
ParseResults containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
require the inspect module to chase up the call stack. By default, returns a
string containing the original parsed text.
If the optional asString argument is passed as False, then the return value is a
ParseResults containing any results names that were originally matched, and a
single token containing the original matched text from the input string. So if
the expression passed to originalTextFor contains expressions with defined
results names, you must set asString to False if you want to preserve those
results name values."""
@ -3252,7 +3252,7 @@ def originalTextFor(expr, asString=True):
del t["_original_end"]
matchExpr.setParseAction(extractText)
return matchExpr
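# Sketch of originalTextFor in use (tag name illustrative):
#
#     start, end = makeHTMLTags('b')
#     bold = originalTextFor(start + SkipTo(end) + end)
#     bold.parseString('<b>bold text</b>')[0]   # -> '<b>bold text</b>'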
# convenience constants for positional expressions
empty = Empty().setName("empty")
lineStart = LineStart().setName("lineStart")
@ -3532,7 +3532,7 @@ def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
).setParseAction(lambda t:t[0].strip()))
else:
if ignoreExpr is not None:
content = (Combine(OneOrMore(~ignoreExpr +
content = (Combine(OneOrMore(~ignoreExpr +
~Literal(opener) + ~Literal(closer) +
CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1))
).setParseAction(lambda t:t[0].strip()))

View File

@ -20,6 +20,7 @@ class WriteXmlMixin:
def to_xml(self, encoding = "iso-8859-1"):
try:
import cStringIO as StringIO
StringIO
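# the bare name reference marks the import as used, so the new
# pyflakes-based check does not flag it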
except ImportError:
import StringIO
f = StringIO.StringIO()
@ -64,7 +65,7 @@ def _format_date(dt):
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
dt.year, dt.hour, dt.minute, dt.second)
##
# A couple simple wrapper objects for the fields which
# take a simple value other than a string.
@ -72,7 +73,7 @@ class IntElement:
"""implements the 'publish' API for integers
Takes the tag name and the integer value to publish.
(Could be used for anything which uses str() to be published
to text for XML.)
"""
@ -138,7 +139,7 @@ class Image:
self.width = width
self.height = height
self.description = description
def publish(self, handler):
handler.startElement("image", self.element_attrs)
@ -150,7 +151,7 @@ class Image:
if isinstance(width, int):
width = IntElement("width", width)
_opt_element(handler, "width", width)
height = self.height
if isinstance(height, int):
height = IntElement("height", height)
@ -196,7 +197,7 @@ class TextInput:
_element(handler, "name", self.name)
_element(handler, "link", self.link)
handler.endElement("textInput")
class Enclosure:
"""Publish an enclosure"""
@ -255,7 +256,7 @@ class RSS2(WriteXmlMixin):
Stores the channel attributes, with the "category" elements under
".categories" and the RSS items under ".items".
"""
rss_attrs = {"version": "2.0"}
element_attrs = {}
def __init__(self,
@ -269,7 +270,7 @@ class RSS2(WriteXmlMixin):
webMaster = None,
pubDate = None, # a datetime, *in* *GMT*
lastBuildDate = None, # a datetime
categories = None, # list of strings or Category
generator = _generator_name,
docs = "http://blogs.law.harvard.edu/tech/rss",
@ -294,7 +295,7 @@ class RSS2(WriteXmlMixin):
self.webMaster = webMaster
self.pubDate = pubDate
self.lastBuildDate = lastBuildDate
if categories is None:
categories = []
self.categories = categories
@ -320,7 +321,7 @@ class RSS2(WriteXmlMixin):
_element(handler, "description", self.description)
self.publish_extensions(handler)
_opt_element(handler, "language", self.language)
_opt_element(handler, "copyright", self.copyright)
_opt_element(handler, "managingEditor", self.managingEditor)
@ -374,8 +375,8 @@ class RSS2(WriteXmlMixin):
# output after the three required fields.
pass
class RSSItem(WriteXmlMixin):
"""Publish an RSS Item"""
element_attrs = {}
@ -391,7 +392,7 @@ class RSSItem(WriteXmlMixin):
pubDate = None, # a datetime
source = None, # a Source
):
if title is None and description is None:
raise TypeError(
"must define at least one of 'title' or 'description'")
@ -421,7 +422,7 @@ class RSSItem(WriteXmlMixin):
if isinstance(category, basestring):
category = Category(category)
category.publish(handler)
_opt_element(handler, "comments", self.comments)
if self.enclosure is not None:
self.enclosure.publish(handler)
@ -434,7 +435,7 @@ class RSSItem(WriteXmlMixin):
if self.source is not None:
self.source.publish(handler)
handler.endElement("item")
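# Sketch: minimal use of these classes (all values illustrative):
#
#     import datetime
#     rss = RSS2(title='Demo feed', link='http://example.com/',
#                description='demo',
#                lastBuildDate=datetime.datetime.utcnow(),
#                items=[RSSItem(title='Hello',
#                               link='http://example.com/1')])
#     xml = rss.to_xml('utf-8')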
def publish_extensions(self, handler):

View File

@ -57,13 +57,13 @@ recipe_modules = ['recipe_' + r for r in (
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga',
'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem',
'the_new_republic',
)]
import re, imp, inspect, time, os
from calibre.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe, AutomaticNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.path import path
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre import __appname__, english_sort
@ -102,8 +102,8 @@ def compile_recipe(src):
'''
global _tdir, _crep
if _tdir is None or not os.path.exists(_tdir):
_tdir = path(PersistentTemporaryDirectory('_recipes'))
temp = _tdir/('recipe%d.py'%_crep)
_tdir = PersistentTemporaryDirectory('_recipes')
temp = os.path.join(_tdir, 'recipe%d.py'%_crep)
_crep += 1
if not isinstance(src, unicode):
match = re.search(r'coding[:=]\s*([-\w.]+)', src[:200])
@ -118,8 +118,9 @@ def compile_recipe(src):
src = src.replace('from libprs500', 'from calibre').encode('utf-8')
f.write(src)
f.close()
module = imp.find_module(temp.namebase, [temp.dirname()])
module = imp.load_module(temp.namebase, *module)
module = imp.find_module(os.path.splitext(os.path.basename(temp))[0],
[os.path.dirname(temp)])
module = imp.load_module(os.path.splitext(os.path.basename(temp))[0], *module)
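# Sketch of the imp-based load used above: derive the module name
# from the file, locate it in its directory, then load it
# (the path is illustrative):
#
#     import imp, os
#     name = os.path.splitext(os.path.basename('/tmp/recipe1.py'))[0]
#     found = imp.find_module(name, ['/tmp'])
#     mod = imp.load_module(name, *found)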
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and \
issubclass(x, (BasicNewsRecipe,)) and \
@ -148,6 +149,7 @@ _titles.sort(cmp=english_sort)
titles = _titles
def migrate_automatic_profile_to_automatic_recipe(profile):
BeautifulSoup
oprofile = profile
profile = compile_recipe(profile)
if 'BasicUserProfile' not in profile.__name__:
@ -165,3 +167,4 @@ class BasicUserRecipe%d(AutomaticNewsRecipe):
'''%(int(time.time()), repr(profile.title), profile.oldest_article,
profile.max_articles_per_feed, profile.summary_length, repr(profile.feeds))

View File

@ -1,61 +1,61 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.hr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Cro24Sata(BasicNewsRecipe):
title = '24 Sata - Hr'
__author__ = 'Darko Miletic'
description = "News Portal from Croatia"
publisher = '24sata.hr'
category = 'news, politics, Croatia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.hr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Cro24Sata(BasicNewsRecipe):
title = '24 Sata - Hr'
__author__ = 'Darko Miletic'
description = "News Portal from Croatia"
publisher = '24sata.hr'
category = 'news, politics, Croatia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'hr'
lang = 'hr-HR'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [
dict(name=['object','link','embed'])
,dict(name='table', attrs={'class':'enumbox'})
]
feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
def print_version(self, url):
return url + '&action=ispis'
lang = 'hr-HR'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [
dict(name=['object','link','embed'])
,dict(name='table', attrs={'class':'enumbox'})
]
feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
def print_version(self, url):
return url + '&action=ispis'

View File

@ -1,68 +1,68 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.rs
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Ser24Sata(BasicNewsRecipe):
title = '24 Sata - Sr'
__author__ = 'Darko Miletic'
description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.rs
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Ser24Sata(BasicNewsRecipe):
title = '24 Sata - Sr'
__author__ = 'Darko Miletic'
description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
def print_version(self, url):
article = url.partition('#')[0]
article_id = article.partition('id=')[2]
return 'http://www.24sata.rs/_print.php?id=' + article_id
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
def print_version(self, url):
article = url.partition('#')[0]
article_id = article.partition('id=')[2]
return 'http://www.24sata.rs/_print.php?id=' + article_id

View File

@ -1,72 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SieteDias(BasicNewsRecipe):
title = '7 dias'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, show, Argentina'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SieteDias(BasicNewsRecipe):
title = '7 dias'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, show, Argentina'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es'
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=125&Content-Type=text/xml&ChannelDesc=7%20D%C3%ADas')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=125&Content-Type=text/xml&ChannelDesc=7%20D%C3%ADas')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img

View File

@ -1,59 +1,59 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.accountancyage.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class AccountancyAge(BasicNewsRecipe):
title = 'Accountancy Age'
__author__ = 'Darko Miletic'
description = 'business news'
publisher = 'accountancyage.com'
category = 'news, politics, finances'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
simultaneous_downloads = 1
encoding = 'utf-8'
lang = 'en'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.accountancyage.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class AccountancyAge(BasicNewsRecipe):
title = 'Accountancy Age'
__author__ = 'Darko Miletic'
description = 'business news'
publisher = 'accountancyage.com'
category = 'news, politics, finances'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
simultaneous_downloads = 1
encoding = 'utf-8'
lang = 'en'
language = 'en'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'bodycol'})]
remove_tags = [dict(name=['embed','object'])]
remove_tags_after = dict(name='div', attrs={'id':'permalink'})
remove_tags_before = dict(name='div', attrs={'class':'gap6'})
feeds = [(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')]
def print_version(self, url):
rest, sep, miss = url.rpartition('/')
rr, ssep, artid = rest.rpartition('/')
return u'http://www.accountancyage.com/articles/print/' + artid
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'bodycol'})]
remove_tags = [dict(name=['embed','object'])]
remove_tags_after = dict(name='div', attrs={'id':'permalink'})
remove_tags_before = dict(name='div', attrs={'class':'gap6'})
feeds = [(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')]
def print_version(self, url):
rest, sep, miss = url.rpartition('/')
rr, ssep, artid = rest.rpartition('/')
return u'http://www.accountancyage.com/articles/print/' + artid
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)

View File

@ -1,77 +1,77 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.adventuregamers.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AdventureGamers(BasicNewsRecipe):
title = u'Adventure Gamers'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.adventuregamers.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AdventureGamers(BasicNewsRecipe):
title = u'Adventure Gamers'
language = 'en'
__author__ = 'Darko Miletic'
description = 'Adventure games portal'
publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology'
__author__ = 'Darko Miletic'
description = 'Adventure games portal'
publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology'
language = 'en'
oldest_article = 10
delay = 10
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1252'
remove_javascript = True
use_embedded_content = False
INDEX = u'http://www.adventuregamers.com'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'class':'content_middle'})
]
remove_tags = [
dict(name=['object','link','embed','form'])
,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']})
]
remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
def get_article_url(self, article):
return article.get('guid', None)
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'class':'toolbar_fat_next'})
if pager:
nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'bodytext'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
self.append_page(soup, soup.body, 3)
pager = soup.find('div',attrs={'class':'toolbar_fat'})
if pager:
pager.extract()
return soup
oldest_article = 10
delay = 10
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1252'
remove_javascript = True
use_embedded_content = False
INDEX = u'http://www.adventuregamers.com'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'class':'content_middle'})
]
remove_tags = [
dict(name=['object','link','embed','form'])
,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']})
]
remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
def get_article_url(self, article):
return article.get('guid', None)
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'class':'toolbar_fat_next'})
if pager:
nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'bodytext'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
self.append_page(soup, soup.body, 3)
pager = soup.find('div',attrs={'class':'toolbar_fat'})
if pager:
pager.extract()
return soup

View File

@ -1,62 +1,61 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
ambito.com
'''
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
ambito.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ambito(BasicNewsRecipe):
title = 'Ambito.com'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'Ambito.com'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'iso-8859-1'
cover_url = 'http://www.ambito.com/img/logo_.jpg'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [
class Ambito(BasicNewsRecipe):
title = 'Ambito.com'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'Ambito.com'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'iso-8859-1'
cover_url = 'http://www.ambito.com/img/logo_.jpg'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link'])]
feeds = [
(u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' )
,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' )
,(u'Politica' , u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica' )
,(u'Informacion General' , u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General')
,(u'Agro' , u'http://www.ambito.com/rss/noticias.asp?S=Agro' )
,(u'Internacionales' , u'http://www.ambito.com/rss/noticias.asp?S=Internacionales' )
,(u'Deportes' , u'http://www.ambito.com/rss/noticias.asp?S=Deportes' )
,(u'Espectaculos' , u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos' )
,(u'Tecnologia' , u'http://www.ambito.com/rss/noticias.asp?S=Tecnologia' )
,(u'Salud' , u'http://www.ambito.com/rss/noticias.asp?S=Salud' )
,(u'Ambito Nacional' , u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional' )
]
def print_version(self, url):
return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link'])]
feeds = [
(u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' )
,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' )
,(u'Politica' , u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica' )
,(u'Informacion General' , u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General')
,(u'Agro' , u'http://www.ambito.com/rss/noticias.asp?S=Agro' )
,(u'Internacionales' , u'http://www.ambito.com/rss/noticias.asp?S=Internacionales' )
,(u'Deportes' , u'http://www.ambito.com/rss/noticias.asp?S=Deportes' )
,(u'Espectaculos' , u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos' )
,(u'Tecnologia' , u'http://www.ambito.com/rss/noticias.asp?S=Tecnologia' )
,(u'Salud' , u'http://www.ambito.com/rss/noticias.asp?S=Salud' )
,(u'Ambito Nacional' , u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional' )
]
def print_version(self, url):
return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = 'es'

View File

@ -1,55 +1,55 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator'
__author__ = 'Darko Miletic'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator'
__author__ = 'Darko Miletic'
language = 'en'
description = 'News from USA'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
INDEX = 'http://spectator.org'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, politics, USA'
, '--publisher' , title
]
keep_only_tags = [
dict(name='div', attrs={'class':'post inner'})
,dict(name='div', attrs={'class':'author-bio'})
]
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':'col3' })
,dict(name='div', attrs={'class':'post-options' })
,dict(name='p' , attrs={'class':'letter-editor'})
,dict(name='div', attrs={'class':'social' })
]
feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
link_item = soup.find('a',attrs={'class':'cover'})
if link_item:
soup2 = self.index_to_soup(link_item['href'])
link_item2 = soup2.find('div',attrs={'class':'post inner issues'})
cover_url = self.INDEX + link_item2.img['src']
return cover_url
def print_version(self, url):
return url + '/print'
description = 'News from USA'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
INDEX = 'http://spectator.org'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, politics, USA'
, '--publisher' , title
]
keep_only_tags = [
dict(name='div', attrs={'class':'post inner'})
,dict(name='div', attrs={'class':'author-bio'})
]
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':'col3' })
,dict(name='div', attrs={'class':'post-options' })
,dict(name='p' , attrs={'class':'letter-editor'})
,dict(name='div', attrs={'class':'social' })
]
feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
link_item = soup.find('a',attrs={'class':'cover'})
if link_item:
soup2 = self.index_to_soup(link_item['href'])
link_item2 = soup2.find('div',attrs={'class':'post inner issues'})
cover_url = self.INDEX + link_item2.img['src']
return cover_url
def print_version(self, url):
return url + '/print'

View File

@ -1,62 +1,62 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
axxon.com.ar
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Axxon_news(BasicNewsRecipe):
title = 'Axxon noticias'
__author__ = 'Darko Miletic'
description = 'Axxon, Ciencia Ficcion en Bits'
publisher = 'Axxon'
category = 'news, SF, Argentina, science, movies'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
axxon.com.ar
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Axxon_news(BasicNewsRecipe):
title = 'Axxon noticias'
__author__ = 'Darko Miletic'
description = 'Axxon, Ciencia Ficcion en Bits'
publisher = 'Axxon'
category = 'news, SF, Argentina, science, movies'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
language = 'es'
lang = 'es-AR'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [dict(name=['object','link','iframe','embed'])]
feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')]
remove_attributes = ['style','width','height','font','border','align']
def adeify_images2(cls, soup):
for item in soup.findAll('img'):
for attrib in ['height','width','border','align','style']:
if item.has_key(attrib):
del item[attrib]
oldParent = item.parent
if oldParent.name == 'a':
oldParent.name = 'p'  # assignment, not comparison: demote the wrapping <a> to <p>
myIndex = oldParent.contents.index(item)
brtag = Tag(soup,'br')
oldParent.insert(myIndex+1,brtag)
return soup
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.html.insert(0,mlang)
return self.adeify_images2(soup)
lang = 'es-AR'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [dict(name=['object','link','iframe','embed'])]
feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')]
remove_attributes = ['style','width','height','font','border','align']
def adeify_images2(cls, soup):
for item in soup.findAll('img'):
for attrib in ['height','width','border','align','style']:
if item.has_key(attrib):
del item[attrib]
oldParent = item.parent
if oldParent.name == 'a':
oldParent.name = 'p'  # assignment, not comparison: demote the wrapping <a> to <p>
myIndex = oldParent.contents.index(item)
brtag = Tag(soup,'br')
oldParent.insert(myIndex+1,brtag)
return soup
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.html.insert(0,mlang)
return self.adeify_images2(soup)

View File

@ -1,65 +1,65 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.azstarnet.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Azstarnet(BasicNewsRecipe):
title = 'Arizona Daily Star'
__author__ = 'Darko Miletic'
description = 'news from Arizona'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.azstarnet.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Azstarnet(BasicNewsRecipe):
title = 'Arizona Daily Star'
__author__ = 'Darko Miletic'
description = 'news from Arizona'
language = 'en'
publisher = 'azstarnet.com'
category = 'news, politics, Arizona, USA'
delay = 1
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
needs_subscription = True
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://azstarnet.com/registration/retro.php')
br.select_form(nr=1)
br['email'] = self.username
br['pass' ] = self.password
br.submit()
return br
keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'class':'bannerinstory'})
]
feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')]
def preprocess_html(self, soup):
soup.html['dir' ] = 'ltr'
soup.html['lang'] = 'en-US'
mtag = '\n<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
publisher = 'azstarnet.com'
category = 'news, politics, Arizona, USA'
delay = 1
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
needs_subscription = True
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://azstarnet.com/registration/retro.php')
br.select_form(nr=1)
br['email'] = self.username
br['pass' ] = self.password
br.submit()
return br
keep_only_tags = [dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'class':'bannerinstory'})
]
feeds = [(u'Tucson Region', u'http://rss.azstarnet.com/index.php?site=metro')]
def preprocess_html(self, soup):
soup.html['dir' ] = 'ltr'
soup.html['lang'] = 'en-US'
mtag = '\n<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,69 +1,69 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe):
title = 'B92'
__author__ = 'Darko Miletic'
description = 'Dnevne vesti iz Srbije i sveta'
publisher = 'B92'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1250'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe):
title = 'B92'
__author__ = 'Darko Miletic'
description = 'Dnevne vesti iz Srbije i sveta'
publisher = 'B92'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1250'
language = 'sr'
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='table', attrs={'class':'maindocument'})]
remove_tags = [
dict(name='ul', attrs={'class':'comment-nav'})
,dict(name=['embed','link','base'] )
,dict(name='div', attrs={'class':'udokum'} )
]
feeds = [
(u'Vesti', u'http://www.b92.net/info/rss/vesti.xml')
,(u'Biz' , u'http://www.b92.net/info/rss/biz.xml' )
]
def print_version(self, url):
return url + '&version=print'
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll('font'):
item.name='div'
if item.has_key('size'):
del item['size']
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='table', attrs={'class':'maindocument'})]
remove_tags = [
dict(name='ul', attrs={'class':'comment-nav'})
,dict(name=['embed','link','base'] )
,dict(name='div', attrs={'class':'udokum'} )
]
feeds = [
(u'Vesti', u'http://www.b92.net/info/rss/vesti.xml')
,(u'Biz' , u'http://www.b92.net/info/rss/biz.xml' )
]
def print_version(self, url):
return url + '&version=print'
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll('font'):
item.name='div'
if item.has_key('size'):
del item['size']
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
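
The preprocess_html above is B92's table-flattening pass: every table-related tag is renamed to a div and its purely presentational attributes are dropped, so the converter never sees markup it lays out badly. The same loop as a standalone helper (a sketch; soup is the BeautifulSoup document the recipe already works with):

TABLE_TAGS = ['table','td','tr','th','caption','thead','tfoot',
              'tbody','colgroup','col']
LAYOUT_ATTRS = ['style','font','valign','colspan','width','height',
                'rowspan','summary','align','cellspacing','cellpadding',
                'frames','rules','border']

def flatten_tables(soup):
    # rename table-ish tags to plain divs and strip layout-only attributes
    for tag in soup.body.findAll(name=TABLE_TAGS):
        tag.name = 'div'
        for attrib in LAYOUT_ATTRS:
            if tag.has_key(attrib):
                del tag[attrib]
    return soup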

View File

@@ -1,93 +1,93 @@
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Barrons(BasicNewsRecipe):
title = 'Barron\'s'
max_articles_per_feed = 50
needs_subscription = True
##
## web2lrf profile to download articles from Barrons.com
## can download subscriber-only content if username and
## password are supplied.
##
'''
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Barrons(BasicNewsRecipe):
title = 'Barron\'s'
max_articles_per_feed = 50
needs_subscription = True
language = 'en'
__author__ = 'Kovid Goyal'
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False
no_stylesheets = False
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
conversion_options = {'linearize_tables': True}
##delay = 1
## Don't grab articles more than 7 days old
oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove any reprint info from the body of the article.
(r'<hr size.*?<p', lambda match : '<p'),
## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'),
]
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login')
br.select_form(name='login_form')
br['user'] = self.username
br['password'] = self.password
br.submit()
return br
## Use the print version of a page when available.
def print_version(self, url):
return url.replace('/article/', '/article_print/')
## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
def get_feeds(self):
return [
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
]
## Logout of website
## NOT CURRENTLY WORKING
# def cleanup(self):
# try:
# self.browser.set_debug_responses(True)
# import sys, logging
# logger = logging.getLogger("mechanize")
# logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.INFO)
# res = self.browser.open('http://online.barrons.com/logout')
# except:
# import traceback
# traceback.print_exc()
__author__ = 'Kovid Goyal'
description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
timefmt = ' [%a, %b %d, %Y]'
use_embedded_content = False
no_stylesheets = False
match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
conversion_options = {'linearize_tables': True}
##delay = 1
## Don't grab articles more than 7 days old
oldest_article = 7
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
## Remove anything before the body of the article.
(r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
## Remove any insets from the body of the article.
(r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
## Remove any reprint info from the body of the article.
(r'<hr size.*?<p', lambda match : '<p'),
## Remove anything after the end of the article.
(r'<!-- article end.*?</body>', lambda match : '</body>'),
]
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://commerce.barrons.com/auth/login')
br.select_form(name='login_form')
br['user'] = self.username
br['password'] = self.password
br.submit()
return br
## Use the print version of a page when available.
def print_version(self, url):
return url.replace('/article/', '/article_print/')
## Comment out the feeds you don't want retrieved.
## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
def get_feeds(self):
return [
('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
]
## Logout of website
## NOT CURRENTLY WORKING
# def cleanup(self):
# try:
# self.browser.set_debug_responses(True)
# import sys, logging
# logger = logging.getLogger("mechanize")
# logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.INFO)
# res = self.browser.open('http://online.barrons.com/logout')
# except:
# import traceback
# traceback.print_exc()
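
print_version is the cheapest route to clean article HTML: map the reader URL to the site's print URL with plain string surgery, as Barron's does with /article/ -> /article_print/. Two common shapes of that mapping, sketched on hypothetical URLs:

# path substitution, as in the recipe above
def print_version(url):
    return url.replace('/article/', '/article_print/')

# or drop the query string and append a print flag (hypothetical parameter)
def print_version_qs(url):
    main = url.partition('?')[0]
    return main + '?printingPage=true'

print print_version('http://example.com/article/SB123.html')
# -> http://example.com/article_print/SB123.html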

View File

@@ -1,35 +1,35 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Huan Komrade T <huantnh at gmail.com>'
'''
bbc.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BBCVietnamese(BasicNewsRecipe):
title = u'BBC Vietnamese'
__author__ = 'Huan Komrade T'
description = 'Vietnam news and current affairs from the British Broadcasting Corporation'
no_stylesheets = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Huan Komrade T <huantnh at gmail.com>'
'''
bbc.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BBCVietnamese(BasicNewsRecipe):
title = u'BBC Vietnamese'
__author__ = 'Huan Komrade T'
description = 'Vietnam news and current affairs from the British Broadcasting Corporation'
no_stylesheets = True
language = 'vi'
encoding = 'utf-8'
recursions = 0
remove_tags = [dict(name='div', attrs={'class':'footer'})]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
feeds = [
('Index', 'http://www.bbc.co.uk/vietnamese/index.xml'),
('Vietnam', 'http://www.bbc.co.uk/vietnamese/vietnam/index.xml'),
('Business', 'http://www.bbc.co.uk/vietnamese/business/index.xml'),
('Culture', 'http://www.bbc.co.uk/vietnamese/culture/index.xml'),
('Football', 'http://www.bbc.co.uk/vietnamese/football/index.xml'),
('Forum', 'http://www.bbc.co.uk/vietnamese/forum/index.xml'),
('In Depth', 'http://www.bbc.co.uk/vietnamese/indepth/index.xml'),
]
def print_version(self, url):
return url.replace('http://www.bbc.co.uk/vietnamese/', 'http://www.bbc.co.uk/vietnamese/lg/')
encoding = 'utf-8'
recursions = 0
remove_tags = [dict(name='div', attrs={'class':'footer'})]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
feeds = [
('Index', 'http://www.bbc.co.uk/vietnamese/index.xml'),
('Vietnam', 'http://www.bbc.co.uk/vietnamese/vietnam/index.xml'),
('Business', 'http://www.bbc.co.uk/vietnamese/business/index.xml'),
('Culture', 'http://www.bbc.co.uk/vietnamese/culture/index.xml'),
('Football', 'http://www.bbc.co.uk/vietnamese/football/index.xml'),
('Forum', 'http://www.bbc.co.uk/vietnamese/forum/index.xml'),
('In Depth', 'http://www.bbc.co.uk/vietnamese/indepth/index.xml'),
]
def print_version(self, url):
return url.replace('http://www.bbc.co.uk/vietnamese/', 'http://www.bbc.co.uk/vietnamese/lg/')

View File

@@ -1,51 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe):
title = 'BETA'
__author__ = 'Darko Miletic'
description = 'Novinska Agencija'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Danas(BasicNewsRecipe):
title = 'BETA'
__author__ = 'Darko Miletic'
description = 'Novinska Agencija'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
language = 'sr'
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [
(u'Vesti dana', u'http://www.beta.rs/rssvd.asp')
,(u'Ekonomija' , u'http://www.beta.rs/rssek.asp')
,(u'Sport' , u'http://www.beta.rs/rsssp.asp')
]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [
(u'Vesti dana', u'http://www.beta.rs/rssvd.asp')
,(u'Ekonomija' , u'http://www.beta.rs/rssek.asp')
,(u'Sport' , u'http://www.beta.rs/rsssp.asp')
]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
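
BETA's preprocess_html stamps the page with explicit language and charset metadata so downstream conversion never has to guess the encoding. Tag from calibre's bundled BeautifulSoup takes the soup, a tag name and a list of attribute pairs; a sketch of the same injection as a helper:

from calibre.ebooks.BeautifulSoup import Tag

def add_language_meta(soup, lang):
    # <meta http-equiv="Content-Language" content="..."> first,
    # then the charset declaration
    mlang = Tag(soup, 'meta', [("http-equiv", "Content-Language"),
                               ("content", lang)])
    mcharset = Tag(soup, 'meta', [("http-equiv", "Content-Type"),
                                  ("content", "text/html; charset=utf-8")])
    soup.head.insert(0, mlang)
    soup.head.insert(1, mcharset)
    return soup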

View File

@@ -1,38 +1,37 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe):
title = 'BETA - English'
__author__ = 'Darko Miletic'
description = 'Serbian news agency'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Danas(BasicNewsRecipe):
title = 'BETA - English'
__author__ = 'Darko Miletic'
description = 'Serbian news agency'
publisher = 'Beta'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
language = 'en'
lang = 'en'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
feeds = [(u'News', u'http://www.beta.rs/rssen.asp')]
def preprocess_html(self, soup):
return self.adeify_images(soup)
lang = 'en'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
feeds = [(u'News', u'http://www.beta.rs/rssen.asp')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@@ -1,66 +1,65 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Blic(BasicNewsRecipe):
title = 'Blic'
__author__ = 'Darko Miletic'
description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
publisher = 'RINGIER d.o.o.'
category = 'news, politics, Serbia'
delay = 1
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Blic(BasicNewsRecipe):
title = 'Blic'
__author__ = 'Darko Miletic'
description = 'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
publisher = 'RINGIER d.o.o.'
category = 'news, politics, Serbia'
delay = 1
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
language = 'sr'
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'single_news'})]
feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')]
remove_tags = [dict(name=['object','link'])]
def print_version(self, url):
rest_url = url.partition('?')[2]
return u'http://www.blic.rs/_print.php?' + rest_url
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return self.adeify_images(soup)
def get_article_url(self, article):
raw = article.get('link', None)
return raw.replace('.co.yu','.rs')
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'single_news'})]
feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')]
remove_tags = [dict(name=['object','link'])]
def print_version(self, url):
rest_url = url.partition('?')[2]
return u'http://www.blic.rs/_print.php?' + rest_url
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return self.adeify_images(soup)
def get_article_url(self, article):
raw = article.get('link', None)
return raw.replace('.co.yu','.rs')
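
get_article_url lets a recipe rewrite feed links before anything is fetched; Blic uses it to migrate articles from the stale .co.yu domain to .rs. Note that raw.replace above will raise if a feed item carries no link at all, so a defensive sketch looks like:

def get_article_url(self, article):
    raw = article.get('link', None)
    if raw is None:
        return None            # skip items without a link
    return raw.replace('.co.yu', '.rs')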

View File

@@ -1,95 +1,95 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
borba.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe):
title = 'Borba Online'
__author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba'
category = 'news, politics, Serbia'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
borba.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Borba(BasicNewsRecipe):
title = 'Borba Online'
__author__ = 'Darko Miletic'
description = 'Dnevne novine Borba Online'
publisher = 'IP Novine Borba'
category = 'news, politics, Serbia'
language = 'sr'
lang = 'sr-Latn-RS'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'})
]
feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
lang = 'sr-Latn-RS'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
INDEX = u'http://www.borba.rs/'
extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'class':'main'})]
remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
remove_tags = [
dict(name=['object','link','iframe','base','img'])
,dict(name='div',attrs={'id':'written_comments_title'})
]
feeds = [
(u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
,(u'Prvi plan' , u'http://www.borba.rs/content/blogsection/4/92/' )
,(u'Dogadjaji' , u'http://www.borba.rs/content/blogsection/21/83/' )
,(u'Ekonomija' , u'http://www.borba.rs/content/blogsection/5/35/' )
,(u'Komentari' , u'http://www.borba.rs/content/blogsection/23/94/' )
,(u'Svet' , u'http://www.borba.rs/content/blogsection/7/36/' )
,(u'Sport' , u'http://www.borba.rs/content/blogsection/6/37/' )
,(u'Fama' , u'http://www.borba.rs/content/blogsection/25/89/' )
,(u'B2 Dodatak' , u'http://www.borba.rs/content/blogsection/30/116/')
]
def preprocess_html(self, soup):
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
url = item['href']
title = self.tag_to_string(item)
articles.append({
'title' :title
,'date' :''
,'url' :url
,'description':''
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
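
When a site offers section pages rather than per-section RSS, parse_index builds the feed structure by hand. It must return a list of (feed title, article list) pairs, each article a dict with title/date/url/description keys; Borba harvests the links by CSS class. A condensed sketch of that loop:

def parse_index(self):
    totalfeeds = []
    for feedtitle, feedurl in self.get_feeds():
        soup = self.index_to_soup(feedurl)
        articles = []
        # every anchor with this class on the section page is an article
        for item in soup.findAll('a', attrs={'class': 'contentpagetitle'}):
            articles.append({
                'title'       : self.tag_to_string(item),
                'date'        : '',
                'url'         : item['href'],
                'description' : '',
            })
        totalfeeds.append((feedtitle, articles))
    return totalfeeds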

View File

@@ -1,72 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class BsAsEconomico(BasicNewsRecipe):
title = 'Buenos Aires Economico'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, economy, Argentina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class BsAsEconomico(BasicNewsRecipe):
title = 'Buenos Aires Economico'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, economy, Argentina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es'
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/121/Buenos-Aires-Economico.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=121&Content-Type=text/xml&ChannelDesc=Buenos%20Aires%20Econ%C3%B3mico')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/121/Buenos-Aires-Economico.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=121&Content-Type=text/xml&ChannelDesc=Buenos%20Aires%20Econ%C3%B3mico')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
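
image_url_processor runs on every image URL before download; here it trims the query string back to the bare Id so only that parameter survives. Worked through on a hypothetical URL:

def image_url_processor(baseurl, url):
    # url = 'http://example.com/Impresion.aspx?Id=42&height=600'
    base, sep, rest = url.rpartition('?Id=')  # ('...Impresion.aspx', '?Id=', '42&height=600')
    img, sep2, rrest = rest.partition('&')    # ('42', '&', 'height=600')
    return base + sep + img                   # 'http://example.com/Impresion.aspx?Id=42'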

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
chicagobreakingnews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ChicagoBreakingNews(BasicNewsRecipe):
title = 'Chicago Breaking News'
__author__ = 'Darko Miletic'
description = 'Breaking News from Chicago'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
publisher = 'Chicago Breaking News'
category = 'news, politics, USA, Chicago'
encoding = 'utf8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
chicagobreakingnews.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ChicagoBreakingNews(BasicNewsRecipe):
title = 'Chicago Breaking News'
__author__ = 'Darko Miletic'
description = 'Breaking News from Chicago'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
publisher = 'Chicago Breaking News'
category = 'news, politics, USA, Chicago'
encoding = 'utf8'
language = 'en'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Breaking news', u'http://feeds2.feedburner.com/ChicagoBreakingNews/')]
def preprocess_html(self, soup):
links = soup.findAll('a')
for item in soup.findAll('a'):
if item['href'].find('http://feedads.googleadservices.com') > -1:
item.extract()
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(color=True):
del item['color']
for item in soup.findAll(size=True):
del item['size']
return soup
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Breaking news', u'http://feeds2.feedburner.com/ChicagoBreakingNews/')]
def preprocess_html(self, soup):
links = soup.findAll('a')
for item in soup.findAll('a'):
if item['href'].find('http://feedads.googleadservices.com') > -1:
item.extract()
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(color=True):
del item['color']
for item in soup.findAll(size=True):
del item['size']
return soup
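
FeedBurner-served feeds often smuggle ad anchors into the embedded content; the recipe drops any link pointing at the ad service with extract(), then strips inline style/color/size attributes. A sketch of the ad filter alone, guarding against anchors without an href (which the loop above would trip over):

def remove_feed_ads(soup):
    for a in soup.findAll('a', href=True):
        # remove FeedBurner/Google ad links from the article body
        if a['href'].find('http://feedads.googleadservices.com') > -1:
            a.extract()
    return soup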

View File

@@ -3,11 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from urlparse import urlparse, urlunparse
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from threading import RLock
class ChicagoTribune(BasicNewsRecipe):

View File

@@ -1,73 +1,73 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
clarin.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Clarin(BasicNewsRecipe):
title = 'Clarin'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina y mundo'
publisher = 'Grupo Clarin'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg')
remove_javascript = True
encoding = 'cp1252'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
clarin.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Clarin(BasicNewsRecipe):
title = 'Clarin'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina y mundo'
publisher = 'Grupo Clarin'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg')
remove_javascript = True
encoding = 'cp1252'
language = 'es'
lang = 'es-AR'
direction = 'ltr'
extra_css = ' .Txt{ font-family: sans-serif } .Volan{ font-family: sans-serif; font-size: x-small} .Pie{ font-family: sans-serif; font-size: x-small} .Copete{font-family: sans-serif; font-size: large} .Hora{font-family: sans-serif; font-size: large} .Autor{font-family: sans-serif; font-size: small} '
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
remove_tags = [
dict(name='a' , attrs={'class':'Imp' })
,dict(name='div' , attrs={'class':'Perma' })
,dict(name='h1' , text='Imprimir' )
]
feeds = [
(u'Ultimo Momento', u'http://www.clarin.com/diario/hoy/um/sumariorss.xml')
,(u'El Pais' , u'http://www.clarin.com/diario/hoy/elpais.xml' )
,(u'Opinion' , u'http://www.clarin.com/diario/hoy/opinion.xml' )
,(u'El Mundo' , u'http://www.clarin.com/diario/hoy/elmundo.xml' )
,(u'Sociedad' , u'http://www.clarin.com/diario/hoy/sociedad.xml' )
,(u'La Ciudad' , u'http://www.clarin.com/diario/hoy/laciudad.xml' )
,(u'Policiales' , u'http://www.clarin.com/diario/hoy/policiales.xml' )
,(u'Deportes' , u'http://www.clarin.com/diario/hoy/deportes.xml' )
]
def print_version(self, url):
rest = url.partition('-0')[-1]
lmain = rest.partition('.')[0]
lurl = u'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain
return lurl
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
lang = 'es-AR'
direction = 'ltr'
extra_css = ' .Txt{ font-family: sans-serif } .Volan{ font-family: sans-serif; font-size: x-small} .Pie{ font-family: sans-serif; font-size: x-small} .Copete{font-family: sans-serif; font-size: large} .Hora{font-family: sans-serif; font-size: large} .Autor{font-family: sans-serif; font-size: small} '
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
remove_tags = [
dict(name='a' , attrs={'class':'Imp' })
,dict(name='div' , attrs={'class':'Perma' })
,dict(name='h1' , text='Imprimir' )
]
feeds = [
(u'Ultimo Momento', u'http://www.clarin.com/diario/hoy/um/sumariorss.xml')
,(u'El Pais' , u'http://www.clarin.com/diario/hoy/elpais.xml' )
,(u'Opinion' , u'http://www.clarin.com/diario/hoy/opinion.xml' )
,(u'El Mundo' , u'http://www.clarin.com/diario/hoy/elmundo.xml' )
,(u'Sociedad' , u'http://www.clarin.com/diario/hoy/sociedad.xml' )
,(u'La Ciudad' , u'http://www.clarin.com/diario/hoy/laciudad.xml' )
,(u'Policiales' , u'http://www.clarin.com/diario/hoy/policiales.xml' )
,(u'Deportes' , u'http://www.clarin.com/diario/hoy/deportes.xml' )
]
def print_version(self, url):
rest = url.partition('-0')[-1]
lmain = rest.partition('.')[0]
lurl = u'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain
return lurl
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
climateprogress.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class ClimateProgress(BasicNewsRecipe):
title = 'Climate Progress'
__author__ = 'Darko Miletic'
description = "An insider's view of climate science, politics and solutions"
publisher = 'Climate Progress'
category = 'news, ecology, climate, blog'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
climateprogress.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ClimateProgress(BasicNewsRecipe):
title = 'Climate Progress'
__author__ = 'Darko Miletic'
description = "An insider's view of climate science, politics and solutions"
publisher = 'Climate Progress'
category = 'news, ecology, climate, blog'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
encoding = 'utf-8'
language = 'en'
lang = 'en-US'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds.feedburner.com/climateprogress/lCrX')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
lang = 'en-US'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds.feedburner.com/climateprogress/lCrX')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)

View File

@@ -1,41 +1,41 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.codinghorror.com/blog/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class CodingHorror(BasicNewsRecipe):
title = 'Coding Horror'
__author__ = 'Darko Miletic'
description = 'programming and human factors - Jeff Atwood'
category = 'blog, programming'
publisher = 'Jeff Atwood'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.codinghorror.com/blog/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class CodingHorror(BasicNewsRecipe):
title = 'Coding Horror'
__author__ = 'Darko Miletic'
description = 'programming and human factors - Jeff Atwood'
category = 'blog, programming'
publisher = 'Jeff Atwood'
language = 'en'
author = 'Jeff Atwood'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
, '--author' , author
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nauthors="' + author + '"'
remove_tags = [
dict(name=['object','link'])
,dict(name='div',attrs={'class':'feedflare'})
]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/codinghorror' )]
author = 'Jeff Atwood'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
, '--author' , author
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nauthors="' + author + '"'
remove_tags = [
dict(name=['object','link'])
,dict(name='div',attrs={'class':'feedflare'})
]
feeds = [(u'Articles', u'http://feeds2.feedburner.com/codinghorror' )]

View File

@@ -1,46 +1,46 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.corriere.it/english
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_en(BasicNewsRecipe):
title = 'Corriere della Sera in English'
__author__ = 'Darko Miletic'
description = 'News from Milan and Italy'
oldest_article = 15
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.corriere.it/english
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_en(BasicNewsRecipe):
title = 'Corriere della Sera in English'
__author__ = 'Darko Miletic'
description = 'News from Milan and Italy'
oldest_article = 15
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
language = 'en'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [
dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'})
]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')]
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [
dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'})
]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')]

View File

@@ -1,56 +1,56 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.corriere.it
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_it(BasicNewsRecipe):
title = 'Corriere della Sera'
__author__ = 'Darko Miletic'
description = 'News from Milan and Italy'
oldest_article = 7
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.corriere.it
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_it(BasicNewsRecipe):
title = 'Corriere della Sera'
__author__ = 'Darko Miletic'
description = 'News from Milan and Italy'
oldest_article = 7
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
language = 'it'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [
dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'})
]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [
(u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' )
,(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' )
,(u'Economia' , u'http://www.corriere.it/rss/economia.xml' )
,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml')
,(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' )
,(u'Politica' , u'http://www.corriere.it/rss/politica.xml' )
,(u'Salute' , u'http://www.corriere.it/rss/salute.xml' )
,(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' )
,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml')
,(u'Sport' , u'http://www.corriere.it/rss/sport.xml' )
]
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [
dict(name=['base','object','link','embed','img'])
,dict(name='div', attrs={'class':'news-goback'})
,dict(name='ul', attrs={'class':'toolbar'})
]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [
(u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' )
,(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' )
,(u'Economia' , u'http://www.corriere.it/rss/economia.xml' )
,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml')
,(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' )
,(u'Politica' , u'http://www.corriere.it/rss/politica.xml' )
,(u'Salute' , u'http://www.corriere.it/rss/salute.xml' )
,(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' )
,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml')
,(u'Sport' , u'http://www.corriere.it/rss/sport.xml' )
]
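
These recipes straddle two generations of the options API: the html2lrf_options/html2epub_options pair targets specific output plugins, while the output-neutral conversion_options dict (used by B92, BETA and Danas above) expresses the same intent once. The Corriere options restated in the dict form would look roughly like this sketch:

conversion_options = {
    'comment'            : description
    , 'tags'             : category
    , 'publisher'        : publisher
    , 'linearize_tables' : True
}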

View File

@@ -7,7 +7,6 @@ Courrier International
'''
import re
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe
class CourrierInternational(BasicNewsRecipe):
@@ -21,12 +20,12 @@ class CourrierInternational(BasicNewsRecipe):
no_stylesheets = True
html2lrf_options = ['--base-font-size', '10']
feeds = [
# Some articles requiring subscription fail on download.
('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'),
]
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
[
#Handle Depeches
@@ -35,8 +34,8 @@ class CourrierInternational(BasicNewsRecipe):
(r'.*<td [^>]*>(Courrier international.*?) <td width="10"><img src="/img/espaceur.gif"></td>.*', lambda match : '<html><body><table><tr><td>'+match.group(1)+'</body></html>'),
]
]
def print_version(self, url):
return re.sub('/[a-zA-Z]+\.asp','/imprimer.asp' ,url)
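
A regex variant of the print_version idiom, for sites where the print page replaces the whole script name rather than a path segment. Sketched on a hypothetical URL:

import re

def print_version(url):
    # swap the final .asp page for the site's print script
    return re.sub('/[a-zA-Z]+\.asp', '/imprimer.asp', url)

print print_version('http://example.com/article/lire.asp')
# -> http://example.com/article/imprimer.asp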

View File

@@ -1,62 +1,62 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
criticadigital.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class CriticaDigital(BasicNewsRecipe):
title = 'Critica de la Argentina'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina'
oldest_article = 2
max_articles_per_feed = 100
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
criticadigital.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class CriticaDigital(BasicNewsRecipe):
title = 'Critica de la Argentina'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina'
oldest_article = 2
max_articles_per_feed = 100
language = 'es'
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Argentina'
, '--publisher' , title
]
keep_only_tags = [
dict(name='div', attrs={'class':'bloqueTitulosNoticia'})
,dict(name='div', attrs={'id':'c453-1' })
]
remove_tags = [
dict(name='div', attrs={'class':'box300' })
,dict(name='div', style=True )
,dict(name='div', attrs={'class':'titcomentario'})
,dict(name='div', attrs={'class':'comentario' })
,dict(name='div', attrs={'class':'paginador' })
]
feeds = [
(u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' )
,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' )
,(u'Deportes', u'http://www.criticadigital.com/herramientas/rss.php?ch=deportes' )
,(u'Espectaculos', u'http://www.criticadigital.com/herramientas/rss.php?ch=espectaculos')
,(u'Mundo', u'http://www.criticadigital.com/herramientas/rss.php?ch=mundo' )
,(u'Policiales', u'http://www.criticadigital.com/herramientas/rss.php?ch=policiales' )
,(u'Sociedad', u'http://www.criticadigital.com/herramientas/rss.php?ch=sociedad' )
,(u'Salud', u'http://www.criticadigital.com/herramientas/rss.php?ch=salud' )
,(u'Tecnologia', u'http://www.criticadigital.com/herramientas/rss.php?ch=tecnologia' )
,(u'Santa Fe', u'http://www.criticadigital.com/herramientas/rss.php?ch=santa_fe' )
]
def get_cover_url(self):
cover_url = None
index = 'http://www.criticadigital.com/impresa/'
soup = self.index_to_soup(index)
link_item = soup.find('div',attrs={'class':'tapa'})
if link_item:
cover_url = index + link_item.img['src']
return cover_url
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Argentina'
, '--publisher' , title
]
keep_only_tags = [
dict(name='div', attrs={'class':'bloqueTitulosNoticia'})
,dict(name='div', attrs={'id':'c453-1' })
]
remove_tags = [
dict(name='div', attrs={'class':'box300' })
,dict(name='div', style=True )
,dict(name='div', attrs={'class':'titcomentario'})
,dict(name='div', attrs={'class':'comentario' })
,dict(name='div', attrs={'class':'paginador' })
]
feeds = [
(u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' )
,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' )
,(u'Deportes', u'http://www.criticadigital.com/herramientas/rss.php?ch=deportes' )
,(u'Espectaculos', u'http://www.criticadigital.com/herramientas/rss.php?ch=espectaculos')
,(u'Mundo', u'http://www.criticadigital.com/herramientas/rss.php?ch=mundo' )
,(u'Policiales', u'http://www.criticadigital.com/herramientas/rss.php?ch=policiales' )
,(u'Sociedad', u'http://www.criticadigital.com/herramientas/rss.php?ch=sociedad' )
,(u'Salud', u'http://www.criticadigital.com/herramientas/rss.php?ch=salud' )
,(u'Tecnologia', u'http://www.criticadigital.com/herramientas/rss.php?ch=tecnologia' )
,(u'Santa Fe', u'http://www.criticadigital.com/herramientas/rss.php?ch=santa_fe' )
]
def get_cover_url(self):
cover_url = None
index = 'http://www.criticadigital.com/impresa/'
soup = self.index_to_soup(index)
link_item = soup.find('div',attrs={'class':'tapa'})
if link_item:
cover_url = index + link_item.img['src']
return cover_url
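
get_cover_url scrapes the day's cover off the print edition's index page instead of hard-coding a dated URL (contrast Clarin's strftime-built portada.jpg above). A sketch with a placeholder index:

def get_cover_url(self):
    cover_url = None
    index = 'http://example.com/impresa/'   # placeholder print-edition index
    soup = self.index_to_soup(index)
    link_item = soup.find('div', attrs={'class': 'tapa'})
    if link_item is not None and link_item.img is not None:
        cover_url = index + link_item.img['src']
    return cover_url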

View File

@@ -1,45 +1,44 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
newyorker.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class CubaDebate(BasicNewsRecipe):
title = 'CubaDebate'
__author__ = 'Darko Miletic'
description = 'Contra el Terorismo Mediatico'
oldest_article = 15
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
newyorker.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class CubaDebate(BasicNewsRecipe):
title = 'CubaDebate'
__author__ = 'Darko Miletic'
description = 'Contra el Terorismo Mediatico'
oldest_article = 15
language = 'es'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
publisher = 'Cubadebate'
category = 'news, politics, Cuba'
encoding = 'utf-8'
extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} '
conversion_options = {
'comments' : description
,'tags' : category
,'language' : 'es'
,'publisher' : publisher
,'pretty_print': True
}
keep_only_tags = [dict(name='div', attrs={'id':'Outline'})]
remove_tags_after = dict(name='div',attrs={'id':'BlogContent'})
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')]
def print_version(self, url):
return url + 'print/'
def preprocess_html(self, soup):
return self.adeify_images(soup)
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
publisher = 'Cubadebate'
category = 'news, politics, Cuba'
encoding = 'utf-8'
extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} '
conversion_options = {
'comments' : description
,'tags' : category
,'language' : 'es'
,'publisher' : publisher
,'pretty_print': True
}
keep_only_tags = [dict(name='div', attrs={'id':'Outline'})]
remove_tags_after = dict(name='div',attrs={'id':'BlogContent'})
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')]
def print_version(self, url):
return url + 'print/'
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@@ -1,34 +1,34 @@
from calibre.web.feeds.news import BasicNewsRecipe
class TheDailyMail(BasicNewsRecipe):
title = u'The Daily Mail'
oldest_article = 2
from calibre.web.feeds.news import BasicNewsRecipe
class TheDailyMail(BasicNewsRecipe):
title = u'The Daily Mail'
oldest_article = 2
language = 'en'
author = 'RufusA'
simultaneous_downloads= 1
max_articles_per_feed = 50
extra_css = 'h1 {text-align: left;}'
remove_tags = [ dict(name='ul', attrs={'class':'article-icons-links'}) ]
remove_tags_after = dict(name='h3', attrs={'class':'social-links-title'})
remove_tags_before = dict(name='div', attrs={'id':'content'})
no_stylesheets = True
feeds = [
(u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
(u'News', u'http://www.dailymail.co.uk/news/index.rss'),
(u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
(u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
(u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
(u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
(u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
(u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
(u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
(u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
(u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')]
def print_version(self, url):
main = url.partition('?')[0]
return main + '?printingPage=true'
author = 'RufusA'
simultaneous_downloads= 1
max_articles_per_feed = 50
extra_css = 'h1 {text-align: left;}'
remove_tags = [ dict(name='ul', attrs={'class':'article-icons-links'}) ]
remove_tags_after = dict(name='h3', attrs={'class':'social-links-title'})
remove_tags_before = dict(name='div', attrs={'id':'content'})
no_stylesheets = True
feeds = [
(u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
(u'News', u'http://www.dailymail.co.uk/news/index.rss'),
(u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
(u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
(u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
(u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
(u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
(u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
(u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
(u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
(u'Travel', u'http://www.dailymail.co.uk/travel/index.rss')]
def print_version(self, url):
main = url.partition('?')[0]
return main + '?printingPage=true'

View File

@@ -1,62 +1,62 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
danas.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Danas(BasicNewsRecipe):
title = 'Danas'
__author__ = 'Darko Miletic'
description = 'Vesti'
publisher = 'Danas d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
danas.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Danas(BasicNewsRecipe):
title = 'Danas'
__author__ = 'Darko Miletic'
description = 'Vesti'
publisher = 'Danas d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
language = 'sr'
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [
dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'id':'comments'})
,dict(name=['object','link'])
]
feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')]
def preprocess_html(self, soup):
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [
dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'id':'comments'})
,dict(name=['object','link'])
]
feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')]
def preprocess_html(self, soup):
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
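
The table-flattening idiom in preprocess_html above (rename table markup to div, drop presentational attributes) can be exercised in isolation. A minimal sketch against made-up markup and a subset of the attributes, using the BeautifulSoup 3 API that calibre bundles:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# made-up markup, for illustration only
soup = BeautifulSoup('<table width="100%"><tr><td style="color:red">text</td></tr></table>')
attribs = ['style', 'font', 'valign', 'colspan', 'width', 'height']
for item in soup.findAll(name=['table', 'tr', 'td']):
    item.name = 'div'             # flatten table markup for reader devices
    for attrib in attribs:
        if item.has_key(attrib):  # BS3 tags support has_key
            del item[attrib]
print soup
# -> <div><div><div>text</div></div></div>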


@@ -1,76 +1,76 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DeGentenaarOnline(BasicNewsRecipe):
title = 'De Gentenaar Online'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'De Gentenaar'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DeGentenaarOnline(BasicNewsRecipe):
title = 'De Gentenaar Online'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'De Gentenaar'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'nl'
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
remove_tags = [dict(name=['embed','object'])]
feeds = [
(u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws' )
,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland' )
,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Algemeen' , u'http://feeds.nieuwsblad.be/life/algemeen' )
,(u'Film' , u'http://feeds.nieuwsblad.be/life/film' )
,(u'Boek' , u'http://feeds.nieuwsblad.be/life/boeken' )
,(u'Muziek' , u'http://feeds.nieuwsblad.be/life/muziek' )
,(u'Podium' , u'http://feeds.nieuwsblad.be/life/podium' )
,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv' )
]
def print_version(self, url):
return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('span'):
item.name='div'
if item.has_key('id') and item['id'] == 'lblArticleTitle':
item.name='h3'
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
remove_tags = [dict(name=['embed','object'])]
feeds = [
(u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws' )
,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland' )
,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Algemeen' , u'http://feeds.nieuwsblad.be/life/algemeen' )
,(u'Film' , u'http://feeds.nieuwsblad.be/life/film' )
,(u'Boek' , u'http://feeds.nieuwsblad.be/life/boeken' )
,(u'Muziek' , u'http://feeds.nieuwsblad.be/life/muziek' )
,(u'Podium' , u'http://feeds.nieuwsblad.be/life/podium' )
,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv' )
]
def print_version(self, url):
return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('span'):
item.name='div'
if item.has_key('id') and item['id'] == 'lblArticleTitle':
item.name='h3'
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
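
get_article_url above prefers the feed item's guid, which on nieuwsblad.be carries the canonical article link, and print_version then swaps the detail view for the print view. A standalone sketch; the article id is hypothetical and a plain dict stands in for the feed item:

def get_article_url(article):
    # Fall back to None when the entry has no guid (the item is skipped).
    return article.get('guid', None)

def print_version(url):
    return url.replace('/Detail.aspx?articleid', '/PrintArticle.aspx?ArticleID')

# hypothetical feed item; a dict stands in for calibre's article object
item = {'guid': 'http://www.nieuwsblad.be/Detail.aspx?articleid=123'}
print print_version(get_article_url(item))
# -> http://www.nieuwsblad.be/PrintArticle.aspx?ArticleID=123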


@@ -1,69 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
__author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
__author__ = 'Gerhard Aigner'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'utf-8'
language = 'de'
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
(u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
(u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
(u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
(u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
(u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
]
def print_version(self, url):
return url.replace('?id=', 'txt/?id=')
def get_article_url(self, article):
        '''if the article links to an index page (ressort) or a picture gallery
(ansichtssache), don't add it'''
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None
return article.link
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
(u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
(u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
(u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
(u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
(u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
(u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
(u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
(u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
(u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
(u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
preprocess_regexps = [
(re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
]
def print_version(self, url):
return url.replace('?id=', 'txt/?id=')
def get_article_url(self, article):
        '''if the article links to an index page (ressort) or a picture gallery
(ansichtssache), don't add it'''
if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
return None
return article.link
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
return soup
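
Returning None from get_article_url tells the feed downloader to drop the item, which is how the recipe above filters out section indexes and picture galleries. A sketch with a minimal stand-in for the article object calibre passes in; the URLs are hypothetical:

class Article(object):
    # minimal stand-in for the feed article object passed by calibre
    def __init__(self, link, title):
        self.link, self.title = link, title

def get_article_url(article):
    if (article.link.count('ressort') > 0
            or article.title.lower().count('ansichtssache') > 0):
        return None  # index page or picture gallery: skip it
    return article.link

# hypothetical feed entries
print get_article_url(Article('http://derstandard.at/?id=1', 'Politik'))
# -> http://derstandard.at/?id=1
print get_article_url(Article('http://derstandard.at/?ressort=sport', 'Sport'))
# -> None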


@@ -1,72 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Diagonales(BasicNewsRecipe):
title = 'Diagonales'
__author__ = 'Darko Miletic'
description = 'El nuevo diario de La Plata'
publisher = 'ElArgentino.com'
category = 'news, politics, Argentina, La Plata'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Diagonales(BasicNewsRecipe):
title = 'Diagonales'
__author__ = 'Darko Miletic'
description = 'El nuevo diario de La Plata'
publisher = 'ElArgentino.com'
category = 'news, politics, Argentina, La Plata'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es'
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/122/Diagonales.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
remove_tags = [dict(name='link')]
feeds = [(u'Articulos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=122&Content-Type=text/xml&ChannelDesc=Diagonales')]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):
base, sep, rest = url.rpartition('?Id=')
img, sep2, rrest = rest.partition('&')
return base + sep + img
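
image_url_processor above keeps only the Id parameter of the image URL; get_cover_url then re-appends a height hint of its own. The string surgery in isolation; the image URL is hypothetical:

def image_url_processor(baseurl, url):
    base, sep, rest = url.rpartition('?Id=')  # split off the query
    img, sep2, rrest = rest.partition('&')    # keep only the Id value
    return base + sep + img

# hypothetical image URL in elargentino.com's style
print image_url_processor(None, 'http://www.elargentino.com/Image.aspx?Id=55&width=300')
# -> http://www.elargentino.com/Image.aspx?Id=55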


@@ -1,73 +1,73 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse'
__author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'ISO-8859-1'
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.diepresse.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class DiePresseRecipe(BasicNewsRecipe):
title = u'diePresse'
__author__ = 'Gerhard Aigner'
description = u'DiePresse.com - Die Online-Ausgabe der Österreichischen Tageszeitung Die Presse.'
publisher ='DiePresse.com'
category = 'news, politics, nachrichten, Austria'
use_embedded_content = False
remove_empty_feeds = True
lang = 'de-AT'
no_stylesheets = True
encoding = 'ISO-8859-1'
language = 'de'
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
]
remove_tags = [dict(name='hr'),
dict(name='br'),
dict(name='small'),
dict(name='img'),
dict(name='div', attrs={'class':'textnavi'}),
dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'),
(u'Panorama', u'http://diepresse.com/rss/Panorama'),
(u'Sport', u'http://diepresse.com/rss/Sport'),
(u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'),
(u'Spectrum', u'http://diepresse.com/rss/Spectrum'),
(u'Meinung', u'http://diepresse.com/rss/Meinung')]
def print_version(self, url):
return url.replace('home','text/home')
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
recursions = 0
oldest_article = 1
max_articles_per_feed = 100
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [
(re.compile(r'Textversion', re.DOTALL), lambda match: ''),
]
remove_tags = [dict(name='hr'),
dict(name='br'),
dict(name='small'),
dict(name='img'),
dict(name='div', attrs={'class':'textnavi'}),
dict(name='h1', attrs={'class':'titel'}),
dict(name='a', attrs={'class':'print'}),
dict(name='div', attrs={'class':'hline'})]
feeds = [(u'Politik', u'http://diepresse.com/rss/Politik'),
(u'Wirtschaft', u'http://diepresse.com/rss/Wirtschaft'),
(u'Europa', u'http://diepresse.com/rss/EU'),
(u'Panorama', u'http://diepresse.com/rss/Panorama'),
(u'Sport', u'http://diepresse.com/rss/Sport'),
(u'Kultur', u'http://diepresse.com/rss/Kultur'),
(u'Leben', u'http://diepresse.com/rss/Leben'),
(u'Tech', u'http://diepresse.com/rss/Tech'),
(u'Wissenschaft', u'http://diepresse.com/rss/Science'),
(u'Bildung', u'http://diepresse.com/rss/Bildung'),
(u'Gesundheit', u'http://diepresse.com/rss/Gesundheit'),
(u'Recht', u'http://diepresse.com/rss/Recht'),
(u'Spectrum', u'http://diepresse.com/rss/Spectrum'),
(u'Meinung', u'http://diepresse.com/rss/Meinung')]
def print_version(self, url):
return url.replace('home','text/home')
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
return soup


@@ -1,69 +1,69 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevniavaz.ba
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class DnevniAvaz(BasicNewsRecipe):
title = 'Dnevni Avaz'
__author__ = 'Darko Miletic'
description = 'Latest news from Bosnia'
publisher = 'Dnevni Avaz'
category = 'news, politics, Bosnia and Herzegovina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
remove_javascript = True
cover_url = 'http://www.dnevniavaz.ba/img/logo.gif'
lang = 'bs-BA'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevniavaz.ba
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DnevniAvaz(BasicNewsRecipe):
title = 'Dnevni Avaz'
__author__ = 'Darko Miletic'
description = 'Latest news from Bosnia'
publisher = 'Dnevni Avaz'
category = 'news, politics, Bosnia and Herzegovina'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
remove_javascript = True
cover_url = 'http://www.dnevniavaz.ba/img/logo.gif'
lang = 'bs-BA'
language = 'bs'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':['fullarticle-title','fullarticle-leading','fullarticle-date','fullarticle-text','articleauthor']})]
remove_tags = [dict(name=['object','link','base'])]
feeds = [
(u'Najnovije' , u'http://www.dnevniavaz.ba/rss/novo' )
,(u'Najpopularnije', u'http://www.dnevniavaz.ba/rss/popularno')
]
def replace_tagname(self,soup,tagname,tagid,newtagname):
headtag = soup.find(tagname,attrs={'id':tagid})
if headtag:
headtag.name = newtagname
return
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
self.replace_tagname(soup,'div','fullarticle-title' ,'h1')
self.replace_tagname(soup,'div','fullarticle-leading','h3')
self.replace_tagname(soup,'div','fullarticle-date' ,'h5')
return self.adeify_images(soup)
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':['fullarticle-title','fullarticle-leading','fullarticle-date','fullarticle-text','articleauthor']})]
remove_tags = [dict(name=['object','link','base'])]
feeds = [
(u'Najnovije' , u'http://www.dnevniavaz.ba/rss/novo' )
,(u'Najpopularnije', u'http://www.dnevniavaz.ba/rss/popularno')
]
def replace_tagname(self,soup,tagname,tagid,newtagname):
headtag = soup.find(tagname,attrs={'id':tagid})
if headtag:
headtag.name = newtagname
return
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
self.replace_tagname(soup,'div','fullarticle-title' ,'h1')
self.replace_tagname(soup,'div','fullarticle-leading','h3')
self.replace_tagname(soup,'div','fullarticle-date' ,'h5')
return self.adeify_images(soup)
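
replace_tagname above promotes the title, lead and date divs to real headings, which helps the conversion pipeline's structure detection. The helper in isolation, against made-up markup (BeautifulSoup 3 lets you assign to a tag's name):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

def replace_tagname(soup, tagname, tagid, newtagname):
    headtag = soup.find(tagname, attrs={'id': tagid})
    if headtag:
        headtag.name = newtagname  # rename the tag in place

# made-up markup, for illustration only
soup = BeautifulSoup('<div id="fullarticle-title">Naslov</div>')
replace_tagname(soup, 'div', 'fullarticle-title', 'h1')
print soup
# -> <h1 id="fullarticle-title">Naslov</h1>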


@@ -1,75 +1,75 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevnik.hr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class DnevnikCro(BasicNewsRecipe):
title = 'Dnevnik - Hr'
__author__ = 'Darko Miletic'
description = "Vijesti iz Hrvatske"
publisher = 'Dnevnik.hr'
category = 'news, politics, Croatia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
dnevnik.hr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DnevnikCro(BasicNewsRecipe):
title = 'Dnevnik - Hr'
__author__ = 'Darko Miletic'
description = "Vijesti iz Hrvatske"
publisher = 'Dnevnik.hr'
category = 'news, politics, Croatia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'hr'
lang = 'hr-HR'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
remove_tags = [
dict(name=['object','link','embed'])
,dict(name='div', attrs={'class':'menu'})
,dict(name='div', attrs={'id':'video'})
]
remove_tags_after = dict(name='div', attrs={'id':'content'})
feeds = [(u'Vijesti', u'http://rss.dnevnik.hr/index.rss')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
lang = 'hr-HR'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
remove_tags = [
dict(name=['object','link','embed'])
,dict(name='div', attrs={'class':'menu'})
,dict(name='div', attrs={'id':'video'})
]
remove_tags_after = dict(name='div', attrs={'id':'content'})
feeds = [(u'Vijesti', u'http://rss.dnevnik.hr/index.rss')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)


@@ -1,59 +1,59 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
e-novine.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class E_novine(BasicNewsRecipe):
title = 'E-Novine'
__author__ = 'Darko Miletic'
description = 'News from Serbia'
publisher = 'E-novine'
category = 'news, politics, Balkans'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
e-novine.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class E_novine(BasicNewsRecipe):
title = 'E-Novine'
__author__ = 'Darko Miletic'
description = 'News from Serbia'
publisher = 'E-novine'
category = 'news, politics, Balkans'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
language = 'sr'
lang = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
remove_tags = [dict(name=['object','link','embed','iframe'])]
feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
for item in soup.findAll(style=True):
del item['style']
ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
if ftag:
it = ftag.div
it.extract()
ftag.div.extract()
ftag.insert(0,it)
return soup
lang = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
remove_tags = [dict(name=['object','link','embed','iframe'])]
feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
for item in soup.findAll(style=True):
del item['style']
ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
if ftag:
it = ftag.div
it.extract()
ftag.div.extract()
ftag.insert(0,it)
return soup
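
The ftag shuffle at the end of preprocess_html is easy to misread: it pulls out the first inner div, extracts the div that has become first (the original second child), and reinserts the saved one. Net effect: the second child div is deleted. A self-contained demonstration with placeholder content:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# placeholder markup, for illustration only
soup = BeautifulSoup('<div id="x"><div>keep</div><div>drop</div><div>rest</div></div>')
ftag = soup.find('div', attrs={'id': 'x'})
it = ftag.div        # first child div
it.extract()         # take it out temporarily
ftag.div.extract()   # the former second div is now first: drop it
ftag.insert(0, it)   # put the saved div back in front
print ftag
# -> <div id="x"><div>keep</div><div>rest</div></div>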


@@ -1,32 +1,32 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
EcoGeek.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class EcoGeek(BasicNewsRecipe):
title = 'EcoGeek'
__author__ = 'Darko Miletic'
description = 'EcoGeek - Technology for the Environment Blog Feed'
publisher = 'EcoGeek'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
EcoGeek.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class EcoGeek(BasicNewsRecipe):
title = 'EcoGeek'
__author__ = 'Darko Miletic'
description = 'EcoGeek - Technology for the Environment Blog Feed'
publisher = 'EcoGeek'
language = 'en'
category = 'news, ecology, blog'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds2.feedburner.com/EcoGeek')]
category = 'news, ecology, blog'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u'Posts', u'http://feeds2.feedburner.com/EcoGeek')]
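
html2epub_options above is just a newline-separated snippet of recipe settings; printing it makes the embedded quoting easier to audit. Using the values from this recipe:

publisher = 'EcoGeek'
description = 'EcoGeek - Technology for the Environment Blog Feed'
category = 'news, ecology, blog'
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + \
                    '"\ntags="' + category + '"'
print html2epub_options
# publisher="EcoGeek"
# comments="EcoGeek - Technology for the Environment Blog Feed"
# tags="news, ecology, blog"

Recipes elsewhere in this commit express the same metadata via the conversion_options dict, which avoids the manual quoting.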


@@ -1,62 +1,61 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
emol.com
'''
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
emol.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElMercurio(BasicNewsRecipe):
title = 'El Mercurio online'
__author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile'
publisher = 'El Mercurio'
category = 'news, politics, Chile'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [
class ElMercurio(BasicNewsRecipe):
title = 'El Mercurio online'
__author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile'
publisher = 'El Mercurio'
category = 'news, politics, Chile'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'class':'despliegue-txt_750px'})
,dict(name='div', attrs={'id':'div_cuerpo_participa'})
]
remove_tags = [
dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'})
,dict(name='div', attrs={'id':['div_centro_dn_opc','div_cabezera','div_secciones','div_contenidos','div_pie','nav']})
]
feeds = [
(u'Noticias de ultima hora', u'http://www.emol.com/rss20/rss.asp?canal=0')
,(u'Nacional', u'http://www.emol.com/rss20/rss.asp?canal=1')
,(u'Mundo', u'http://www.emol.com/rss20/rss.asp?canal=2')
,(u'Deportes', u'http://www.emol.com/rss20/rss.asp?canal=4')
,(u'Magazine', u'http://www.emol.com/rss20/rss.asp?canal=6')
,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5')
,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7')
]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'class':'despliegue-txt_750px'})
,dict(name='div', attrs={'id':'div_cuerpo_participa'})
]
remove_tags = [
dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'})
,dict(name='div', attrs={'id':['div_centro_dn_opc','div_cabezera','div_secciones','div_contenidos','div_pie','nav']})
]
feeds = [
(u'Noticias de ultima hora', u'http://www.emol.com/rss20/rss.asp?canal=0')
,(u'Nacional', u'http://www.emol.com/rss20/rss.asp?canal=1')
,(u'Mundo', u'http://www.emol.com/rss20/rss.asp?canal=2')
,(u'Deportes', u'http://www.emol.com/rss20/rss.asp?canal=4')
,(u'Magazine', u'http://www.emol.com/rss20/rss.asp?canal=6')
,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5')
,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7')
]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = 'es'


@@ -1,66 +1,66 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
eluniversal.com.mx
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElUniversal(BasicNewsRecipe):
title = 'El Universal'
__author__ = 'Darko Miletic'
description = 'News from Mexico'
oldest_article = 1
max_articles_per_feed = 100
publisher = 'El Universal'
category = 'news, politics, Mexico'
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
eluniversal.com.mx
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElUniversal(BasicNewsRecipe):
title = 'El Universal'
__author__ = 'Darko Miletic'
description = 'News from Mexico'
oldest_article = 1
max_articles_per_feed = 100
publisher = 'El Universal'
category = 'news, politics, Mexico'
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
language = 'es'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [dict(name='link')]
feeds = [
(u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
,(u'Mundo' , u'http://www.eluniversal.com.mx/rss/mundo.xml' )
,(u'Mexico' , u'http://www.eluniversal.com.mx/rss/mexico.xml' )
,(u'Estados' , u'http://www.eluniversal.com.mx/rss/estados.xml' )
,(u'Finanzas' , u'http://www.eluniversal.com.mx/rss/finanzas.xml' )
,(u'Deportes' , u'http://www.eluniversal.com.mx/rss/deportes.xml' )
,(u'Espectaculos' , u'http://www.eluniversal.com.mx/rss/espectaculos.xml' )
,(u'Cultura' , u'http://www.eluniversal.com.mx/rss/cultura.xml' )
,(u'Ciencia' , u'http://www.eluniversal.com.mx/rss/ciencia.xml' )
,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' )
,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' )
]
def print_version(self, url):
return url.replace('/notas/','/notas/vi_')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-MX"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(font=True):
del item['font']
for item in soup.findAll(face=True):
del item['face']
for item in soup.findAll(helvetica=True):
del item['helvetica']
return soup
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
remove_tags = [dict(name='link')]
feeds = [
(u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
,(u'Mundo' , u'http://www.eluniversal.com.mx/rss/mundo.xml' )
,(u'Mexico' , u'http://www.eluniversal.com.mx/rss/mexico.xml' )
,(u'Estados' , u'http://www.eluniversal.com.mx/rss/estados.xml' )
,(u'Finanzas' , u'http://www.eluniversal.com.mx/rss/finanzas.xml' )
,(u'Deportes' , u'http://www.eluniversal.com.mx/rss/deportes.xml' )
,(u'Espectaculos' , u'http://www.eluniversal.com.mx/rss/espectaculos.xml' )
,(u'Cultura' , u'http://www.eluniversal.com.mx/rss/cultura.xml' )
,(u'Ciencia' , u'http://www.eluniversal.com.mx/rss/ciencia.xml' )
,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' )
,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' )
]
def print_version(self, url):
return url.replace('/notas/','/notas/vi_')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-MX"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(font=True):
del item['font']
for item in soup.findAll(face=True):
del item['face']
for item in soup.findAll(helvetica=True):
del item['helvetica']
return soup
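
The attribute sweeps in preprocess_html above rely on BeautifulSoup's keyword filtering: findAll(style=True) matches any tag that merely has a style attribute, regardless of its value. In isolation, against made-up markup:

from calibre.ebooks.BeautifulSoup import BeautifulSoup

# made-up markup, for illustration only
soup = BeautifulSoup('<p style="color:red" font="arial">hola</p>')
for item in soup.findAll(style=True):
    del item['style']  # True matches mere presence of the attribute
for item in soup.findAll(font=True):
    del item['font']
print soup
# -> <p>hola</p>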


@@ -1,62 +1,62 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElArgentino(BasicNewsRecipe):
title = 'ElArgentino.com'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'ElArgentino.com'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElArgentino(BasicNewsRecipe):
title = 'ElArgentino.com'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'ElArgentino.com'
category = 'news, politics, Argentina'
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
language = 'es'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [
dict(name='div', attrs={'id':'noprint' })
,dict(name='div', attrs={'class':'encabezadoImprimir'})
,dict(name='a' , attrs={'target':'_blank' })
]
feeds = [
(u'Portada' , u'http://www.elargentino.com/Highlights.aspx?Content-Type=text/xml&ChannelDesc=Home' )
,(u'Pais' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=112&Content-Type=text/xml&ChannelDesc=Pa%C3%ADs' )
,(u'Economia' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=107&Content-Type=text/xml&ChannelDesc=Econom%C3%ADa' )
,(u'Mundo' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=113&Content-Type=text/xml&ChannelDesc=Mundo' )
,(u'Tecnologia' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=118&Content-Type=text/xml&ChannelDesc=Tecnolog%C3%ADa' )
,(u'Espectaculos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=114&Content-Type=text/xml&ChannelDesc=Espect%C3%A1culos')
,(u'Deportes' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=106&Content-Type=text/xml&ChannelDesc=Deportes' )
,(u'Sociedad' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=109&Content-Type=text/xml&ChannelDesc=Sociedad' )
,(u'Entrevistas' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=115&Content-Type=text/xml&ChannelDesc=Entrevistas' )
]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [
dict(name='div', attrs={'id':'noprint' })
,dict(name='div', attrs={'class':'encabezadoImprimir'})
,dict(name='a' , attrs={'target':'_blank' })
]
feeds = [
(u'Portada' , u'http://www.elargentino.com/Highlights.aspx?Content-Type=text/xml&ChannelDesc=Home' )
,(u'Pais' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=112&Content-Type=text/xml&ChannelDesc=Pa%C3%ADs' )
,(u'Economia' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=107&Content-Type=text/xml&ChannelDesc=Econom%C3%ADa' )
,(u'Mundo' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=113&Content-Type=text/xml&ChannelDesc=Mundo' )
,(u'Tecnologia' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=118&Content-Type=text/xml&ChannelDesc=Tecnolog%C3%ADa' )
,(u'Espectaculos', u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=114&Content-Type=text/xml&ChannelDesc=Espect%C3%A1culos')
,(u'Deportes' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=106&Content-Type=text/xml&ChannelDesc=Deportes' )
,(u'Sociedad' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=109&Content-Type=text/xml&ChannelDesc=Sociedad' )
,(u'Entrevistas' , u'http://www.elargentino.com/Highlights.aspx?ParentType=Section&ParentId=115&Content-Type=text/xml&ChannelDesc=Entrevistas' )
]
def print_version(self, url):
main, sep, article_part = url.partition('/nota-')
article_id, rsep, rrest = article_part.partition('-')
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup


@@ -1,72 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
cronista.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElCronista(BasicNewsRecipe):
title = 'El Cronista'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina'
oldest_article = 2
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
cronista.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElCronista(BasicNewsRecipe):
title = 'El Cronista'
__author__ = 'Darko Miletic'
description = 'Noticias de Argentina'
oldest_article = 2
language = 'es'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Argentina'
, '--publisher' , title
]
keep_only_tags = [
dict(name='table', attrs={'width':'100%' })
,dict(name='h1' , attrs={'class':'Arialgris16normal'})
]
remove_tags = [dict(name='a', attrs={'class':'Arialazul12'})]
feeds = [
(u'Economia' , u'http://www.cronista.com/adjuntos/8/rss/Economia_EI.xml' )
,(u'Negocios' , u'http://www.cronista.com/adjuntos/8/rss/negocios_EI.xml' )
,(u'Ultimo momento' , u'http://www.cronista.com/adjuntos/8/rss/ultimo_momento.xml' )
,(u'Finanzas y Mercados' , u'http://www.cronista.com/adjuntos/8/rss/Finanzas_Mercados_EI.xml' )
,(u'Financial Times' , u'http://www.cronista.com/adjuntos/8/rss/FT_EI.xml' )
,(u'Opinion edicion impresa' , u'http://www.cronista.com/adjuntos/8/rss/opinion_edicion_impresa.xml' )
,(u'Socialmente Responsables', u'http://www.cronista.com/adjuntos/8/rss/Socialmente_Responsables.xml')
,(u'Asuntos Legales' , u'http://www.cronista.com/adjuntos/8/rss/asuntoslegales.xml' )
,(u'IT Business' , u'http://www.cronista.com/adjuntos/8/rss/itbusiness.xml' )
,(u'Management y RR.HH.' , u'http://www.cronista.com/adjuntos/8/rss/management.xml' )
,(u'Inversiones Personales' , u'http://www.cronista.com/adjuntos/8/rss/inversionespersonales.xml' )
]
def print_version(self, url):
main, sep, rest = url.partition('.com/notas/')
article_id, lsep, rrest = rest.partition('-')
return 'http://www.cronista.com/interior/index.php?p=imprimir_nota&idNota=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
soup.head.base.extract()
htext = soup.find('h1',attrs={'class':'Arialgris16normal'})
htext.name = 'p'
soup.prettify()
return soup
def get_cover_url(self):
cover_url = None
index = 'http://www.cronista.com/contenidos/'
soup = self.index_to_soup(index + 'ee.html')
link_item = soup.find('a',attrs={'href':"javascript:Close()"})
if link_item:
cover_url = index + link_item.img['src']
return cover_url
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Argentina'
, '--publisher' , title
]
keep_only_tags = [
dict(name='table', attrs={'width':'100%' })
,dict(name='h1' , attrs={'class':'Arialgris16normal'})
]
remove_tags = [dict(name='a', attrs={'class':'Arialazul12'})]
feeds = [
(u'Economia' , u'http://www.cronista.com/adjuntos/8/rss/Economia_EI.xml' )
,(u'Negocios' , u'http://www.cronista.com/adjuntos/8/rss/negocios_EI.xml' )
,(u'Ultimo momento' , u'http://www.cronista.com/adjuntos/8/rss/ultimo_momento.xml' )
,(u'Finanzas y Mercados' , u'http://www.cronista.com/adjuntos/8/rss/Finanzas_Mercados_EI.xml' )
,(u'Financial Times' , u'http://www.cronista.com/adjuntos/8/rss/FT_EI.xml' )
,(u'Opinion edicion impresa' , u'http://www.cronista.com/adjuntos/8/rss/opinion_edicion_impresa.xml' )
,(u'Socialmente Responsables', u'http://www.cronista.com/adjuntos/8/rss/Socialmente_Responsables.xml')
,(u'Asuntos Legales' , u'http://www.cronista.com/adjuntos/8/rss/asuntoslegales.xml' )
,(u'IT Business' , u'http://www.cronista.com/adjuntos/8/rss/itbusiness.xml' )
,(u'Management y RR.HH.' , u'http://www.cronista.com/adjuntos/8/rss/management.xml' )
,(u'Inversiones Personales' , u'http://www.cronista.com/adjuntos/8/rss/inversionespersonales.xml' )
]
def print_version(self, url):
main, sep, rest = url.partition('.com/notas/')
article_id, lsep, rrest = rest.partition('-')
return 'http://www.cronista.com/interior/index.php?p=imprimir_nota&idNota=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag)
soup.head.base.extract()
htext = soup.find('h1',attrs={'class':'Arialgris16normal'})
htext.name = 'p'
soup.prettify()
return soup
def get_cover_url(self):
cover_url = None
index = 'http://www.cronista.com/contenidos/'
soup = self.index_to_soup(index + 'ee.html')
link_item = soup.find('a',attrs={'href':"javascript:Close()"})
if link_item:
cover_url = index + link_item.img['src']
return cover_url
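
print_version above digs the numeric article id out of the URL (the token between '/notas/' and the first '-') and rebuilds the print URL from it. A standalone check with a hypothetical article URL:

def print_version(url):
    main, sep, rest = url.partition('.com/notas/')
    article_id, lsep, rrest = rest.partition('-')
    return 'http://www.cronista.com/interior/index.php?p=imprimir_nota&idNota=' + article_id

# hypothetical article URL, for illustration only
print print_version('http://www.cronista.com/notas/99123-titulo-de-la-nota')
# -> http://www.cronista.com/interior/index.php?p=imprimir_nota&idNota=99123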


@@ -1,61 +1,60 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElMundo(BasicNewsRecipe):
title = 'El Mundo'
__author__ = 'Darko Miletic'
description = 'News from Spain'
publisher = 'El Mundo'
category = 'news, politics, Spain'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'iso8859_15'
cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif'
remove_javascript = True
html2lrf_options = [
class ElMundo(BasicNewsRecipe):
title = 'El Mundo'
__author__ = 'Darko Miletic'
description = 'News from Spain'
publisher = 'El Mundo'
category = 'news, politics, Spain'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'iso8859_15'
cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif'
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'id':['bloqueprincipal','noticia']})
,dict(name='div', attrs={'class':['contenido_noticia_01']})
]
remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' })
,dict(name='ul', attrs={'class':'herramientas' })
,dict(name=['object','link'])
]
feeds = [
(u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' )
,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' )
,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' )
,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' )
,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' )
,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26')
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'id':['bloqueprincipal','noticia']})
,dict(name='div', attrs={'class':['contenido_noticia_01']})
]
remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' })
,dict(name='ul', attrs={'class':'herramientas' })
,dict(name=['object','link'])
]
feeds = [
(u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' )
,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' )
,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' )
,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' )
,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' )
,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26')
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
language = 'es'
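
The html2epub_options value built above is plain string concatenation: one key="value" pair per line, handed to the epub converter. A quick sketch of what it expands to for this recipe's metadata:

publisher = 'El Mundo'
description = 'News from Spain'
category = 'news, politics, Spain'
opts = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
print(opts)
# publisher="El Mundo"
# comments="News from Spain"
# tags="news, politics, Spain"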

View File

@ -1,56 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.cat
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ElPeriodico_cat(BasicNewsRecipe):
title = 'El Periodico de Catalunya'
__author__ = 'Darko Miletic'
description = 'Noticias desde Catalunya'
publisher = 'elperiodico.cat'
category = 'news, politics, Spain, Catalunya'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'cp1252'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.cat
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ElPeriodico_cat(BasicNewsRecipe):
title = 'El Periodico de Catalunya'
__author__ = 'Darko Miletic'
description = 'Noticias desde Catalunya'
publisher = 'elperiodico.cat'
category = 'news, politics, Spain, Catalunya'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'cp1252'
language = 'ca'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u"Tota l'edició", u'http://www.elperiodico.cat/rss.asp?id=46')]
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
,dict(name='div', attrs={'id':'inferiores'})
]
def print_version(self, url):
return url.replace('/default.asp?','/print.asp?')
def preprocess_html(self, soup):
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u"Tota l'edició", u'http://www.elperiodico.cat/rss.asp?id=46')]
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
,dict(name='div', attrs={'id':'inferiores'})
]
def print_version(self, url):
return url.replace('/default.asp?','/print.asp?')
def preprocess_html(self, soup):
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
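
Several of these recipes inject a charset <meta> element through the bundled BeautifulSoup 3 Tag constructor, which takes the soup, the tag name, and a list of attribute pairs. A standalone sketch of the idea (assuming calibre's bundled BeautifulSoup 3 API):

from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

soup = BeautifulSoup('<html><head></head><body><p>hola</p></body></html>')
mcharset = Tag(soup, 'meta', [("http-equiv", "Content-Type"),
                              ("content", "text/html; charset=utf-8")])
soup.head.insert(0, mcharset)  # the meta tag becomes head's first child
print(soup.head)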

View File

@ -1,56 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ElPeriodico_esp(BasicNewsRecipe):
title = 'El Periodico de Catalunya'
__author__ = 'Darko Miletic'
description = 'Noticias desde Catalunya'
publisher = 'elperiodico.com'
category = 'news, politics, Spain, Catalunya'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'cp1252'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elperiodico.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ElPeriodico_esp(BasicNewsRecipe):
title = 'El Periodico de Catalunya'
__author__ = 'Darko Miletic'
description = 'Noticias desde Catalunya'
publisher = 'elperiodico.com'
category = 'news, politics, Spain, Catalunya'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'cp1252'
language = 'es'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u"Toda la edición", u'http://www.elperiodico.com/rss.asp?id=46')]
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
,dict(name='div', attrs={'id':'inferiores'})
]
def print_version(self, url):
return url.replace('/default.asp?','/print.asp?')
def preprocess_html(self, soup):
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [(u"Toda la edición", u'http://www.elperiodico.com/rss.asp?id=46')]
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='ul',attrs={'class':'herramientasDeNoticia'})
,dict(name='div', attrs={'id':'inferiores'})
]
def print_version(self, url):
return url.replace('/default.asp?','/print.asp?')
def preprocess_html(self, soup):
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -1,53 +1,53 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tiempo.hn
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class ElTiempoHn(BasicNewsRecipe):
title = 'El Tiempo - Honduras'
__author__ = 'Darko Miletic'
description = 'Noticias de Honduras y mundo'
publisher = 'El Tiempo'
category = 'news, politics, Honduras'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tiempo.hn
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class ElTiempoHn(BasicNewsRecipe):
title = 'El Tiempo - Honduras'
__author__ = 'Darko Miletic'
description = 'Noticias de Honduras y mundo'
publisher = 'El Tiempo'
category = 'news, politics, Honduras'
oldest_article = 2
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
language = 'es'
lang = 'es-HN'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} img {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em}"'
remove_tags = [dict(name=['form','object','embed','base'])]
keep_only_tags = [dict(name='td' , attrs={'id':'mainbodycont'})]
feeds = [(u'Noticias', u'http://www.tiempo.hn/index.php?format=feed&type=rss')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)
lang = 'es-HN'
direction = 'ltr'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True\npretty_print=True\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} img {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em}"'
remove_tags = [dict(name=['form','object','embed','base'])]
keep_only_tags = [dict(name='td' , attrs={'id':'mainbodycont'})]
feeds = [(u'Noticias', u'http://www.tiempo.hn/index.php?format=feed&type=rss')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@ -1,32 +1,31 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
engadget.com
'''
import string,re
from calibre.web.feeds.news import BasicNewsRecipe
class Engadget(BasicNewsRecipe):
title = u'Engadget'
__author__ = 'Darko Miletic'
description = 'Tech news'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
engadget.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Engadget(BasicNewsRecipe):
title = u'Engadget'
__author__ = 'Darko Miletic'
description = 'Tech news'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
keep_only_tags = [ dict(name='div', attrs={'class':'post'}) ]
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':'postmeta'})
,dict(name='div', attrs={'class':'quigoads'})
]
feeds = [ (u'Posts', u'http://www.engadget.com/rss.xml')]
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
keep_only_tags = [ dict(name='div', attrs={'class':'post'}) ]
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':'postmeta'})
,dict(name='div', attrs={'class':'quigoads'})
]
feeds = [ (u'Posts', u'http://www.engadget.com/rss.xml')]
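
The visible change here is dropping the unused import string,re, which is exactly the class of problem the new check command reports. A rough sketch of how pyflakes flags it, using the same checker API the command is built on ('demo.py' is a hypothetical filename):

from pyflakes import checker, ast

source = 'import string,re\nprint("neither module is used")\n'
tree = ast.parse(source, 'demo.py')
w = checker.Checker(tree, 'demo.py')
for msg in w.messages:
    print(msg)  # e.g. demo.py:1: 'string' imported but unused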

View File

@ -1,63 +1,63 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.esquire.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Esquire(BasicNewsRecipe):
title = 'Esquire'
__author__ = 'Darko Miletic'
description = 'Esquire: Man at His Best'
publisher = 'Hearst Communications, Inc.'
category = 'magazine, men, women we love, style, the guide, sex, screen'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.esquire.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Esquire(BasicNewsRecipe):
title = 'Esquire'
__author__ = 'Darko Miletic'
description = 'Esquire: Man at His Best'
publisher = 'Hearst Communications, Inc.'
category = 'magazine, men, women we love, style, the guide, sex, screen'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
use_embedded_content = False
language = 'en'
lang = 'en-US'
cover_url = strftime('http://www.esquire.com/cm/esquire/cover-images/%Y_') + strftime('%m').strip('0') + '.jpg'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
remove_tags = [dict(name=['object','link','embed','iframe'])]
feeds = [
(u'Style' , u'http://www.esquire.com/style/rss/' )
,(u'Women' , u'http://www.esquire.com/women/rss/' )
,(u'Features' , u'http://www.esquire.com/features/rss/' )
,(u'Fiction' , u'http://www.esquire.com/fiction/rss/' )
,(u'Frontpage', u'http://www.esquire.com/rss/' )
]
def print_version(self, url):
rest = url.rpartition('?')[0]
article = rest.rpartition('/')[2]
return 'http://www.esquire.com/print-this/' + article
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
for item in soup.findAll(style=True):
del item['style']
return soup
lang = 'en-US'
cover_url = strftime('http://www.esquire.com/cm/esquire/cover-images/%Y_') + strftime('%m').strip('0') + '.jpg'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
remove_tags = [dict(name=['object','link','embed','iframe'])]
feeds = [
(u'Style' , u'http://www.esquire.com/style/rss/' )
,(u'Women' , u'http://www.esquire.com/women/rss/' )
,(u'Features' , u'http://www.esquire.com/features/rss/' )
,(u'Fiction' , u'http://www.esquire.com/fiction/rss/' )
,(u'Frontpage', u'http://www.esquire.com/rss/' )
]
def print_version(self, url):
rest = url.rpartition('?')[0]
article = rest.rpartition('/')[2]
return 'http://www.esquire.com/print-this/' + article
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
for item in soup.findAll(style=True):
del item['style']
return soup
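
One detail worth noting in the cover_url above: str.strip('0') removes zeros from both ends of the string, not just the leading one, so October is the one month it mangles. lstrip('0') only drops leading zeros:

print('09'.strip('0'))   # '9'  (intended)
print('10'.strip('0'))   # '1'  (October would resolve to the _1.jpg cover)
print('10'.lstrip('0'))  # '10' (only a leading zero is removed)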

View File

@ -1,58 +1,58 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
exiledonline.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Exiled(BasicNewsRecipe):
title = 'Exiled Online'
__author__ = 'Darko Miletic'
description = "Mankind's only alternative since 1997 - Formerly known as The eXile"
publisher = 'Exiled Online'
category = 'news, politics, international'
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
remove_javascript = True
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
exiledonline.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Exiled(BasicNewsRecipe):
title = 'Exiled Online'
__author__ = 'Darko Miletic'
description = "Mankind's only alternative since 1997 - Formerly known as The eXile"
publisher = 'Exiled Online'
category = 'news, politics, international'
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
remove_javascript = True
language = 'en'
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
html2lrf_options = [
'--comment' , description
, '--base-font-size', '10'
, '--category' , category
, '--publisher' , publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
remove_tags = [
dict(name=['object','link'])
,dict(name='div', attrs={'class':'info'})
,dict(name='div', attrs={'id':['comments','navig']})
]
feeds = [(u'Articles', u'http://exiledonline.com/feed/')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag)
return soup
def get_article_url(self, article):
raw = article.get('link', None)
final = raw + 'all/1/'
return final
cover_url = 'http://exiledonline.com/wp-content/themes/exiledonline_theme/images/header-sm.gif'
html2lrf_options = [
'--comment' , description
, '--base-font-size', '10'
, '--category' , category
, '--publisher' , publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'main'})]
remove_tags = [
dict(name=['object','link'])
,dict(name='div', attrs={'class':'info'})
,dict(name='div', attrs={'id':['comments','navig']})
]
feeds = [(u'Articles', u'http://exiledonline.com/feed/')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
mtag = '\n<meta http-equiv="Content-Language" content="en"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
soup.head.insert(0,mtag)
return soup
def get_article_url(self, article):
raw = article.get('link', None)
final = raw + 'all/1/'
return final
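
get_article_url above concatenates unconditionally, so a feed entry without a link makes raw + 'all/1/' raise a TypeError on None. A defensive variant (a sketch, not part of this commit):

def get_article_url(self, article):
    raw = article.get('link', None)
    if not raw:
        return None  # let calibre skip entries that carry no link
    return raw + 'all/1/'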

View File

@ -1,59 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.expansion.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Expansion(BasicNewsRecipe):
title = 'Diario Expansion'
__author__ = 'Darko Miletic'
description = 'Lider de informacion de mercados, economica y politica'
publisher = 'expansion.com'
category = 'news, politics, Spain'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'iso-8859-15'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.expansion.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Expansion(BasicNewsRecipe):
title = 'Diario Expansion'
__author__ = 'Darko Miletic'
description = 'Lider de informacion de mercados, economica y politica'
publisher = 'expansion.com'
category = 'news, politics, Spain'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
delay = 1
encoding = 'iso-8859-15'
language = 'es'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [
(u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
]
keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
]
remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
def preprocess_html(self, soup):
soup.html['dir' ] = self.direction
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [
(u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
,(u'Temas del dia' , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
]
keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
remove_tags = [
dict(name=['object','link','script'])
,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
]
remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
def preprocess_html(self, soup):
soup.html['dir' ] = self.direction
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
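
Both feeds above point at the same descarga.htm?data2=178 URL, so 'Temas del dia' duplicates 'Ultimas noticias'; the intended id for the second feed is not recoverable from this diff. A standalone sketch for spotting such duplicates in any recipe's feed list:

urls = [u'http://rss.expansion.com/rss/descarga.htm?data2=178',
        u'http://rss.expansion.com/rss/descarga.htm?data2=178']
dupes = set(u for u in urls if urls.count(u) > 1)
print(dupes)  # the duplicated feed URL, once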

View File

@ -1,55 +1,55 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.fastcompany.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class FastCompany(BasicNewsRecipe):
title = 'Fast Company'
__author__ = 'Darko Miletic'
description = 'Where ideas and people meet'
publisher = 'fastcompany.com'
category = 'news, technology, gadgets, games'
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
simultaneous_downloads = 1
encoding = 'utf-8'
lang = 'en'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.fastcompany.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class FastCompany(BasicNewsRecipe):
title = 'Fast Company'
__author__ = 'Darko Miletic'
description = 'Where ideas and people meet'
publisher = 'fastcompany.com'
category = 'news, technology, gadgets, games'
oldest_article = 15
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = True
simultaneous_downloads = 1
encoding = 'utf-8'
lang = 'en'
language = 'en'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [dict(name=['embed','object']), dict(name='div',attrs={'class':'feedflare'})]
feeds = [(u'All News', u'http://feeds.feedburner.com/fastcompany/headlines')]
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll('a'):
sp = item['href'].find('http://feedads.g.doubleclick.net/')
if sp != -1:
item.extract()
return self.adeify_images(soup)
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [dict(name=['embed','object']), dict(name='div',attrs={'class':'feedflare'})]
feeds = [(u'All News', u'http://feeds.feedburner.com/fastcompany/headlines')]
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
for item in soup.findAll('a'):
sp = item['href'].find('http://feedads.g.doubleclick.net/')
if sp != -1:
item.extract()
return self.adeify_images(soup)
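
The anchor loop above indexes item['href'] directly, which raises KeyError for anchors that have no href attribute. BeautifulSoup's findAll accepts href=True to match only tags carrying the attribute, so a safer sketch of the same ad-stripping is:

for item in soup.findAll('a', href=True):  # href=True skips bare anchors
    if 'http://feedads.g.doubleclick.net/' in item['href']:
        item.extract()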

View File

@ -1,51 +1,51 @@
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
'''
Profile to download FAZ.net
'''
from calibre.web.feeds.news import BasicNewsRecipe
class FazNet(BasicNewsRecipe):
title = 'FAZ NET'
__author__ = 'Kovid Goyal, Darko Miletic'
description = 'Frankfurter Allgemeine Zeitung'
publisher = 'FAZ Electronic Media GmbH'
category = 'news, politics, Germany'
use_embedded_content = False
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
'''
Profile to download FAZ.net
'''
from calibre.web.feeds.news import BasicNewsRecipe
class FazNet(BasicNewsRecipe):
title = 'FAZ NET'
__author__ = 'Kovid Goyal, Darko Miletic'
description = 'Frankfurter Allgemeine Zeitung'
publisher = 'FAZ Electronic Media GmbH'
category = 'news, politics, Germany'
use_embedded_content = False
language = 'de'
max_articles_per_feed = 30
no_stylesheets = True
encoding = 'utf-8'
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
remove_tags = [
dict(name=['object','link','embed','base'])
,dict(name='div', attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo']})
]
feeds = [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
def print_version(self, url):
article, sep, rest = url.partition('?')
return article.replace('.html', '~Afor~Eprint.html')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
return soup
max_articles_per_feed = 30
no_stylesheets = True
encoding = 'utf-8'
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'Article'})]
remove_tags = [
dict(name=['object','link','embed','base'])
,dict(name='div', attrs={'class':['LinkBoxModulSmall','ModulVerlagsInfo']})
]
feeds = [ ('FAZ.NET', 'http://www.faz.net/s/Rub/Tpl~Epartner~SRss_.xml') ]
def print_version(self, url):
article, sep, rest = url.partition('?')
return article.replace('.html', '~Afor~Eprint.html')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
return soup
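
del soup.body['onload'] assumes every article page sets an onload handler; BeautifulSoup raises KeyError when the attribute is absent. A tolerant variant (a sketch, assuming the bundled BeautifulSoup 3 Tag.get):

if soup.body is not None and soup.body.get('onload') is not None:
    del soup.body['onload']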

Some files were not shown because too many files have changed in this diff.