Sync to trunk.

John Schember 2009-06-23 16:54:39 -04:00
commit 970af577e5
22 changed files with 427 additions and 65 deletions

View File

@ -242,9 +242,9 @@ class KindleDXOutput(OutputProfile):
description = _('This profile is intended for the Amazon Kindle DX.')
# Screen size is a best guess
screen_size = (824, 1200)
screen_size = (744, 1022)
dpi = 150.0
comic_screen_size = (741, 1080)
comic_screen_size = (741, 1022)
@classmethod
def tags_to_string(cls, tags):

View File

@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html)
opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate:
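
A minimal sketch (hypothetical opts classes, not calibre's) of why the new value is read with getattr() and a 0.5 default: only the HTML and PDF input plugins declare pdf_line_length, so opts objects built by other input plugins lack the attribute and must keep the old hard-coded behaviour.

class OptsWithoutOption(object):
    # e.g. an input plugin that does not declare pdf_line_length
    pass

class OptsWithOption(object):
    # e.g. the HTML or PDF input plugin with the new OptionRecommendation
    pdf_line_length = 0.4

for opts in (OptsWithoutOption(), OptsWithOption()):
    # Mirrors the call above; falls back to the previous hard-coded 0.5
    print(getattr(opts, 'pdf_line_length', 0.5))
# prints 0.5, then 0.4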

View File

@ -160,9 +160,11 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess):
def __init__(self, input_plugin_preprocess, plugin_preprocess,
pdf_line_length):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
self.pdf_line_length = pdf_line_length
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@ -183,7 +185,7 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
length = line_length(html, .5)
length = line_length(html, self.pdf_line_length)
line_length_rules = []
if length:
line_length_rules = [

View File

@ -261,6 +261,11 @@ class HTMLInput(InputFormatPlugin):
'nasty side effects in the rest of the conversion pipeline.'
)
),
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
help=_('Average line length for line breaking if the HTML is from a '
'previous partial conversion of a PDF file.')),
])
def convert(self, stream, opts, file_ext, log,

View File

@ -39,10 +39,6 @@ class MOBIOutput(OutputFormatPlugin):
])
recommendations = set([
('dont_justify', True, OptionRecommendation.HIGH),
])
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \

View File

@ -218,8 +218,8 @@ class Serializer(object):
def serialize_body(self):
buffer = self.buffer
self.anchor_offset = buffer.tell()
buffer.write('<body>')
self.anchor_offset = buffer.tell()
# CybookG3 'Start Reading' link
if 'text' in self.oeb.guide:
href = self.oeb.guide['text'].href

View File

@ -16,7 +16,7 @@ from lxml import etree
from lxml.cssselect import CSSSelector
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
urldefrag, rewrite_links, urlunquote
urldefrag, rewrite_links, urlunquote, barename
from calibre.ebooks.epub import rules
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -46,9 +46,10 @@ class Split(object):
if self.page_breaks_xpath is not None:
self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
def __call__(self, oeb, context):
def __call__(self, oeb, opts):
self.oeb = oeb
self.log = oeb.log
self.opts = opts
self.map = {}
for item in list(self.oeb.manifest.items):
if item.spine_position is not None and etree.iselement(item.data):
@ -62,7 +63,7 @@ class Split(object):
page_breaks, page_break_ids = self.find_page_breaks(item)
splitter = FlowSplitter(item, page_breaks, page_break_ids,
self.max_flow_size, self.oeb)
self.max_flow_size, self.oeb, self.opts)
if splitter.was_split:
am = splitter.anchor_map
self.map[item.href] = collections.defaultdict(
@ -153,9 +154,11 @@ class Split(object):
class FlowSplitter(object):
'The actual splitting logic'
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
opts):
self.item = item
self.oeb = oeb
self.opts = opts
self.log = oeb.log
self.page_breaks = page_breaks
self.page_break_ids = page_break_ids
@ -221,6 +224,34 @@ class FlowSplitter(object):
return None
return body[0]
def adjust_split_point(self, root, path):
'''
Move the split point up its ancestor chain if it has no textual content
before it. This handles the common case:
<div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
h2.
'''
sp = root.xpath(path)[0]
while True:
parent = sp.getparent()
if barename(parent.tag) in ('body', 'html'):
break
if parent.text and parent.text.strip():
break
if parent.index(sp) > 0:
break
sp = parent
npath = sp.getroottree().getpath(sp)
if self.opts.verbose > 3 and npath != path:
self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
return npath
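
A self-contained sketch of the adjustment above (plain lxml, no XML namespaces, so the barename() check collapses to a direct tag comparison), run on the chapter1 case from the docstring:

from lxml import etree

def adjust_split_point(root, path):
    # Simplified copy of the loop above for namespace-free HTML
    sp = root.xpath(path)[0]
    while True:
        parent = sp.getparent()
        if parent is None or parent.tag in ('body', 'html'):
            break
        if parent.text and parent.text.strip():
            break
        if parent.index(sp) > 0:
            break
        sp = parent
    return sp.getroottree().getpath(sp)

root = etree.fromstring(
    '<html><body><div id="chapter1"><h2>Chapter 1</h2><p>Text</p></div></body></html>')
# The page break was detected on the h2, but splitting there would leave an empty
# div wrapper behind, so the split point is hoisted to the div itself.
print(adjust_split_point(root, '/html/body/div/h2'))   # -> /html/body/div
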
def do_split(self, tree, split_point, before):
'''
Split ``tree`` into a *before* and *after* tree at ``split_point``,
@ -236,9 +267,11 @@ class FlowSplitter(object):
root = tree.getroot()
root2 = tree2.getroot()
body, body2 = map(self.get_body, (root, root2))
path = self.adjust_split_point(root, path)
split_point = root.xpath(path)[0]
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
parent = elem.getparent()
index = parent.index(elem)
@ -254,9 +287,12 @@ class FlowSplitter(object):
if elem is split_point:
hit_split_point = True
if before:
x = elem.get('id', None)
nix_element(elem)
continue
if hit_split_point:
x = elem.get('id', None)
nix_element(elem)
@ -266,9 +302,11 @@ class FlowSplitter(object):
if elem is split_point2:
hit_split_point = True
if not before:
x = elem.get('id', None)
nix_element(elem, top=False)
continue
if not hit_split_point:
x = elem.get('id', None)
nix_element(elem, top=False)
return tree, tree2

View File

@ -36,8 +36,8 @@ class DetectStructure(object):
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
self.create_toc_from_chapters()
if self.oeb.toc.count() < opts.toc_threshold:
self.create_toc_from_links()
if self.oeb.toc.count() < opts.toc_threshold:
self.create_toc_from_links()
if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
self.oeb.toc = orig_toc
else:

View File

@ -20,6 +20,8 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
help=_('Average line length for line breaking')),
])
def convert(self, stream, options, file_ext, log,

View File

@ -48,7 +48,8 @@ def _config():
help=_('Defaults for conversion to LRF'))
c.add_opt('LRF_ebook_viewer_options', default=None,
help=_('Options for the LRF ebook viewer'))
c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT', 'MOBI', 'PRC', 'HTML', 'FB2'],
c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT',
'MOBI', 'PRC', 'HTML', 'FB2', 'PDB', 'RB'],
help=_('Formats that are viewed using the internal viewer'))
c.add_opt('column_map', default=ALL_COLUMNS,
help=_('Columns to be displayed in the book list'))

View File

@ -12,7 +12,7 @@ from calibre.gui2.dialogs.tag_editor import TagEditor
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
def __init__(self, window, rows, db):
QDialog.__init__(self, window)
Ui_MetadataBulkDialog.__init__(self)
@ -22,33 +22,33 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.write_series = False
self.write_rating = False
self.changed = False
QObject.connect(self.button_box, SIGNAL("accepted()"), self.sync)
QObject.connect(self.button_box, SIGNAL("accepted()"), self.sync)
QObject.connect(self.rating, SIGNAL('valueChanged(int)'), self.rating_changed)
all_series = self.db.all_series()
for i in all_series:
id, name = i
self.series.addItem(name)
for f in self.db.all_formats():
self.remove_format.addItem(f)
self.remove_format.setCurrentIndex(-1)
self.series.lineEdit().setText('')
QObject.connect(self.series, SIGNAL('currentIndexChanged(int)'), self.series_changed)
QObject.connect(self.series, SIGNAL('editTextChanged(QString)'), self.series_changed)
QObject.connect(self.tag_editor_button, SIGNAL('clicked()'), self.tag_editor)
self.exec_()
def tag_editor(self):
d = TagEditor(self, self.db, None)
d.exec_()
if d.result() == QDialog.Accepted:
tag_string = ', '.join(d.tags)
self.tags.setText(tag_string)
def sync(self):
for id in self.ids:
au = qstring_to_unicode(self.authors.text())
@ -80,14 +80,14 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.db.set_tags(id, tags, append=True, notify=False)
if self.write_series:
self.db.set_series(id, qstring_to_unicode(self.series.currentText()), notify=False)
if self.remove_format.currentIndex() > -1:
self.db.remove_format(id, unicode(self.remove_format.currentText()), index_is_id=True, notify=False)
self.changed = True
def series_changed(self):
self.write_series = True
def rating_changed(self):
self.write_rating = True
self.write_rating = True

Binary files not shown (three new image files added: 621 B, 295 B, 381 B).

View File

@ -224,6 +224,10 @@ class ResultCache(SearchQueryParser):
id = row if row_is_id else self._map_filtered[row]
self._data[id][col] = val
def get(self, row, col, row_is_id=False):
id = row if row_is_id else self._map_filtered[row]
return self._data[id][col]
def index(self, id, cache=False):
x = self._map if cache else self._map_filtered
return x.index(id)
@ -557,6 +561,35 @@ class LibraryDatabase2(LibraryDatabase):
)
def upgrade_version_6(self):
'Show authors in order'
self.conn.executescript('''
BEGIN TRANSACTION;
DROP VIEW meta;
CREATE VIEW meta AS
SELECT id, title,
(SELECT sortconcat(bal.id, name) FROM books_authors_link AS bal JOIN authors ON(author = authors.id) WHERE book = books.id) authors,
(SELECT name FROM publishers WHERE publishers.id IN (SELECT publisher from books_publishers_link WHERE book=books.id)) publisher,
(SELECT rating FROM ratings WHERE ratings.id IN (SELECT rating from books_ratings_link WHERE book=books.id)) rating,
timestamp,
(SELECT MAX(uncompressed_size) FROM data WHERE book=books.id) size,
(SELECT concat(name) FROM tags WHERE tags.id IN (SELECT tag from books_tags_link WHERE book=books.id)) tags,
(SELECT text FROM comments WHERE book=books.id) comments,
(SELECT name FROM series WHERE series.id IN (SELECT series FROM books_series_link WHERE book=books.id)) series,
series_index,
sort,
author_sort,
(SELECT concat(format) FROM data WHERE data.book=books.id) formats,
isbn,
path,
lccn,
pubdate,
flags
FROM books;
END TRANSACTION;
''')
def last_modified(self):
''' Return last modified time as a UTC datetime object'''
@ -1105,6 +1138,14 @@ class LibraryDatabase2(LibraryDatabase):
if notify:
self.notify('metadata', [id])
def get_tags(self, id):
result = self.conn.get(
'SELECT name FROM tags WHERE id IN (SELECT tag FROM books_tags_link WHERE book=?)',
(id,), all=True)
if not result:
return set([])
return set([r[0] for r in result])
def set_tags(self, id, tags, append=False, notify=True):
'''
@param tags: list of strings
@ -1113,7 +1154,8 @@ class LibraryDatabase2(LibraryDatabase):
if not append:
self.conn.execute('DELETE FROM books_tags_link WHERE book=?', (id,))
self.conn.execute('DELETE FROM tags WHERE (SELECT COUNT(id) FROM books_tags_link WHERE tag=tags.id) < 1')
for tag in set(tags):
otags = self.get_tags(id)
for tag in (set(tags)-otags):
tag = tag.strip()
if not tag:
continue
@ -1138,13 +1180,7 @@ class LibraryDatabase2(LibraryDatabase):
self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)',
(id, tid))
self.conn.commit()
try:
otags = [t.strip() for t in self.data[self.data.row(id)][FIELD_MAP['tags']].split(',')]
except AttributeError:
otags = []
if not append:
otags = []
tags = ','.join(otags+tags)
tags = ','.join(self.get_tags(id))
self.data.set(id, FIELD_MAP['tags'], tags, row_is_id=True)
if notify:
self.notify('metadata', [id])
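
A toy illustration (plain Python sets, no database) of the new append path: set_tags() now asks get_tags() for the tags already linked to the book, inserts only the difference, and rebuilds the cached tags column from get_tags() instead of string-splitting the old cache value.

existing = set(['Fiction', 'Fantasy'])   # what get_tags(id) would return
incoming = ['Fantasy', 'History']        # tags passed to set_tags(id, tags, append=True)

to_link = set(incoming) - existing       # mirrors: for tag in (set(tags) - otags)
print(to_link)                           # set(['History']) -- no duplicate link rows

cached = ','.join(existing | to_link)    # mirrors: tags = ','.join(self.get_tags(id))
print(cached)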

View File

@ -40,10 +40,10 @@ def convert_timestamp(val):
if tz is not None:
h, m = map(int, tz.split(':'))
delta = timedelta(minutes=mult*(60*h + m))
tz = type('CustomTZ', (tzinfo,), {'utcoffset':lambda self, dt:delta,
tz = type('CustomTZ', (tzinfo,), {'utcoffset':lambda self, dt:delta,
'dst':lambda self,dt:timedelta(0)})()
val = datetime(year, month, day, hours, minutes, seconds, microseconds,
val = datetime(year, month, day, hours, minutes, seconds, microseconds,
tzinfo=tz)
if tz is not None:
val = datetime(*(val.utctimetuple()[:6]))
@ -61,11 +61,11 @@ class Concatenate(object):
def __init__(self, sep=','):
self.sep = sep
self.ans = ''
def step(self, value):
if value is not None:
self.ans += value + self.sep
def finalize(self):
if not self.ans:
return None
@ -73,8 +73,23 @@ class Concatenate(object):
return self.ans[:-len(self.sep)]
return self.ans
class SortedConcatenate(object):
'''String concatenation aggregator for sqlite, sorted by supplied index'''
def __init__(self, sep=','):
self.sep = sep
self.ans = {}
def step(self, ndx, value):
if value is not None:
self.ans[ndx] = value
def finalize(self):
if len(self.ans) == 0:
return None
return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))
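
A standalone sketch of how the new aggregate is used (in-memory sqlite3, hypothetical table and column names): registered as sortconcat with two arguments, it concatenates values ordered by the supplied link id rather than alphabetically, which is what the rebuilt meta view's sortconcat(bal.id, name) relies on to show authors in order.

import sqlite3

class SortedConcatenate(object):
    '''String concatenation aggregator for sqlite, sorted by supplied index'''
    def __init__(self, sep=','):
        self.sep = sep
        self.ans = {}
    def step(self, ndx, value):
        if value is not None:
            self.ans[ndx] = value
    def finalize(self):
        if len(self.ans) == 0:
            return None
        return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))

conn = sqlite3.connect(':memory:')
conn.create_aggregate('sortconcat', 2, SortedConcatenate)
conn.execute('CREATE TABLE book_authors(link_id INTEGER, name TEXT)')
conn.execute("INSERT INTO book_authors VALUES (1, 'Terry Pratchett')")
conn.execute("INSERT INTO book_authors VALUES (2, 'Neil Gaiman')")
# Ordered by link_id (book order), not alphabetically:
# Terry Pratchett,Neil Gaiman
print(conn.execute('SELECT sortconcat(link_id, name) FROM book_authors').fetchone()[0])
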
class Connection(sqlite.Connection):
def get(self, *args, **kw):
ans = self.execute(*args)
if not kw.get('all', True):
@ -83,12 +98,12 @@ class Connection(sqlite.Connection):
ans = [None]
return ans[0]
return ans.fetchall()
class DBThread(Thread):
CLOSE = '-------close---------'
def __init__(self, path, row_factory):
Thread.__init__(self)
self.setDaemon(True)
@ -98,14 +113,15 @@ class DBThread(Thread):
self.requests = Queue(1)
self.results = Queue(1)
self.conn = None
def connect(self):
self.conn = sqlite.connect(self.path, factory=Connection,
self.conn = sqlite.connect(self.path, factory=Connection,
detect_types=sqlite.PARSE_DECLTYPES|sqlite.PARSE_COLNAMES)
self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
self.conn.create_aggregate('concat', 1, Concatenate)
self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
self.conn.create_function('title_sort', 1, title_sort)
def run(self):
try:
self.connect()
@ -124,7 +140,7 @@ class DBThread(Thread):
self.unhandled_error = (err, traceback.format_exc())
class DatabaseException(Exception):
def __init__(self, err, tb):
tb = '\n\t'.join(('\tRemote'+tb).splitlines())
msg = unicode(err) +'\n' + tb
@ -146,41 +162,41 @@ def proxy(fn):
raise DatabaseException(*res)
return res
return run
class ConnectionProxy(object):
def __init__(self, proxy):
self.proxy = proxy
def close(self):
if self.proxy.unhandled_error is None:
self.proxy.requests.put((self.proxy.CLOSE, [], {}))
@proxy
def get(self, query, all=True): pass
@proxy
@proxy
def commit(self): pass
@proxy
def execute(self): pass
@proxy
def executemany(self): pass
@proxy
def executescript(self): pass
@proxy
def create_aggregate(self): pass
@proxy
def create_function(self): pass
@proxy
def cursor(self): pass
def connect(dbpath, row_factory=None):
conn = ConnectionProxy(DBThread(dbpath, row_factory))
conn.proxy.start()
@ -188,4 +204,4 @@ def connect(dbpath, row_factory=None):
time.sleep(0.01)
if conn.proxy.unhandled_error[0] is not None:
raise DatabaseException(*conn.proxy.unhandled_error)
return conn
return conn

View File

@ -140,6 +140,11 @@ sudo calibre_postinstall
</form>
</div>
<hr/>
<h3>Note</h3>
<p>
If your kernel is compiled with CONFIG_SYSFS_DEPRECATED, device detection may not work.
</p>
<hr/>
<h3>Dependencies</h3>
${app} has the following dependencies (the listed version is the minimum version)
<br/><br/>

View File

@ -50,6 +50,7 @@ recipe_modules = ['recipe_' + r for r in (
'marca', 'kellog_faculty', 'kellog_insight',
'theeconomictimes_india', '7dias', 'buenosaireseconomico',
'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
'gva_be', 'hln', 'tijd', 'degentenaar',
)]
import re, imp, inspect, time, os

View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DeGentenaarOnline(BasicNewsRecipe):
title = 'De Gentenaar Online'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'De Gentenaar'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = _('Dutch')
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
remove_tags = [dict(name=['embed','object'])]
feeds = [
(u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws' )
,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland' )
,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Economie' , u'http://feeds.nieuwsblad.be/economie/home' )
,(u'Algemeen' , u'http://feeds.nieuwsblad.be/life/algemeen' )
,(u'Film' , u'http://feeds.nieuwsblad.be/life/film' )
,(u'Boek' , u'http://feeds.nieuwsblad.be/life/boeken' )
,(u'Muziek' , u'http://feeds.nieuwsblad.be/life/muziek' )
,(u'Podium' , u'http://feeds.nieuwsblad.be/life/podium' )
,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv' )
]
def print_version(self, url):
return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')
def get_article_url(self, article):
return article.get('guid', None)
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('span'):
item.name='div'
if item.has_key('id') and item['id'] == 'lblArticleTitle':
item.name='h3'
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup

View File

@ -0,0 +1,63 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.gva.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class GazetvanAntwerpen(BasicNewsRecipe):
title = 'Gazet van Antwerpen'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'Gazet van Antwerpen'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = _('Dutch')
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
remove_tags = [
dict(name=['embed','object'])
, dict (name='div',attrs={'class':['note NotePortrait','note']})
]
remove_tags_after = dict(name='span', attrs={'class':'author'})
feeds = [
(u'Overzicht & Blikvanger', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/overview/overzicht' )
,(u'Stad & Regio' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio' )
,(u'Economie' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie' )
,(u'Binnenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland' )
,(u'Buitenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland' )
,(u'Media & Cultur' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
,(u'Wetenschap' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
,(u'Sport' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport' )
]
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.hln.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class HLN_be(BasicNewsRecipe):
title = 'Het Belang Van Limburg'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'Het Belang Van Limburg'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = _('Dutch')
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'art_box2'})]
remove_tags = [
dict(name=['embed','object'])
]
feeds = [(u'Alle nieuws', u'http://www.hln.be/rss.xml')]
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tijd.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DeTijd(BasicNewsRecipe):
title = 'De Tijd'
__author__ = 'Darko Miletic'
description = 'News from Belgium in Dutch'
publisher = 'De Tijd'
category = 'news, politics, Belgium'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = _('Dutch')
lang = 'nl-BE'
direction = 'ltr'
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'id':'lcol'})]
remove_tags = [
dict(name=['embed','object'])
, dict (name='div',attrs={'id':'art_reactwrap'})
]
remove_tags_after = dict(name='div', attrs={'id':'art_author'})
feeds = [
(u'Volledig nieuwsaanbod', u'http://www.tijd.be/rss/nieuws.xml' )
,(u'Markten' , u'http://www.tijd.be/rss/markten.xml' )
,(u'Ondernemingen' , u'http://www.tijd.be/rss/ondernemingen.xml' )
,(u'Chemie-Farma' , u'http://www.tijd.be/rss/chemie_farma.xml' )
,(u'Consumptie' , u'http://www.tijd.be/rss/consumptie.xml' )
,(u'Diensten' , u'http://www.tijd.be/rss/diensten.xml' )
,(u'Energie' , u'http://www.tijd.be/rss/energie.xml' )
,(u'Financen' , u'http://www.tijd.be/rss/financien.xml' )
,(u'Industrie' , u'http://www.tijd.be/rss/industrie.xml' )
,(u'Media' , u'http://www.tijd.be/rss/media_telecom.xml' )
,(u'Technologie' , u'http://www.tijd.be/rss/technologie.xml' )
,(u'Economie & Financien' , u'http://www.tijd.be/rss/economie.xml' )
,(u'Binnenland' , u'http://www.tijd.be/rss/binnenland.xml' )
,(u'Buitenland' , u'http://www.tijd.be/rss/buitenland.xml' )
,(u'De wijde wereld' , u'http://www.tijd.be/rss/cultuur.xml' )
]
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return soup