Sync to trunk.

John Schember 2009-06-23 16:54:39 -04:00
commit 970af577e5
22 changed files with 427 additions and 65 deletions

View File

@@ -242,9 +242,9 @@ class KindleDXOutput(OutputProfile):
     description = _('This profile is intended for the Amazon Kindle DX.')
     # Screen size is a best guess
-    screen_size = (824, 1200)
+    screen_size = (744, 1022)
     dpi = 150.0
-    comic_screen_size = (741, 1080)
+    comic_screen_size = (741, 1022)

     @classmethod
     def tags_to_string(cls, tags):

View File

@@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     '''
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html)
+            opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
     oeb = OEBBook(log, html_preprocessor,
             pretty_print=opts.pretty_print, input_encoding=encoding)
     if not populate:

View File

@@ -160,9 +160,11 @@ class HTMLPreProcessor(object):
                   (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                    lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                   ]

-    def __init__(self, input_plugin_preprocess, plugin_preprocess):
+    def __init__(self, input_plugin_preprocess, plugin_preprocess,
+            pdf_line_length):
         self.input_plugin_preprocess = input_plugin_preprocess
         self.plugin_preprocess = plugin_preprocess
+        self.pdf_line_length = pdf_line_length

     def is_baen(self, src):
         return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',

@@ -183,7 +185,7 @@ class HTMLPreProcessor(object):
         elif self.is_book_designer(html):
             rules = self.BOOK_DESIGNER
         elif self.is_pdftohtml(html):
-            length = line_length(html, .5)
+            length = line_length(html, self.pdf_line_length)
             line_length_rules = []
             if length:
                 line_length_rules = [
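Note: the new pdf_line_length value replaces the hard-coded .5 previously passed to line_length() when the input looks like pdftohtml output. line_length() itself is not part of this diff; purely as an illustration of what a fractional value like 0.5 means here, the sketch below picks the length at that fraction of the observed line lengths (names and details are assumptions, not calibre's actual implementation):

# Illustrative sketch only, not calibre's line_length().
import re

def approx_line_length(html, fraction):
    # Treat pdftohtml output as <br/>-separated lines, stripped of markup.
    lines = [re.sub(r'<[^>]+>', '', chunk).strip()
             for chunk in re.split(r'(?i)<br[^>]*>', html)]
    lengths = sorted(len(line) for line in lines if line)
    if not lengths:
        return 0
    return lengths[min(int(len(lengths) * fraction), len(lengths) - 1)]

# With pdf_line_length=0.5 the threshold is roughly the median line length,
# which the preprocessor can use when deciding which hard breaks to unwrap.

Exposing the value as an option on both HTMLInput and PDFInput (below) lets a user tune the unwrapping for a particular PDF instead of relying on the fixed 0.5 default.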

View File

@@ -261,6 +261,11 @@ class HTMLInput(InputFormatPlugin):
                 'nasty side effects in the rest of of the conversion pipeline.'
                 )
             ),
+        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+            help=_('Average line length for line breaking if the HTML is from a '
+                'previous partial conversion of a PDF file.')),
     ])

     def convert(self, stream, opts, file_ext, log,

View File

@ -39,10 +39,6 @@ class MOBIOutput(OutputFormatPlugin):
]) ])
recommendations = set([
('dont_justify', True, OptionRecommendation.HIGH),
])
def convert(self, oeb, output_path, input_plugin, opts, log): def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \ from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \

View File

@@ -218,8 +218,8 @@ class Serializer(object):
     def serialize_body(self):
         buffer = self.buffer
-        self.anchor_offset = buffer.tell()
         buffer.write('<body>')
+        self.anchor_offset = buffer.tell()
         # CybookG3 'Start Reading' link
         if 'text' in self.oeb.guide:
             href = self.oeb.guide['text'].href

View File

@@ -16,7 +16,7 @@ from lxml import etree
 from lxml.cssselect import CSSSelector
 from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
-        urldefrag, rewrite_links, urlunquote
+        urldefrag, rewrite_links, urlunquote, barename
 from calibre.ebooks.epub import rules

 XPath = functools.partial(_XPath, namespaces=NAMESPACES)

@@ -46,9 +46,10 @@ class Split(object):
         if self.page_breaks_xpath is not None:
             self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]

-    def __call__(self, oeb, context):
+    def __call__(self, oeb, opts):
         self.oeb = oeb
         self.log = oeb.log
+        self.opts = opts
         self.map = {}
         for item in list(self.oeb.manifest.items):
             if item.spine_position is not None and etree.iselement(item.data):

@@ -62,7 +63,7 @@ class Split(object):
             page_breaks, page_break_ids = self.find_page_breaks(item)
             splitter = FlowSplitter(item, page_breaks, page_break_ids,
-                    self.max_flow_size, self.oeb)
+                    self.max_flow_size, self.oeb, self.opts)
             if splitter.was_split:
                 am = splitter.anchor_map
                 self.map[item.href] = collections.defaultdict(

@@ -153,9 +154,11 @@ class Split(object):
 class FlowSplitter(object):
     'The actual splitting logic'

-    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
+    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
+            opts):
         self.item = item
         self.oeb = oeb
+        self.opts = opts
         self.log = oeb.log
         self.page_breaks = page_breaks
         self.page_break_ids = page_break_ids

@@ -221,6 +224,34 @@ class FlowSplitter(object):
             return None
         return body[0]

+    def adjust_split_point(self, root, path):
+        '''
+        Move the split point up its ancestor chain if it has no textual content
+        before it. This handles the common case:
+        <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
+        h2.
+        '''
+        sp = root.xpath(path)[0]
+        while True:
+            parent = sp.getparent()
+            if barename(parent.tag) in ('body', 'html'):
+                break
+            if parent.text and parent.text.strip():
+                break
+            if parent.index(sp) > 0:
+                break
+            sp = parent
+
+        npath = sp.getroottree().getpath(sp)
+
+        if self.opts.verbose > 3 and npath != path:
+            self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
+
+        return npath
+
     def do_split(self, tree, split_point, before):
         '''
         Split ``tree`` into a *before* and *after* tree at ``split_point``,

@@ -236,9 +267,11 @@ class FlowSplitter(object):
         root = tree.getroot()
         root2 = tree2.getroot()
         body, body2 = map(self.get_body, (root, root2))
+        path = self.adjust_split_point(root, path)
         split_point = root.xpath(path)[0]
         split_point2 = root2.xpath(path)[0]

         def nix_element(elem, top=True):
             parent = elem.getparent()
             index = parent.index(elem)

@@ -254,9 +287,12 @@ class FlowSplitter(object):
             if elem is split_point:
                 hit_split_point = True
                 if before:
+                    x = elem.get('id', None)
                     nix_element(elem)
                 continue
             if hit_split_point:
+                x = elem.get('id', None)
                 nix_element(elem)

@@ -266,9 +302,11 @@ class FlowSplitter(object):
             if elem is split_point2:
                 hit_split_point = True
                 if not before:
+                    x = elem.get('id', None)
                     nix_element(elem, top=False)
                 continue
             if not hit_split_point:
+                x = elem.get('id', None)
                 nix_element(elem, top=False)

         return tree, tree2
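Note: the new adjust_split_point() moves a split requested on an element that has nothing before it inside its parent up to that parent, so wrappers like <div id="chapter1"> stay with their headings; do_split() then applies the adjusted path to both copies of the tree. A minimal standalone illustration of the same ancestor walk (barename() is approximated locally here rather than imported from calibre):

# Standalone sketch of the adjust_split_point() walk shown above.
from lxml import etree

def barename(tag):
    # Stand-in for calibre.ebooks.oeb.base.barename: drop any namespace prefix.
    return tag.rpartition('}')[2]

root = etree.fromstring(
    '<html><body><div id="chapter1"><h2>Chapter 1</h2><p>Text...</p></div></body></html>')
sp = root.xpath('//h2')[0]
while True:
    parent = sp.getparent()
    if barename(parent.tag) in ('body', 'html'):
        break
    if parent.text and parent.text.strip():
        break
    if parent.index(sp) > 0:
        break
    sp = parent

print(sp.getroottree().getpath(sp))  # /html/body/div, not /html/body/div/h2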

View File

@@ -20,6 +20,8 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
+        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+            help=_('Average line length for line breaking')),
     ])

     def convert(self, stream, options, file_ext, log,

View File

@@ -48,7 +48,8 @@ def _config():
               help=_('Defaults for conversion to LRF'))
     c.add_opt('LRF_ebook_viewer_options', default=None,
               help=_('Options for the LRF ebook viewer'))
-    c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT', 'MOBI', 'PRC', 'HTML', 'FB2'],
+    c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT',
+              'MOBI', 'PRC', 'HTML', 'FB2', 'PDB', 'RB'],
               help=_('Formats that are viewed using the internal viewer'))
     c.add_opt('column_map', default=ALL_COLUMNS,
               help=_('Columns to be displayed in the book list'))

Binary file not shown (added, 621 B)

Binary file not shown (added, 295 B)

Binary file not shown (added, 381 B)

View File

@@ -224,6 +224,10 @@ class ResultCache(SearchQueryParser):
         id = row if row_is_id else self._map_filtered[row]
         self._data[id][col] = val

+    def get(self, row, col, row_is_id=False):
+        id = row if row_is_id else self._map_filtered[row]
+        return self._data[id][col]
+
     def index(self, id, cache=False):
         x = self._map if cache else self._map_filtered
         return x.index(id)

@@ -557,6 +561,35 @@ class LibraryDatabase2(LibraryDatabase):
         )

+    def upgrade_version_6(self):
+        'Show authors in order'
+        self.conn.executescript('''
+        BEGIN TRANSACTION;
+        DROP VIEW meta;
+        CREATE VIEW meta AS
+        SELECT id, title,
+               (SELECT sortconcat(bal.id, name) FROM books_authors_link AS bal JOIN authors ON(author = authors.id) WHERE book = books.id) authors,
+               (SELECT name FROM publishers WHERE publishers.id IN (SELECT publisher from books_publishers_link WHERE book=books.id)) publisher,
+               (SELECT rating FROM ratings WHERE ratings.id IN (SELECT rating from books_ratings_link WHERE book=books.id)) rating,
+               timestamp,
+               (SELECT MAX(uncompressed_size) FROM data WHERE book=books.id) size,
+               (SELECT concat(name) FROM tags WHERE tags.id IN (SELECT tag from books_tags_link WHERE book=books.id)) tags,
+               (SELECT text FROM comments WHERE book=books.id) comments,
+               (SELECT name FROM series WHERE series.id IN (SELECT series FROM books_series_link WHERE book=books.id)) series,
+               series_index,
+               sort,
+               author_sort,
+               (SELECT concat(format) FROM data WHERE data.book=books.id) formats,
+               isbn,
+               path,
+               lccn,
+               pubdate,
+               flags
+        FROM books;
+        END TRANSACTION;
+        ''')
+
     def last_modified(self):
         ''' Return last modified time as a UTC datetime object'''

@@ -1105,6 +1138,14 @@ class LibraryDatabase2(LibraryDatabase):
         if notify:
             self.notify('metadata', [id])

+    def get_tags(self, id):
+        result = self.conn.get(
+            'SELECT name FROM tags WHERE id IN (SELECT tag FROM books_tags_link WHERE book=?)',
+            (id,), all=True)
+        if not result:
+            return set([])
+        return set([r[0] for r in result])
+
     def set_tags(self, id, tags, append=False, notify=True):
         '''
         @param tags: list of strings

@@ -1113,7 +1154,8 @@ class LibraryDatabase2(LibraryDatabase):
         if not append:
             self.conn.execute('DELETE FROM books_tags_link WHERE book=?', (id,))
             self.conn.execute('DELETE FROM tags WHERE (SELECT COUNT(id) FROM books_tags_link WHERE tag=tags.id) < 1')
-        for tag in set(tags):
+        otags = self.get_tags(id)
+        for tag in (set(tags)-otags):
             tag = tag.strip()
             if not tag:
                 continue

@@ -1138,13 +1180,7 @@ class LibraryDatabase2(LibraryDatabase):
             self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)',
                               (id, tid))
         self.conn.commit()
-        try:
-            otags = [t.strip() for t in self.data[self.data.row(id)][FIELD_MAP['tags']].split(',')]
-        except AttributeError:
-            otags = []
-        if not append:
-            otags = []
-        tags = ','.join(otags+tags)
+        tags = ','.join(self.get_tags(id))
         self.data.set(id, FIELD_MAP['tags'], tags, row_is_id=True)
         if notify:
             self.notify('metadata', [id])
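Note: the rewritten tail of set_tags() now rebuilds the cached tags string from the database via the new get_tags(), and the loop above it only processes tags the book does not already have. A tiny sketch of that set arithmetic, with made-up values:

# Hypothetical values, purely to show which tags reach the INSERT loop.
otags = set(['Fiction', 'History'])   # what get_tags(id) returns for the book
tags = ['Fiction', 'Biography']       # what the caller passed in
print(set(tags) - otags)              # only 'Biography' is new, so only it is linked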

View File

@@ -73,6 +73,21 @@ class Concatenate(object):
             return self.ans[:-len(self.sep)]
         return self.ans

+class SortedConcatenate(object):
+    '''String concatenation aggregator for sqlite, sorted by supplied index'''
+    def __init__(self, sep=','):
+        self.sep = sep
+        self.ans = {}
+
+    def step(self, ndx, value):
+        if value is not None:
+            self.ans[ndx] = value
+
+    def finalize(self):
+        if len(self.ans) == 0:
+            return None
+        return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))
+
 class Connection(sqlite.Connection):

     def get(self, *args, **kw):

@@ -104,6 +119,7 @@ class DBThread(Thread):
                 detect_types=sqlite.PARSE_DECLTYPES|sqlite.PARSE_COLNAMES)
         self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
         self.conn.create_aggregate('concat', 1, Concatenate)
+        self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
         self.conn.create_function('title_sort', 1, title_sort)

     def run(self):
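Note: sortconcat is the aggregate that the upgrade_version_6 meta view above relies on to list authors in link-table order. A self-contained demo using the stdlib sqlite3 module (the table and column names are invented for the example; calibre registers the aggregate on its own DBThread connection as shown in the diff):

import sqlite3

class SortedConcatenate(object):
    # Repeated from the diff above so this snippet runs on its own.
    def __init__(self, sep=','):
        self.sep = sep
        self.ans = {}

    def step(self, ndx, value):
        if value is not None:
            self.ans[ndx] = value

    def finalize(self):
        if len(self.ans) == 0:
            return None
        return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))

conn = sqlite3.connect(':memory:')
conn.create_aggregate('sortconcat', 2, SortedConcatenate)
conn.execute('CREATE TABLE link(id INTEGER, name TEXT)')
conn.execute("INSERT INTO link VALUES (2, 'Second Author')")
conn.execute("INSERT INTO link VALUES (1, 'First Author')")
print(conn.execute('SELECT sortconcat(id, name) FROM link').fetchone()[0])
# -> First Author,Second Author (ordered by the supplied index, not row order)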

View File

@@ -140,6 +140,11 @@ sudo calibre_postinstall
     </form>
     </div>
     <hr/>
+    <h3>Note</h3>
+    <p>
+    If your kernel is compiled with CONFIG_SYSFS_DEPRECATED device detection may not work.
+    </p>
+    <hr/>
     <h3>Dependencies</h3>
     ${app} has the following dependencies (the listed version is the minimum version)
     <br/><br/>

View File

@@ -50,6 +50,7 @@ recipe_modules = ['recipe_' + r for r in (
                   'marca', 'kellog_faculty', 'kellog_insight',
                   'theeconomictimes_india', '7dias', 'buenosaireseconomico',
                   'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
+                  'gva_be', 'hln', 'tijd', 'degentenaar',
                   )]

 import re, imp, inspect, time, os

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class DeGentenaarOnline(BasicNewsRecipe):
    title                 = 'De Gentenaar Online'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'De Gentenaar'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
    remove_tags = [dict(name=['embed','object'])]

    feeds = [
              (u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws'    )
             ,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland'    )
             ,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland')
             ,(u'Economie'   , u'http://feeds.nieuwsblad.be/economie/home'        )
             ,(u'Economie'   , u'http://feeds.nieuwsblad.be/economie/home'        )
             ,(u'Algemeen'   , u'http://feeds.nieuwsblad.be/life/algemeen'        )
             ,(u'Film'       , u'http://feeds.nieuwsblad.be/life/film'            )
             ,(u'Boek'       , u'http://feeds.nieuwsblad.be/life/boeken'          )
             ,(u'Muziek'     , u'http://feeds.nieuwsblad.be/life/muziek'          )
             ,(u'Podium'     , u'http://feeds.nieuwsblad.be/life/podium'          )
             ,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv'              )
            ]

    def print_version(self, url):
        return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')

    def get_article_url(self, article):
        return article.get('guid', None)

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('span'):
            item.name='div'
            if item.has_key('id') and item['id'] == 'lblArticleTitle':
                item.name='h3'
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.gva.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class GazetvanAntwerpen(BasicNewsRecipe):
    title                 = 'Gazet van Antwerpen'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'Gazet van Antwerpen'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
    remove_tags = [
                     dict(name=['embed','object'])
                   , dict (name='div',attrs={'class':['note NotePortrait','note']})
                  ]
    remove_tags_after = dict(name='span', attrs={'class':'author'})

    feeds = [
              (u'Overzicht & Blikvanger', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/overview/overzicht'       )
             ,(u'Stad & Regio'          , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio'   )
             ,(u'Economie'              , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie'      )
             ,(u'Binnenland'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland'    )
             ,(u'Buitenland'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland'    )
             ,(u'Media & Cultur'        , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
             ,(u'Wetenschap'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
             ,(u'Sport'                 , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport'         )
            ]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.hln.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class HLN_be(BasicNewsRecipe):
    title                 = 'Het Belang Van Limburg'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'Het Belang Van Limburg'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'class':'art_box2'})]
    remove_tags = [
                     dict(name=['embed','object'])
                  ]

    feeds = [(u'Alle nieuws', u'http://www.hln.be/rss.xml')]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tijd.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class DeTijd(BasicNewsRecipe):
    title                 = 'De Tijd'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'De Tijd'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'id':'lcol'})]
    remove_tags = [
                     dict(name=['embed','object'])
                   , dict (name='div',attrs={'id':'art_reactwrap'})
                  ]
    remove_tags_after = dict(name='div', attrs={'id':'art_author'})

    feeds = [
              (u'Volledig nieuwsaanbod', u'http://www.tijd.be/rss/nieuws.xml'       )
             ,(u'Markten'              , u'http://www.tijd.be/rss/markten.xml'      )
             ,(u'Ondernemingen'        , u'http://www.tijd.be/rss/ondernemingen.xml')
             ,(u'Chemie-Farma'         , u'http://www.tijd.be/rss/chemie_farma.xml' )
             ,(u'Consumptie'           , u'http://www.tijd.be/rss/consumptie.xml'   )
             ,(u'Diensten'             , u'http://www.tijd.be/rss/diensten.xml'     )
             ,(u'Energie'              , u'http://www.tijd.be/rss/energie.xml'      )
             ,(u'Financen'             , u'http://www.tijd.be/rss/financien.xml'    )
             ,(u'Industrie'            , u'http://www.tijd.be/rss/industrie.xml'    )
             ,(u'Media'                , u'http://www.tijd.be/rss/media_telecom.xml')
             ,(u'Technologie'          , u'http://www.tijd.be/rss/technologie.xml'  )
             ,(u'Economie & Financien' , u'http://www.tijd.be/rss/economie.xml'     )
             ,(u'Binnenland'           , u'http://www.tijd.be/rss/binnenland.xml'   )
             ,(u'Buitenland'           , u'http://www.tijd.be/rss/buitenland.xml'   )
             ,(u'De wijde wereld'      , u'http://www.tijd.be/rss/cultuur.xml'      )
            ]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup