Sync to trunk.

John Schember 2009-06-23 16:54:39 -04:00
commit 970af577e5
22 changed files with 427 additions and 65 deletions

View File

@@ -242,9 +242,9 @@ class KindleDXOutput(OutputProfile):
     description = _('This profile is intended for the Amazon Kindle DX.')
     # Screen size is a best guess
-    screen_size = (824, 1200)
+    screen_size = (744, 1022)
     dpi = 150.0
-    comic_screen_size = (741, 1080)
+    comic_screen_size = (741, 1022)

     @classmethod
     def tags_to_string(cls, tags):

View File

@@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     '''
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html)
+            opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
     oeb = OEBBook(log, html_preprocessor,
             pretty_print=opts.pretty_print, input_encoding=encoding)
     if not populate:

View File

@@ -160,9 +160,11 @@ class HTMLPreProcessor(object):
                   (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                    lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                   ]

-    def __init__(self, input_plugin_preprocess, plugin_preprocess):
+    def __init__(self, input_plugin_preprocess, plugin_preprocess,
+            pdf_line_length):
         self.input_plugin_preprocess = input_plugin_preprocess
         self.plugin_preprocess = plugin_preprocess
+        self.pdf_line_length = pdf_line_length

     def is_baen(self, src):
         return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',

@@ -183,7 +185,7 @@ class HTMLPreProcessor(object):
         elif self.is_book_designer(html):
             rules = self.BOOK_DESIGNER
         elif self.is_pdftohtml(html):
-            length = line_length(html, .5)
+            length = line_length(html, self.pdf_line_length)
             line_length_rules = []
             if length:
                 line_length_rules = [
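Note: the new pdf_line_length value replaces the hard-coded .5 previously passed to line_length() when the input looks like pdftohtml output. line_length() itself is not part of this diff; purely as an illustration of what a fractional value like 0.5 means here, the sketch below picks the length at that fraction of the observed line lengths (names and details are assumptions, not calibre's actual implementation):

# Illustrative sketch only, not calibre's line_length().
import re

def approx_line_length(html, fraction):
    # Treat pdftohtml output as <br/>-separated lines, stripped of markup.
    lines = [re.sub(r'<[^>]+>', '', chunk).strip()
             for chunk in re.split(r'(?i)<br[^>]*>', html)]
    lengths = sorted(len(line) for line in lines if line)
    if not lengths:
        return 0
    return lengths[min(int(len(lengths) * fraction), len(lengths) - 1)]

# With pdf_line_length=0.5 the threshold is roughly the median line length,
# which the preprocessor can use when deciding which hard breaks to unwrap.

Exposing the value as an option on both HTMLInput and PDFInput (below) lets a user tune the unwrapping for a particular PDF instead of relying on the fixed 0.5 default.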

View File

@@ -261,6 +261,11 @@ class HTMLInput(InputFormatPlugin):
                 'nasty side effects in the rest of of the conversion pipeline.'
                 )
             ),
+        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+            help=_('Average line length for line breaking if the HTML is from a '
+                'previous partial conversion of a PDF file.')),
     ])

     def convert(self, stream, opts, file_ext, log,

View File

@ -39,10 +39,6 @@ class MOBIOutput(OutputFormatPlugin):
]) ])
recommendations = set([
('dont_justify', True, OptionRecommendation.HIGH),
])
def convert(self, oeb, output_path, input_plugin, opts, log): def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \ from calibre.ebooks.mobi.writer import PALM_MAX_IMAGE_SIZE, \

View File

@@ -218,8 +218,8 @@ class Serializer(object):
     def serialize_body(self):
         buffer = self.buffer
-        self.anchor_offset = buffer.tell()
         buffer.write('<body>')
+        self.anchor_offset = buffer.tell()
         # CybookG3 'Start Reading' link
         if 'text' in self.oeb.guide:
             href = self.oeb.guide['text'].href

View File

@@ -16,7 +16,7 @@ from lxml import etree
 from lxml.cssselect import CSSSelector
 from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
-        urldefrag, rewrite_links, urlunquote
+        urldefrag, rewrite_links, urlunquote, barename
 from calibre.ebooks.epub import rules

 XPath = functools.partial(_XPath, namespaces=NAMESPACES)

@@ -46,9 +46,10 @@ class Split(object):
         if self.page_breaks_xpath is not None:
             self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]

-    def __call__(self, oeb, context):
+    def __call__(self, oeb, opts):
         self.oeb = oeb
         self.log = oeb.log
+        self.opts = opts
         self.map = {}
         for item in list(self.oeb.manifest.items):
             if item.spine_position is not None and etree.iselement(item.data):

@@ -62,7 +63,7 @@ class Split(object):
             page_breaks, page_break_ids = self.find_page_breaks(item)
             splitter = FlowSplitter(item, page_breaks, page_break_ids,
-                    self.max_flow_size, self.oeb)
+                    self.max_flow_size, self.oeb, self.opts)
             if splitter.was_split:
                 am = splitter.anchor_map
                 self.map[item.href] = collections.defaultdict(

@@ -153,9 +154,11 @@ class Split(object):
 class FlowSplitter(object):
     'The actual splitting logic'

-    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb):
+    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
+            opts):
         self.item = item
         self.oeb = oeb
+        self.opts = opts
         self.log = oeb.log
         self.page_breaks = page_breaks
         self.page_break_ids = page_break_ids

@@ -221,6 +224,34 @@ class FlowSplitter(object):
             return None
         return body[0]

+    def adjust_split_point(self, root, path):
+        '''
+        Move the split point up its ancestor chain if it has no textual content
+        before it. This handles the common case:
+        <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
+        h2.
+        '''
+        sp = root.xpath(path)[0]
+        while True:
+            parent = sp.getparent()
+            if barename(parent.tag) in ('body', 'html'):
+                break
+            if parent.text and parent.text.strip():
+                break
+            if parent.index(sp) > 0:
+                break
+            sp = parent
+
+        npath = sp.getroottree().getpath(sp)
+
+        if self.opts.verbose > 3 and npath != path:
+            self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
+
+        return npath
+
     def do_split(self, tree, split_point, before):
         '''
         Split ``tree`` into a *before* and *after* tree at ``split_point``,

@@ -236,9 +267,11 @@ class FlowSplitter(object):
         root = tree.getroot()
         root2 = tree2.getroot()
         body, body2 = map(self.get_body, (root, root2))
+        path = self.adjust_split_point(root, path)
         split_point = root.xpath(path)[0]
         split_point2 = root2.xpath(path)[0]

         def nix_element(elem, top=True):
             parent = elem.getparent()
             index = parent.index(elem)

@@ -254,9 +287,12 @@ class FlowSplitter(object):
             if elem is split_point:
                 hit_split_point = True
                 if before:
+                    x = elem.get('id', None)
                     nix_element(elem)
                 continue
             if hit_split_point:
+                x = elem.get('id', None)
                 nix_element(elem)

@@ -266,9 +302,11 @@ class FlowSplitter(object):
             if elem is split_point2:
                 hit_split_point = True
                 if not before:
+                    x = elem.get('id', None)
                     nix_element(elem, top=False)
                 continue
             if not hit_split_point:
+                x = elem.get('id', None)
                 nix_element(elem, top=False)

         return tree, tree2
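Note: the new adjust_split_point() moves a split requested on an element that has nothing before it inside its parent up to that parent, so wrappers like <div id="chapter1"> stay with their headings; do_split() then applies the adjusted path to both copies of the tree. A minimal standalone illustration of the same ancestor walk (barename() is approximated locally here rather than imported from calibre):

# Standalone sketch of the adjust_split_point() walk shown above.
from lxml import etree

def barename(tag):
    # Stand-in for calibre.ebooks.oeb.base.barename: drop any namespace prefix.
    return tag.rpartition('}')[2]

root = etree.fromstring(
    '<html><body><div id="chapter1"><h2>Chapter 1</h2><p>Text...</p></div></body></html>')
sp = root.xpath('//h2')[0]
while True:
    parent = sp.getparent()
    if barename(parent.tag) in ('body', 'html'):
        break
    if parent.text and parent.text.strip():
        break
    if parent.index(sp) > 0:
        break
    sp = parent

print(sp.getroottree().getpath(sp))  # /html/body/div, not /html/body/div/h2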

View File

@@ -20,6 +20,8 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
+        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+            help=_('Average line length for line breaking')),
     ])

     def convert(self, stream, options, file_ext, log,

View File

@@ -48,7 +48,8 @@ def _config():
               help=_('Defaults for conversion to LRF'))
     c.add_opt('LRF_ebook_viewer_options', default=None,
               help=_('Options for the LRF ebook viewer'))
-    c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT', 'MOBI', 'PRC', 'HTML', 'FB2'],
+    c.add_opt('internally_viewed_formats', default=['LRF', 'EPUB', 'LIT',
+              'MOBI', 'PRC', 'HTML', 'FB2', 'PDB', 'RB'],
               help=_('Formats that are viewed using the internal viewer'))
     c.add_opt('column_map', default=ALL_COLUMNS,
               help=_('Columns to be displayed in the book list'))

Binary file not shown (added, 621 B)

Binary file not shown (added, 295 B)

Binary file not shown (added, 381 B)

View File

@@ -224,6 +224,10 @@ class ResultCache(SearchQueryParser):
         id = row if row_is_id else self._map_filtered[row]
         self._data[id][col] = val

+    def get(self, row, col, row_is_id=False):
+        id = row if row_is_id else self._map_filtered[row]
+        return self._data[id][col]
+
     def index(self, id, cache=False):
         x = self._map if cache else self._map_filtered
         return x.index(id)

@@ -557,6 +561,35 @@ class LibraryDatabase2(LibraryDatabase):
         )

+    def upgrade_version_6(self):
+        'Show authors in order'
+        self.conn.executescript('''
+        BEGIN TRANSACTION;
+        DROP VIEW meta;
+        CREATE VIEW meta AS
+        SELECT id, title,
+               (SELECT sortconcat(bal.id, name) FROM books_authors_link AS bal JOIN authors ON(author = authors.id) WHERE book = books.id) authors,
+               (SELECT name FROM publishers WHERE publishers.id IN (SELECT publisher from books_publishers_link WHERE book=books.id)) publisher,
+               (SELECT rating FROM ratings WHERE ratings.id IN (SELECT rating from books_ratings_link WHERE book=books.id)) rating,
+               timestamp,
+               (SELECT MAX(uncompressed_size) FROM data WHERE book=books.id) size,
+               (SELECT concat(name) FROM tags WHERE tags.id IN (SELECT tag from books_tags_link WHERE book=books.id)) tags,
+               (SELECT text FROM comments WHERE book=books.id) comments,
+               (SELECT name FROM series WHERE series.id IN (SELECT series FROM books_series_link WHERE book=books.id)) series,
+               series_index,
+               sort,
+               author_sort,
+               (SELECT concat(format) FROM data WHERE data.book=books.id) formats,
+               isbn,
+               path,
+               lccn,
+               pubdate,
+               flags
+        FROM books;
+        END TRANSACTION;
+        ''')
+
     def last_modified(self):
         ''' Return last modified time as a UTC datetime object'''

@@ -1105,6 +1138,14 @@ class LibraryDatabase2(LibraryDatabase):
         if notify:
             self.notify('metadata', [id])

+    def get_tags(self, id):
+        result = self.conn.get(
+            'SELECT name FROM tags WHERE id IN (SELECT tag FROM books_tags_link WHERE book=?)',
+            (id,), all=True)
+        if not result:
+            return set([])
+        return set([r[0] for r in result])
+
     def set_tags(self, id, tags, append=False, notify=True):
         '''
         @param tags: list of strings

@@ -1113,7 +1154,8 @@ class LibraryDatabase2(LibraryDatabase):
         if not append:
             self.conn.execute('DELETE FROM books_tags_link WHERE book=?', (id,))
             self.conn.execute('DELETE FROM tags WHERE (SELECT COUNT(id) FROM books_tags_link WHERE tag=tags.id) < 1')
-        for tag in set(tags):
+        otags = self.get_tags(id)
+        for tag in (set(tags)-otags):
             tag = tag.strip()
             if not tag:
                 continue

@@ -1138,13 +1180,7 @@ class LibraryDatabase2(LibraryDatabase):
             self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)',
                               (id, tid))
         self.conn.commit()
-        try:
-            otags = [t.strip() for t in self.data[self.data.row(id)][FIELD_MAP['tags']].split(',')]
-        except AttributeError:
-            otags = []
-        if not append:
-            otags = []
-        tags = ','.join(otags+tags)
+        tags = ','.join(self.get_tags(id))
         self.data.set(id, FIELD_MAP['tags'], tags, row_is_id=True)
         if notify:
             self.notify('metadata', [id])
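Note: the rewritten tail of set_tags() now rebuilds the cached tags string from the database via the new get_tags(), and the loop above it only processes tags the book does not already have. A tiny sketch of that set arithmetic, with made-up values:

# Hypothetical values, purely to show which tags reach the INSERT loop.
otags = set(['Fiction', 'History'])   # what get_tags(id) returns for the book
tags = ['Fiction', 'Biography']       # what the caller passed in
print(set(tags) - otags)              # only 'Biography' is new, so only it is linked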

View File

@@ -73,6 +73,21 @@ class Concatenate(object):
             return self.ans[:-len(self.sep)]
         return self.ans

+class SortedConcatenate(object):
+    '''String concatenation aggregator for sqlite, sorted by supplied index'''
+    def __init__(self, sep=','):
+        self.sep = sep
+        self.ans = {}
+
+    def step(self, ndx, value):
+        if value is not None:
+            self.ans[ndx] = value
+
+    def finalize(self):
+        if len(self.ans) == 0:
+            return None
+        return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))
+
 class Connection(sqlite.Connection):

     def get(self, *args, **kw):

@@ -104,6 +119,7 @@ class DBThread(Thread):
                 detect_types=sqlite.PARSE_DECLTYPES|sqlite.PARSE_COLNAMES)
         self.conn.row_factory = sqlite.Row if self.row_factory else lambda cursor, row : list(row)
         self.conn.create_aggregate('concat', 1, Concatenate)
+        self.conn.create_aggregate('sortconcat', 2, SortedConcatenate)
         self.conn.create_function('title_sort', 1, title_sort)

     def run(self):
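Note: sortconcat is the aggregate that the upgrade_version_6 meta view above relies on to list authors in link-table order. A self-contained demo using the stdlib sqlite3 module (the table and column names are invented for the example; calibre registers the aggregate on its own DBThread connection as shown in the diff):

import sqlite3

class SortedConcatenate(object):
    # Repeated from the diff above so this snippet runs on its own.
    def __init__(self, sep=','):
        self.sep = sep
        self.ans = {}

    def step(self, ndx, value):
        if value is not None:
            self.ans[ndx] = value

    def finalize(self):
        if len(self.ans) == 0:
            return None
        return self.sep.join(map(self.ans.get, sorted(self.ans.keys())))

conn = sqlite3.connect(':memory:')
conn.create_aggregate('sortconcat', 2, SortedConcatenate)
conn.execute('CREATE TABLE link(id INTEGER, name TEXT)')
conn.execute("INSERT INTO link VALUES (2, 'Second Author')")
conn.execute("INSERT INTO link VALUES (1, 'First Author')")
print(conn.execute('SELECT sortconcat(id, name) FROM link').fetchone()[0])
# -> First Author,Second Author (ordered by the supplied index, not row order)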

View File

@@ -140,6 +140,11 @@ sudo calibre_postinstall
     </form>
     </div>
     <hr/>
+    <h3>Note</h3>
+    <p>
+    If your kernel is compiled with CONFIG_SYSFS_DEPRECATED device detection may not work.
+    </p>
+    <hr/>
     <h3>Dependencies</h3>
     ${app} has the following dependencies (the listed version is the minimum version)
     <br/><br/>

View File

@@ -50,6 +50,7 @@ recipe_modules = ['recipe_' + r for r in (
                   'marca', 'kellog_faculty', 'kellog_insight',
                   'theeconomictimes_india', '7dias', 'buenosaireseconomico',
                   'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
+                  'gva_be', 'hln', 'tijd', 'degentenaar',
                   )]

 import re, imp, inspect, time, os

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class DeGentenaarOnline(BasicNewsRecipe):
    title                 = 'De Gentenaar Online'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'De Gentenaar'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='span', attrs={'id':['lblArticleTitle','lblArticleIntroduction','lblArticleMainText']})]
    remove_tags = [dict(name=['embed','object'])]

    feeds = [
              (u'Snelnieuws' , u'http://feeds.nieuwsblad.be/nieuws/snelnieuws'    )
             ,(u'Binnenland' , u'http://feeds.nieuwsblad.be/nieuws/binnenland'    )
             ,(u'Buitenland' , u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland')
             ,(u'Economie'   , u'http://feeds.nieuwsblad.be/economie/home'        )
             ,(u'Economie'   , u'http://feeds.nieuwsblad.be/economie/home'        )
             ,(u'Algemeen'   , u'http://feeds.nieuwsblad.be/life/algemeen'        )
             ,(u'Film'       , u'http://feeds.nieuwsblad.be/life/film'            )
             ,(u'Boek'       , u'http://feeds.nieuwsblad.be/life/boeken'          )
             ,(u'Muziek'     , u'http://feeds.nieuwsblad.be/life/muziek'          )
             ,(u'Podium'     , u'http://feeds.nieuwsblad.be/life/podium'          )
             ,(u'TV & radio' , u'http://feeds.nieuwsblad.be/life/tv'              )
            ]

    def print_version(self, url):
        return url.replace('/Detail.aspx?articleid','/PrintArticle.aspx?ArticleID')

    def get_article_url(self, article):
        return article.get('guid', None)

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('span'):
            item.name='div'
            if item.has_key('id') and item['id'] == 'lblArticleTitle':
                item.name='h3'
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.gva.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class GazetvanAntwerpen(BasicNewsRecipe):
    title                 = 'Gazet van Antwerpen'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'Gazet van Antwerpen'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
    remove_tags = [
                     dict(name=['embed','object'])
                   , dict (name='div',attrs={'class':['note NotePortrait','note']})
                  ]
    remove_tags_after = dict(name='span', attrs={'class':'author'})

    feeds = [
              (u'Overzicht & Blikvanger', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/overview/overzicht'       )
             ,(u'Stad & Regio'          , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio'   )
             ,(u'Economie'              , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie'      )
             ,(u'Binnenland'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland'    )
             ,(u'Buitenland'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland'    )
             ,(u'Media & Cultur'        , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
             ,(u'Wetenschap'            , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
             ,(u'Sport'                 , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport'         )
            ]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.hln.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class HLN_be(BasicNewsRecipe):
    title                 = 'Het Belang Van Limburg'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'Het Belang Van Limburg'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'class':'art_box2'})]
    remove_tags = [
                     dict(name=['embed','object'])
                  ]

    feeds = [(u'Alle nieuws', u'http://www.hln.be/rss.xml')]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.tijd.be
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class DeTijd(BasicNewsRecipe):
    title                 = 'De Tijd'
    __author__            = 'Darko Miletic'
    description           = 'News from Belgium in Dutch'
    publisher             = 'De Tijd'
    category              = 'news, politics, Belgium'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = _('Dutch')
    lang                  = 'nl-BE'
    direction             = 'ltr'

    html2lrf_options = [
                          '--comment'  , description
                        , '--category' , category
                        , '--publisher', publisher
                        ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    keep_only_tags = [dict(name='div', attrs={'id':'lcol'})]
    remove_tags = [
                     dict(name=['embed','object'])
                   , dict (name='div',attrs={'id':'art_reactwrap'})
                  ]
    remove_tags_after = dict(name='div', attrs={'id':'art_author'})

    feeds = [
              (u'Volledig nieuwsaanbod', u'http://www.tijd.be/rss/nieuws.xml'       )
             ,(u'Markten'              , u'http://www.tijd.be/rss/markten.xml'      )
             ,(u'Ondernemingen'        , u'http://www.tijd.be/rss/ondernemingen.xml')
             ,(u'Chemie-Farma'         , u'http://www.tijd.be/rss/chemie_farma.xml' )
             ,(u'Consumptie'           , u'http://www.tijd.be/rss/consumptie.xml'   )
             ,(u'Diensten'             , u'http://www.tijd.be/rss/diensten.xml'     )
             ,(u'Energie'              , u'http://www.tijd.be/rss/energie.xml'      )
             ,(u'Financen'             , u'http://www.tijd.be/rss/financien.xml'    )
             ,(u'Industrie'            , u'http://www.tijd.be/rss/industrie.xml'    )
             ,(u'Media'                , u'http://www.tijd.be/rss/media_telecom.xml')
             ,(u'Technologie'          , u'http://www.tijd.be/rss/technologie.xml'  )
             ,(u'Economie & Financien' , u'http://www.tijd.be/rss/economie.xml'     )
             ,(u'Binnenland'           , u'http://www.tijd.be/rss/binnenland.xml'   )
             ,(u'Buitenland'           , u'http://www.tijd.be/rss/buitenland.xml'   )
             ,(u'De wijde wereld'      , u'http://www.tijd.be/rss/cultuur.xml'      )
            ]

    def preprocess_html(self, soup):
        del soup.body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        soup.html['lang'] = self.lang
        soup.html['dir' ] = self.direction
        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
        soup.head.insert(0,mlang)
        soup.head.insert(1,mcharset)
        return soup