Sync to trunk.

2025-07-08 02:34:06 -04:00 · 2010-01-17 09:57:27 -05:00 · 2010-01-17 09:57:27 -05:00 · 2b2659d955
commit 2b2659d955
parent f8d78a167d e12253ff15
10 changed files with 315 additions and 93 deletions
--- a/resources/images/news/joop.png
+++ b/resources/images/news/joop.png
--- a/resources/images/news/nrcnext.png
+++ b/resources/images/news/nrcnext.png
--- a/resources/quick_start.epub
+++ b/resources/quick_start.epub
--- a/resources/recipes/fokkeensukke.recipe
+++ b/resources/recipes/fokkeensukke.recipe
@ -1,23 +1,29 @@
 #!/usr/bin/python
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 class FokkeEnSukkeRecipe(BasicNewsRecipe) :
    __license__   = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
-    description = u'Popular Dutch daily cartoon Fokke en Sukke'
+    country = 'NL'
    version = 2
    title = u'Fokke en Sukke'
-    no_stylesheets = True
+    publisher = u'Reid, Geleijnse & Van Tol'
-    # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
+    category = u'News, Cartoons'
-    # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
+    description = u'Popular Dutch daily cartoon Fokke en Sukke'
    template_css = ''
    INDEX = u'http://foksuk.nl'
-    # This cover is not as nice as it could be, needs some work
+    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
-    #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
+
    no_stylesheets = True
    extra_css = '''
                    body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
                    div.title {text-align: center; margin-bottom: 1em;}
                    '''
    INDEX = u'http://foksuk.nl'
    cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
    keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]
@ -31,15 +37,14 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
        links = index.findAll('a')
        maxIndex = len(links) - 1
        articles = []
-        for i in range(len(links)) :
+        for i in range(1, len(links)) :
-            # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
+            # There can be more than one cartoon for a given day (currently either one or two).
-            if i == 0 :
+            # If there's only one, there is just a link with the dayname.
-                continue
+            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
-
+            # In that case we're interested in the last two.
            # There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're interested in the last two.
            if links[i].renderContents() in dayNames :
-                # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
+                # If the link is not in daynames, we processed it already, but if it is, let's see
                # if the next one has '1' as content
                if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
                    # Got you! Add it to the list
                    article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url'  : self.INDEX + links[i + 1]['href'], 'description' : ''}
@ -59,29 +64,31 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
        return [[week, articles]]
    def preprocess_html(self, soup) :
        # This method is called for every page, be it cartoon or TOC. We need to process each in their own way
        cartoon = soup.find('div', attrs={'class' : 'cartoon'})
-        if cartoon :
+
            # It is a cartoon. Extract the title.
        title = ''
        img = soup.find('img', attrs = {'alt' : True})
        if img :
            title = img['alt']
-            # Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
+        tag = Tag(soup, 'div', [('class', 'title')])
            tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
        tag.insert(0, title)
        cartoon.insert(0, tag)
-            # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
+        # We only want the cartoon, so throw out the index
            # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
        select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
        if select :
            select.extract()
-            return cartoon
+        freshSoup = self.getFreshSoup(soup)
-        else :
+        freshSoup.body.append(cartoon)
-            # It is a TOC. Just return the whole lot.
+
-            return soup
+        return freshSoup
    def getFreshSoup(self, oldSoup):
        '''
        Build an empty HTML skeleton that carries over the title of oldSoup.

        oldSoup is the parsed page whose <title> text should be kept.
        Returns a new BeautifulSoup document with an empty <body>; its <title>
        is filled with the text of oldSoup's title when there is one.
        '''
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        # A page without a <head> gives None for oldSoup.head, and asking None
        # for .title raises AttributeError, so look at the head first.
        oldHead = oldSoup.head
        oldTitle = oldHead.title if oldHead is not None else None
        if oldTitle:
            freshSoup.head.title.append(self.tag_to_string(oldTitle))
        return freshSoup
--- a/resources/recipes/joop.recipe
+++ b/resources/recipes/joop.recipe
@ -0,0 +1,91 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import Tag
 import re
 class JoopRecipe(BasicNewsRecipe):
    '''
    Recipe for the Dutch political blog Joop (http://www.joop.nl).

    The article index is read from the section link lists in the footer of the
    front page. Every article page is then reduced to byline, date and body.
    '''
    __license__  = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 1
    title = u'Joop'
    publisher = u'Vara'
    category = u'News, Politics, Discussion'
    description = u'Political blog from the Netherlands'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    # Only the author header, the date header and the article body are kept.
    keep_only_tags = [
        dict(name = 'div', attrs = {'class': 'author_head clearfix photo'}),
        dict(name = 'h2', attrs = {'class': 'columnhead smallline'}),
        dict(name = 'div', attrs = {'class': re.compile('article.*')}),
    ]
    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
                img {margin-right: 0.4em;}
                h3 {font-size: medium; font-style: italic; font-weight: normal;}
                h2 {font-size: xx-large; font-weight: bold}
                sub {color: #666666; font-size: x-small; font-weight: normal;}
                div.joop_byline {font-size: large}
                div.joop_byline_job {font-size: small; color: #696969;}
                div.joop_date {font-size: x-small; font-style: italic; margin-top: 0.6em}
                '''
    INDEX = 'http://www.joop.nl'
    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher}

    def parse_index(self):
        '''
        Build the list of (section title, articles) tuples from the front page.

        Every section in the fixed list below is looked up as an <h2> inside
        the div with id 'footer'; the links of the 'linklist' <ul> that follows
        it become the articles of that section. A section that cannot be found
        is returned with an empty article list.
        '''
        sections = ['Politiek', 'Wereld', 'Economie', 'Groen', 'Media', 'Leven', 'Show', 'Opinies']
        soup = self.index_to_soup(self.INDEX)
        footer = soup.find('div', attrs = {'id': 'footer'})
        answer = []
        for section in sections:
            articles = []
            h2 = None
            # Without a footer there is nothing to search. Treat every section
            # as missing instead of failing on None.find().
            if footer is not None:
                h2 = footer.find(lambda tag: tag.name == 'h2' and tag.renderContents() == section)
            ul = h2.findNextSibling('ul', 'linklist') if h2 else None
            if ul:
                for li in ul.findAll('li'):
                    a = li.a
                    # Skip list items that carry no usable link.
                    if a is None or not a.get('href'):
                        continue
                    title = self.tag_to_string(a)
                    url = self.INDEX + a['href']
                    articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
            answer.append((section, articles))
        return answer

    def preprocess_html(self, soup):
        '''
        Rewrite the byline and date headers of a page as styled divs.

        Returns the same soup object after modification.
        '''
        div = soup.find('div', 'author_head clearfix photo')
        if div:
            # NOTE(review): this takes the first <h2> of the whole page, not of
            # the author div - confirm that is what is intended.
            h2 = soup.find('h2')
            if h2:
                h2.name = 'div'
                h2['class'] = 'joop_byline'
                span = h2.find('span')
                if span:
                    span.name = 'div'
                    span['class'] = 'joop_byline_job'
                div.replaceWith(h2)
        h2 = soup.find('h2', attrs = {'class': 'columnhead smallline'})
        if h2:
            txt = None
            span = h2.find('span', 'info')
            if span:
                txt = span.find(text = True)
            # Only swap the header for a date div when date text was found.
            # Appending None to a Tag is an error, so the header is left as it
            # is in that case.
            if txt is not None:
                div = Tag(soup, 'div', attrs = [('class', 'joop_date')])
                div.append(txt)
                h2.replaceWith(div)
        return soup
--- a/resources/recipes/ledevoir.recipe
+++ b/resources/recipes/ledevoir.recipe
@ -0,0 +1,80 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini'
 __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.01'
 __date__        = '14, January 2010'
 __description__   = 'Canadian Paper '
 '''
 http://www.ledevoir.com/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class ledevoir(BasicNewsRecipe):
    '''
    Recipe for the newspaper Le Devoir (http://www.ledevoir.com/), built from
    the RSS feeds listed in `feeds`.
    '''

    # Descriptive metadata.
    title = u'Le Devoir'
    author = 'Lorenzo Vigentini'
    description = 'Canadian Paper'
    publisher = 'leDevoir.com'
    category = 'News, finance, economy, politics'
    language = 'fr'
    cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'

    # Download settings.
    encoding = 'utf-8'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 50
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    no_stylesheets = True

    # Parts of the page that are kept.
    keep_only_tags = [
        dict(name='div', attrs={'id':'article'}),
        dict(name='ul', attrs={'id':'ariane'}),
    ]

    # Parts that are thrown away again from what was kept.
    remove_tags = [
        dict(name='div', attrs={'id':'dialog'}),
        dict(name='div', attrs={'class':['interesse_actions','reactions']}),
        dict(name='ul', attrs={'class':'mots_cles'}),
        dict(name='a', attrs={'class':'haut'}),
        dict(name='h5', attrs={'class':'interesse_actions'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
        (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
        (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
        (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
        (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
        (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'),
        (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'),
        (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'),
        (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
        (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
        (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
        (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50'),
    ]

    # Styling applied to the generated pages.
    extra_css = '''
                h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;}
                h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;}
                h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
                h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
                h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
                .specs {line-height:1em;margin:1px 0;}
                .specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
                .specs span.auteur a,
                .specs span.auteur span {text-transform:uppercase;color:#787878;}
                .specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
                ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;}
                ul#ariane li {display:inline;}
                ul#ariane a {color:#2E2E2E;text-decoration:underline;}
                .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
                .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
                '''
--- a/resources/recipes/ncrnext.recipe
+++ b/resources/recipes/ncrnext.recipe
@ -1,29 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 class NrcNextRecipe(BasicNewsRecipe):
    __license__  = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'nl'
-    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
+    country = 'NL'
    version = 2
    title = u'nrcnext'
    publisher = u'NRC Media'
    category = u'News, Opinion, the Netherlands'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
    no_stylesheets = True
-    template_css = ''
+    remove_javascript = True
    # I want to do some special processing on the articles. I could not solve it with the 'extra_css' property . So we do it the hard way.
    keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
    # If that's overkill for you comment out the previous line and uncomment the next. Then get rid of the preprocess_html() method.
    #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]
-    remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
+    remove_tags = []
-                          dict(name = 'div', attrs = {'class' : 'datumlabel'}),
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'meta'}))
-                          dict(name = 'ul', attrs = {'class' : 'cats single'}),
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'datumlabel'}))
-                          dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
+    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats single'}))
-                          dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
+    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}))
    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats rubrieken'}))
-    use_embedded_content = False
+    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
                p.wp-caption-text {font-size: x-small; color: #666666;}
                h2.sub_title {font-size: medium; color: #696969;}
                h2.vlag {font-size: small; font-weight: bold;}
                '''
    def parse_index(self) :
        # Use the website as an index. Their RSS feeds can be out of date.
@ -44,10 +53,11 @@ class NrcNextRecipe(BasicNewsRecipe):
                # Find the links to the actual articles and remember the location they're pointing to and the title
                a = post.find('a', attrs={'rel' : 'bookmark'})
                href = a['href']
-                title = a.renderContents()
+                title = self.tag_to_string(a)
                if index == 'columnisten' :
-                    # In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
+                    # In this feed/page articles can be written by more than one author.
                    # It is nice to see their names in the titles.
                    flag = post.find('h2', attrs = {'class' : 'vlag'})
                    author = flag.contents[0].renderContents()
                    completeTitle = u''.join([author, u': ', title])
@ -71,44 +81,46 @@ class NrcNextRecipe(BasicNewsRecipe):
        return answer
    def preprocess_html(self, soup) :
        # This method is called for every page, be it article or TOC. We need to process each in their own way
        if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}):
            # It's an article, find the interesting part
            tag = soup.find('div', attrs = {'class' : 'post'})
            if tag:
-                # And replace any links with their text, so they don't show up underlined on my reader.
+                h2 = tag.find('h2', 'vlag')
-                for link in tag.findAll('a') :
+                if h2:
-                    link.replaceWith(link.renderContents())
+                    new_h2 = Tag(soup, 'h2', attrs = [('class', 'vlag')])
                    new_h2.append(self.tag_to_string(h2))
                    h2.replaceWith(new_h2)
                else:
                    h2 = tag.find('h2')
                    if h2:
                        new_h2 = Tag(soup, 'h2', attrs = [('class', 'sub_title')])
                        new_h2.append(self.tag_to_string(h2))
                        h2.replaceWith(new_h2)
-                # Slows down my Sony reader; feel free to comment out
+                h1 = tag.find('h1')
                if h1:
                    new_h1 = Tag(soup, 'h1')
                    new_h1.append(self.tag_to_string(h1))
                    h1.replaceWith(new_h1)
                # Slows down my reader.
                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}):
                    movie.extract()
                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}):
                    movie.extract()
                for iframe in tag.findAll('iframe') :
                    iframe.extract()
-                homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+                fresh_soup = self.getFreshSoup(soup)
-                body = homeMadeSoup.find('body')
+                fresh_soup.body.append(tag)
                body.append(tag)
-                return homeMadeSoup
+                return fresh_soup
            else:
                # This should never happen and other famous last words...
                return soup
        else :
            # It's a TOC, return the whole lot.
            return soup
    def postproces_html(self, soup) :
        '''
        Remove images and iframes whose src still points at an http:// URL.

        Returns the same soup object after the matching tags were extracted.

        NOTE(review): the name is spelled 'postproces_html'. If this is meant to
        override the post-processing hook of BasicNewsRecipe, the spelling and
        the signature of that hook should be checked - confirm whether this
        method is ever invoked.
        '''
        # Images like this should not happen, but they do and slow down the reader.
        # The iframes are there for some movies which we are not able to view anyway.
        for name in ('img', 'iframe') :
            for tag in soup.findAll(name) :
                # A tag without a src attribute is left alone; indexing
                # tag['src'] on it would raise KeyError.
                src = tag.get('src')
                if src and src.startswith('http://') :
                    tag.extract()
        return soup
    def getFreshSoup(self, oldSoup):
        '''
        Create a minimal HTML document that reuses the title of oldSoup.

        oldSoup is the parsed page the title text is taken from.
        Returns a new BeautifulSoup document with an empty <body>; the <title>
        gets the text of oldSoup's title if that page has one.
        '''
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        # oldSoup.head is None for a page without a <head>; reading .title from
        # None raises AttributeError, so the head is checked before the title.
        oldHead = oldSoup.head
        oldTitle = oldHead.title if oldHead is not None else None
        if oldTitle:
            freshSoup.head.title.append(self.tag_to_string(oldTitle))
        return freshSoup
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@ -10,11 +10,12 @@ from PyQt4.QtGui import QFileDialog, QMessageBox, QPixmap, QFileIconProvider, \
 ORG_NAME = 'KovidsBrain'
 APP_UID  = 'libprs500'
 from calibre import islinux, iswindows, isosx
-from calibre.utils.config import Config, ConfigProxy, dynamic
+from calibre.utils.config import Config, ConfigProxy, dynamic, JSONConfig
 from calibre.utils.localization import set_qt_translator
 from calibre.ebooks.metadata.meta import get_metadata, metadata_from_formats
 from calibre.ebooks.metadata import MetaInformation
 gprefs = JSONConfig('gui')
 NONE = QVariant() #: Null value to return from the data function of item models
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@ -31,7 +31,7 @@ from calibre.utils.ipc.server import Server
 from calibre.gui2 import warning_dialog, choose_files, error_dialog, \
                            question_dialog,\
                           pixmap_to_data, choose_dir, \
-                           Dispatcher, \
+                           Dispatcher, gprefs, \
                           available_height, \
                           max_available_height, config, info_dialog, \
                           available_width, GetMetadata
@ -518,7 +518,21 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
        self.connect(self.library_view.model(), SIGNAL('count_changed(int)'),
                     self.tags_view.recount)
        self.connect(self.search, SIGNAL('cleared()'), self.tags_view.clear)
        if not gprefs.get('quick_start_guide_added', False):
            from calibre.ebooks.metadata import MetaInformation
            mi = MetaInformation(_('Calibre Quick Start Guide'), ['John Schember'])
            mi.author_sort = 'Schember, John'
            mi.comments = "A guide to get you up an running with calibre"
            mi.publisher = 'calibre'
            self.library_view.model().add_books([P('quick_start.epub')], ['epub'],
                    [mi])
            gprefs['quick_start_guide_added'] = True
            self.library_view.model().books_added(1)
            if hasattr(self, 'db_images'):
                self.db_images.reset()
        self.library_view.model().count_changed()
        ########################### Cover Flow ################################
        self.cover_flow = None
        if CoverFlow is not None:
@ -1008,7 +1022,6 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
            return
        self._add_books(books, to_device)
    def _add_books(self, paths, to_device, on_card=None):
        if on_card is None:
            on_card = 'carda' if self.stack.currentIndex() == 2 else 'cardb' if self.stack.currentIndex() == 3 else None
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 '''
 Manage application-wide preferences.
 '''
-import os, re, cPickle, textwrap, traceback, plistlib
+import os, re, cPickle, textwrap, traceback, plistlib, json
 from copy import deepcopy
 from functools import partial
 from optparse import OptionParser as _OptionParser
@ -564,23 +564,31 @@ class XMLConfig(dict):
    data types.
    '''
    EXTENSION = '.plist'
    def __init__(self, rel_path_to_cf_file):
        dict.__init__(self)
        self.file_path = os.path.join(config_dir,
                *(rel_path_to_cf_file.split('/')))
        self.file_path = os.path.abspath(self.file_path)
-        if not self.file_path.endswith('.plist'):
+        if not self.file_path.endswith(self.EXTENSION):
-            self.file_path += '.plist'
+            self.file_path += self.EXTENSION
        self.refresh()
    def raw_to_object(self, raw):
        '''
        Convert the raw contents read from the config file into an object.

        This implementation parses raw as a plist string. refresh() calls it
        for non-empty file contents; JSONConfig overrides it to read JSON.
        '''
        return plistlib.readPlistFromString(raw)
    def to_raw(self):
        '''
        Serialize this mapping to the string that is written to the config file.

        This implementation writes a plist string; JSONConfig overrides it to
        write JSON instead.
        '''
        return plistlib.writePlistToString(self)
    def refresh(self):
        d = {}
        if os.path.exists(self.file_path):
            with ExclusiveFile(self.file_path) as f:
                raw = f.read()
                try:
-                    d = plistlib.readPlistFromString(raw) if raw.strip() else {}
+                    d = self.raw_to_object(raw) if raw.strip() else {}
                except SystemError:
                    pass
                except:
@ -618,11 +626,21 @@ class XMLConfig(dict):
            if not os.path.exists(dpath):
                os.makedirs(dpath, mode=CONFIG_DIR_MODE)
            with ExclusiveFile(self.file_path) as f:
-                raw = plistlib.writePlistToString(self)
+                raw = self.to_raw()
                f.seek(0)
                f.truncate()
                f.write(raw)
 class JSONConfig(XMLConfig):
    '''
    Variant of XMLConfig that reads and writes its data as JSON and uses the
    '.json' file extension instead of '.plist'.
    '''

    EXTENSION = '.json'

    def raw_to_object(self, raw):
        # raw is decoded as UTF-8 first, the resulting text is parsed as JSON.
        text = raw.decode('utf-8')
        return json.loads(text)

    def to_raw(self):
        # The mapping itself is dumped, pretty-printed with an indent of two.
        serialized = json.dumps(self, indent=2)
        return serialized
 def _prefs():
    c = Config('global', 'calibre wide preferences')