Merge from trunk

This commit is contained in:
Charles Haley 2010-09-30 09:00:50 +01:00
commit fa9c23031e
11 changed files with 341 additions and 7202 deletions

File diff suppressed because it is too large. [Load Diff]

Before

Width:  |  Height:  |  Size: 209 KiB

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 13 KiB

View File

@@ -12,15 +12,18 @@ class PeterSchiff(BasicNewsRecipe):
description = 'Economic commentary'
publisher = 'Euro Pacific capital'
category = 'news, politics, economy, USA'
oldest_article = 15
oldest_article = 25
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
encoding = 'utf8'
use_embedded_content = False
language = 'en'
country = 'US'
remove_empty_feeds = True
extra_css = ' body{font-family: Verdana,Times,serif } h1{text-align: left} img{margin-bottom: 0.4em} '
extra_css = """
body{font-family: Verdana,Times,serif }
.field-field-commentary-writer-name{font-weight: bold}
.field-items{display: inline}
"""
conversion_options = {
'comment' : description
@@ -30,7 +33,15 @@ class PeterSchiff(BasicNewsRecipe):
, 'linearize_tables' : True
}
keep_only_tags = [dict(name='tr',attrs={'style':'vertical-align: top;'})]
keep_only_tags = [
dict(name='h2',attrs={'id':'page-title'})
,dict(name='div',attrs={'class':'node'})
]
remove_tags = [
dict(name=['meta','link','base','iframe','embed'])
,dict(attrs={'id':'text-zoom'})
]
remove_attributes=['track','linktype','lang']
feeds = [(u'Articles', u'http://feeds.feedburner.com/PeterSchiffsEconomicCommentary')]

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
rmf24.pl
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class RMF24_opinie(BasicNewsRecipe):
    '''Recipe for the opinion section of rmf24.pl: blogs, interviews and
    commentary feeds, with audio-only entries skipped and page furniture
    stripped from the article bodies.'''

    title = u'Rmf24.pl - Opinie'
    description = u'Blogi, wywiady i komentarze ze strony rmf24.pl'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = u'Tomasz D\u0142ugosz'
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'Blogi', u'http://www.rmf24.pl/opinie/blogi/feed'),
        (u'Kontrwywiad', u'http://www.rmf24.pl/opinie/wywiady/kontrwywiad/feed'),
        (u'Przes\u0142uchanie', u'http://www.rmf24.pl/opinie/wywiady/przesluchanie/feed'),
        (u'Komentarze', u'http://www.rmf24.pl/opinie/komentarze/feed')]

    # Every article variant lives in one of these three container divs.
    keep_only_tags = [
        dict(name='div', attrs={'class': klass})
        for klass in ('box articleSingle print',
                      'box articleSingle print singleCommentary',
                      'box articleSingle print blogSingleEntry')]

    # Page furniture removed from the kept containers; 'REMOVE' is the
    # marker class injected by preprocess_regexps below.
    remove_tags = [
        dict(name='div', attrs={'class': klass})
        for klass in ('toTop', 'category', 'REMOVE', 'embed embedAd')]

    extra_css = '''
        h1 { font-size: 1.2em; }
    '''

    # thanks to Kovid Goyal
    def get_article_url(self, article):
        # Returning None drops the entry, so audio-only items are skipped.
        link = article.get('link')
        return None if 'audio' in link else link

    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), substitution)
        for pattern, substitution in [
            # Drop the "Zdjecie" (photo) heading.
            (r'<h2>Zdj.cie</h2>', lambda match: ''),
            # Re-tag embedded media containers so remove_tags can delete them.
            (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">',
             lambda match: 'REMOVE">'),
            # Strip the trailing Facebook promo link.
            (r'<a href="http://www.facebook.com/pages/RMF24pl/.*?>RMF24.pl</a> on Facebook</div>',
             lambda match: '</div>')
        ]
    ]

View File

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.7.905'
__version__ = '0.7.906'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
import re

View File

@@ -47,29 +47,43 @@ class ISBNDBMetadata(Metadata):
def __init__(self, book):
    '''Populate metadata fields from one ISBNDB book record.

    :param book: a BeautifulSoup element for a single book entry as
                 returned by the isbndb.com XML API.
    '''
    Metadata.__init__(self, None, [])

    def tostring(e):
        # Robust text extraction: return the element's text as a stripped
        # unicode string, or None when the element is missing, has no
        # usable .string, or contains only whitespace.
        if not hasattr(e, 'string'):
            return None
        ans = e.string
        if ans is not None:
            ans = unicode(ans).strip()
        if not ans:
            ans = None
        return ans

    # Prefer the 13-digit ISBN, falling back to the 10-digit one.
    self.isbn = unicode(book.get('isbn13', book.get('isbn')))

    self.title = tostring(book.find('titlelong'))
    if not self.title:
        self.title = tostring(book.find('title'))
        if not self.title:
            self.title = _('Unknown')

    self.authors = []
    au = tostring(book.find('authorstext'))
    if au:
        au = au.strip()
        # Authors are comma-separated; each entry may itself contain
        # '&amp;'-separated names.
        temp = au.split(',')
        for au in temp:
            if not au: continue
            self.authors.extend([a.strip() for a in au.split('&amp;')])

    try:
        self.author_sort = tostring(book.find('authors').find('person'))
        if self.authors and self.author_sort == self.authors[0]:
            # A sort value identical to the first author adds nothing.
            self.author_sort = None
    except:
        # <authors>/<person> may be absent entirely; best effort only.
        pass

    self.publisher = tostring(book.find('publishertext'))

    summ = tostring(book.find('summary'))
    if summ:
        # BUG FIX: tostring() already returns a plain unicode string, so
        # the previous 'summ.string' raised AttributeError whenever a
        # summary was present. Concatenate the string itself.
        self.comments = 'SUMMARY:\n' + summ
def build_isbn(base_url, opts):

View File

@@ -12,6 +12,7 @@ import mechanize
from calibre import browser, prints
from calibre.utils.config import OptionParser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import strip_encoding_declarations
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
@@ -110,6 +111,8 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
+isbn).read()
if not raw:
return mi
raw = raw.decode('utf-8', 'replace')
raw = strip_encoding_declarations(raw)
root = html.fromstring(raw)
h1 = root.xpath('//div[@class="headsummary"]/h1')
if h1 and not mi.title:

View File

@@ -19,6 +19,7 @@ from calibre import prints
from calibre.constants import DEBUG
class Worker(Thread):
'Cover downloader'
def __init__(self):
Thread.__init__(self)
@@ -88,7 +89,7 @@ class DownloadMetadata(Thread):
if mi.isbn:
args['isbn'] = mi.isbn
else:
if not mi.title or mi.title == _('Unknown'):
if mi.is_null('title'):
self.failures[id] = \
(str(id), _('Book has neither title nor ISBN'))
continue

View File

@@ -579,6 +579,8 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, # {{{
except KeyboardInterrupt:
pass
time.sleep(2)
if mb is not None:
mb.flush()
self.hide_windows()
return True

View File

@@ -40,12 +40,14 @@ class MetadataBackup(Thread): # {{{
self.get_metadata_for_dump = FunctionDispatcher(db.get_metadata_for_dump)
self.clear_dirtied = FunctionDispatcher(db.clear_dirtied)
self.set_dirtied = FunctionDispatcher(db.dirtied)
self.in_limbo = None
def stop(self):
self.keep_running = False
def run(self):
while self.keep_running:
self.in_limbo = None
try:
time.sleep(0.5) # Limit to two per second
id_ = self.db.dirtied_queue.get(True, 1.45)
@@ -72,6 +74,7 @@ class MetadataBackup(Thread): # {{{
if mi is None:
continue
self.in_limbo = id_
# Give the GUI thread a chance to do something. Python threads don't
# have priorities, so this thread would naturally keep the processor
@@ -98,6 +101,15 @@ class MetadataBackup(Thread): # {{{
prints('Failed to write backup metadata for id:', id_,
'again, giving up')
continue
self.in_limbo = None
def flush(self):
    '''Used during shutdown to ensure that a dirtied book is not missed.

    If the backup thread was stopped while a record was being written out
    (self.in_limbo holds that book id), re-register the book as dirty in
    the database so its metadata backup is retried on the next run.
    '''
    if self.in_limbo is not None:
        try:
            self.db.dirtied([self.in_limbo])
        except:
            # Best effort during shutdown: report the failure but never
            # block application exit.
            traceback.print_exc()
def write(self, path, raw):
with open(path, 'wb') as f:

View File

@@ -348,10 +348,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
setattr(self, 'title_sort', functools.partial(self.get_property,
loc=self.FIELD_MAP['sort']))
self.dirtied_cache = set()
d = self.conn.get('SELECT book FROM metadata_dirtied', all=True)
for x in d:
self.dirtied_queue.put(x[0])
self.dirtied_cache = set([x[0] for x in d])
self.refresh_ondevice = functools.partial(self.data.refresh_ondevice, self)
self.refresh()
@@ -616,9 +616,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.conn.commit()
def dirtied(self, book_ids, commit=True):
for book in book_ids:
if book in self.dirtied_cache:
continue
for book in frozenset(book_ids) - self.dirtied_cache:
try:
self.conn.execute(
'INSERT INTO metadata_dirtied (book) VALUES (?)',