Pull from trunk

2025-07-09 03:04:10 -04:00 · 2010-09-25 23:43:22 -06:00 · 2010-09-25 23:43:22 -06:00 · 9a0f97c6aa
commit 9a0f97c6aa
parent 37eadd1c10 2dd9692a5e
16 changed files with 684 additions and 416 deletions
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -4,6 +4,99 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.
 - version: 0.7.20
  date: 2010-09-24
  new features:
    - title: "Tweak epub feature."
      type: major
      description: >
        "Now you can conveniently browse the contents of an epub, tweak them and rebuild the epub within your calibre library 
         by right clicking on the book and selecting Tweak ePub. See http://www.mobileread.com/forums/showthread.php?t=99875 
         for details."
    - title: "Add button to Edit metadata dialog to trim borders from the cover"
    - title: "Kobo driver: Add support for setting the ReadStatus to Read and correctly deal with empty collections"
    - title: "Improved algorithm for removal of hyphens during pre-processing"
    - title: "EPUB metadata: Don't read timestamp value from epubs as I am sick of closing bugs about adding books and having the Date not be today."
    - title: "After bulk edit metadata, reselect previously selected books."
  bug fixes:
    - title: "Fix regression in 0.7.19 that broke the By Author and By Title category listing in Stanza/Aldiko feeds."
    - title: "MOBI Output: Fix regression that broke sections list in downloaded periodicals on Kindle for non-english news sources"
    - title: "News download: Rationalize cover processing."
      tickets: [6852]
    - title: "Cover cache: load images only in the GUI thread to prevent stale files being leftover by set_path due to Windows file locking"
    - title: "Database: Make renaming of folders on case change more robust"
      tickets: [6914]
    - title: "When adding/replacing files to/in EPUB files, set the GPF bit for all files in the archive, to prevent unzip from complaining in linux"
      tickets: [6363]
    - title: "Plugin loading: Handle encoding declarations in .py files correctly"
    - title: "MOBI input: Another corner case"
      tickets: [6909]
    - title: "IPC: Store results file in the calibre temp dir and also don't die if for some reason removing result file fails. Should make adding/saving more robust"
    - title: "Database: Fix regression that caused has_cover to create empty directories unnecessarily"
    - title: "Detection of Alex on unix"
      tickets: [5900]
    - title: "News download: Don't add inline table of contents when downloading news for the Kindle"
    - title: "Add prologue and epilogue to default chapter detection regex"
    - title: "Kobo driver: Fix issue where books that are read were getting their status reset to Unread"
    - title: "Device drivers: Fix occasional false positive when matching books on device with books in the calibre library"
    - title: "Content server: Make serving of large files more efficient."
    - title: "GUI device detection: Handle case when user yanks connected device before device connection handler is called."
      tickets: [6864]
    - title: "Strip leading/trailing whitespace when setting metadata using the edit metadata dialog"
      tickets: [6854]
    - title: "KOBO: Editing the Im_Reading list with SD Card installed fixed"
      tickets: [6850]
  new recipes:
    - title: "Neal's Nuze and Popular Science"
      author: Tony Stegall
    - title: "Rmf24.pl"
      author: "Tomasz Dlugosz"
    - title: "Gazeta Pomorska"
      author: "Richard"
    - title: "Le Journal de Montreal and superesportes"
      author: "Luciano Furtado"
    - title: "The Marker"
      author: Marbs
    - title: "Tagesanzeiger"
      author: noxxx
  improved recipes:
    - Danas
    - Harvard Business Review
 - version: 0.7.19
  date: 2010-09-17
@ -61,6 +154,7 @@
    - title: "PDB Input: Fix bug in conversion of TOC in some PML files"
  new recipes:
    - title: "taz.de RSS"
      author: Alexander Schremmer
@ -272,7 +366,7 @@
  new features:
    - title: "Multiple library support: Various improvements to make using multiple calibre libraries easier."
      type: major
-      desc: >
+      description: >
        "Now, when you switch libraries using the Choose Library button on the toolbar, entries are created in the menu of that button to easily switch to that library in the 
        future. Also, you can now right click on a book in the calibre library and use the 'Copy to library' action to copy the book to another library,
        that you have switched to at least once. The name of the current library is shown in the titlebar.
@ -280,7 +374,7 @@
    - title: "Content server: Allow setting a restriction so that the server shares only some of the books in the library."
      type: major
-      desc: >
+      description: >
        "You can now use a Saved Search as a restriction for the content server, via Preferences->Content Server. This will cause the
        server to share only those books that match the saved search.
        "
--- a/resources/content_server/gui.js
+++ b/resources/content_server/gui.js
@ -54,7 +54,7 @@ function render_book(book) {
    formats = book.attr("formats").split(",");
    if (formats.length > 0) {
        for (i=0; i < formats.length; i++) {
-            title += '<a title="Download in '+formats[i]+' format" class="format" href="'+format_url(formats[i], id, book.attr("title"))+'">'+formats[i]+'</a>, ';
+            title += '<a title="Download in '+formats[i]+' format" class="format" href="'+format_url(formats[i], id, book.attr("safe_title"))+'">'+formats[i]+'</a>, ';
        }
        title = title.slice(0, title.length-2);
        title += '&nbsp;({0}&nbsp;MB)&nbsp;'.format(size);
--- a/resources/images/news/howtogeek.png
+++ b/resources/images/news/howtogeek.png
--- a/resources/images/news/jpost_fr.png
+++ b/resources/images/news/jpost_fr.png
--- a/resources/recipes/boortz.recipe
+++ b/resources/recipes/boortz.recipe
@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    title = 'Nealz Nuze'
    language = 'en'
@ -6,16 +7,16 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    description = 'Neal Boortz Show Radio Notes'
    publisher = 'Neal Boortz'
    category = 'news, politics, USA, talkshow'
-    oldest_article = 1
+    oldest_article = 2
    max_articles_per_feed = 100
    linearize_tables = True
    no_stylesheets = True
    remove_javascript   = True
-
+    
    masthead_url = 'http://boortz.com/images/nuze_logo.gif'
    keep_only_tags    = [
-                         dict(name='div', attrs={'id':['SiteContent']})
+                         dict(name='td', attrs={'id':['contentWellCell']})
-                       #,dict(attrs={'id':['cxArticleText']})
+                     
                        ]
    remove_tags = [
                   dict(name='a', attrs={'class':['blogPermalink']}),
@ -25,13 +26,13 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    remove_tags_after = [dict(name='div', attrs={'class':'blogEntryBody'}),]
    feeds          = [
                      ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
-
+                      
                    ]
-
+    
-
+
-
+
-
+    
--- a/resources/recipes/howtogeek.recipe
+++ b/resources/recipes/howtogeek.recipe
@ -0,0 +1,40 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    title = 'How To Geek'
    language = 'en'
    __author__ = 'TonytheBookworm'
    description = 'Daily Computer Tips and Tricks'
    publisher = 'Howtogeek'
    category = 'PC,tips,tricks'
    oldest_article = 2
    max_articles_per_feed = 100
    linearize_tables = True
    no_stylesheets = True
    remove_javascript   = True
    masthead_url = 'http://blog.stackoverflow.com/wp-content/uploads/how-to-geek-logo.png'
    remove_tags =[dict(name='a', attrs={'target':['_blank']}),
                  dict(name='table', attrs={'id':['articleTable']}),
                  dict(name='div',   attrs={'class':['feedflare']}),
                  ]
    feeds          = [
                      ('Tips', 'http://feeds.howtogeek.com/howtogeek')
                    ]
--- a/resources/recipes/jpost_fr.recipe
+++ b/resources/recipes/jpost_fr.recipe
@ -0,0 +1,57 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class JerusalemPost(BasicNewsRecipe):
    title = 'Jerusalem post'
    language = 'fr'
    __author__ = 'TonytheBookworm'
    description = 'The Jerusalem Post (in French)'
    publisher = 'jpost'
    category = 'news'
    oldest_article = 30
    max_articles_per_feed = 100
    linearize_tables = True
    no_stylesheets = True
    remove_javascript   = True
    masthead_url = 'http://static.jpost.com/JPSITES/images/JFrench/2008/site/jplogo.JFrench.gif'
    remove_tags = [
                   dict(name='a', attrs={'href':['javascript:window.print()']}),
                   dict(name='div', attrs={'class':['bot']}),
                   ]
    feeds          = [
                      ('NEWS', 'http://fr.jpost.com/servlet/Satellite?collId=1216805762036&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench En route vers la paix', 'http://fr.jpost.com/servlet/Satellite?collId=1216805762201&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Politique', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737334&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Securite', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737338&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Moyen Orient', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737342&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Diplomatie / Monde', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737346&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Economie / Sciences', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737358&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Societe', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737354&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Opinions', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737350&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Monde juif', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737366&pagename=JFrench%2FPage%2FRSS'),
                      ('JFrench Culture / Sport', 'http://fr.jpost.com/servlet/Satellite?collId=1215356737362&pagename=JFrench%2FPage%2FRSS')
                    ]
    def print_version(self, url):
        split1 = url.split("cid=")
        #for testing only -------
        #print 'SPLIT IS: ', split1
        #print 'ORG URL IS: ', url
        #---------------------------
        idnum = split1[1] # get the actual value of the id article
        #for testing only --------------------
        #print 'the idnum is: ', idnum
        #--------------------------------------
        print_url = 'http://fr.jpost.com/servlet/Satellite?cid=' + idnum + '&pagename=JFrench%2FJPArticle%2FPrinter'
        #for testing only -------------------------
        #print 'PRINT URL IS: ', print_url
        #------------------------------------------
        return print_url
    #example of how links should be formated
    #--------------------------------------------------------------------------------------------------------------
    #org   version =  http://fr.jpost.com/servlet/Satellite?pagename=JFrench/JPArticle/ShowFull&cid=1282804806075
    #print version =  http://fr.jpost.com/servlet/Satellite?cid=1282804806075&pagename=JFrench%2FJPArticle%2FPrinter
    #------------------------------------------------------------------------------------------------------------------
--- a/resources/recipes/popscience.recipe
+++ b/resources/recipes/popscience.recipe
@ -1,5 +1,5 @@
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    title = 'Popular Science'
@ -12,38 +12,36 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
-
+    use_embedded_content = True
    masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
-
+    
-    remove_tags = [dict(name='div', attrs={'id':['toolbar','main_supplements']}),
+               
                   dict(name='span', attrs={'class':['comments']}),
                   dict(name='div', attrs={'class':['relatedinfo related-right','node_navigation','content2']}),
                   dict(name='ul', attrs={'class':['item-list clear-block']})]
    feeds          = [
-
+                      
                      ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
                      ('Cars', 'http://www.popsci.com/full-feed/cars'),
                      ('Science', 'http://www.popsci.com/full-feed/science'),
                      ('Technology', 'http://www.popsci.com/full-feed/technology'),
                      ('DIY', 'http://www.popsci.com/full-feed/diy'),
-
+                      
                    ]
-
+    
- #The following will get read of the Gallery: links when found
+ #The following will get read of the Gallery: links when found    
-
+        
    def preprocess_html(self, soup) :
        print 'SOUP IS: ', soup
        weblinks = soup.findAll(['head','h2'])
        if weblinks is not None:
            for link in weblinks:
                if re.search('(Gallery)(:)',str(link)):
-
+                  
                  link.parent.extract()
        return soup
-  #-----------------------------------------------------------------
+  #-----------------------------------------------------------------      
-
+        
-
+        
--- a/resources/recipes/scientific_american.recipe
+++ b/resources/recipes/scientific_american.recipe
@ -1,78 +1,91 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 sciam.com
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class ScientificAmerican(BasicNewsRecipe):
-    title = u'Scientific American'
+    title                 = u'Scientific American'
-    description = u'Popular science. Monthly magazine.'
+    description           = u'Popular Science. Monthly magazine.'
-    __author__ = 'Kovid Goyal'
+    category              = 'science'
-    language = 'en'
+    __author__            = 'Starson17'
-    remove_javascript   = True
+    no_stylesheets        = True
-    encoding = 'utf-8'
+    use_embedded_content  = False
    language              = 'en'
    publisher             = 'Nature Publishing Group'
    remove_empty_feeds    = True
    remove_javascript     = True
    oldest_article        = 30
    max_articles_per_feed = 100
-    def print_version(self, url):
+    conversion_options = {'linearize_tables'  : True
-        return url + '&print=true'
+                        , 'comment'           : description
                        , 'tags'              : category
                        , 'publisher'         : publisher
                        , 'language'          : language
                        }
    keep_only_tags = [
                dict(name='h2', attrs={'class':'articleTitle'})
                ,dict(name='p', attrs={'id':'articleDek'})
                ,dict(name='p', attrs={'class':'articleInfo'})
                ,dict(name='div', attrs={'id':['articleContent']})
                ,dict(name='img', attrs={'src':re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}) 
                ]
    remove_tags = [dict(name='a', attrs={'class':'tinyCommentCount'})]
    def parse_index(self):
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
-        month = self.tag_to_string(soup.find('p',attrs={'id':'articleDek'}))
+        issuetag = soup.find('p',attrs={'id':'articleDek'})
-        self.timefmt = ' [%s]'%(' '.join(month.strip().split()[:2]))
+        self.timefmt = ' [%s]'%(self.tag_to_string(issuetag))
        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']
-
+        features, feeds = [], []
-        feeds = []
+        for a in soup.find(attrs={'class':'primaryCol'}).findAll('a',attrs={'title':'Feature'}):
-        for div in soup.findAll('div', attrs={'class':['primaryCol',
+            if a is None: continue
-            'secondaryCol']}):
+            desc = ''
-            current_section = None
+            s = a.parent.parent.find(attrs={'class':'dek'})
-            for tag in div.findAll(['h2', 'ul']):
+            desc = self.tag_to_string(s)
-                if tag.name == 'h2':
+            article = {
-                    current_section = self.tag_to_string(tag).strip()
+                    'url' : a['href'],
-                    self.log('\tFound section:', current_section)
+                    'title' : self.tag_to_string(a),
-                elif current_section is not None and tag.name == 'ul':
+                    'date' : '',
-                    articles = []
+                    'description' : desc,
-                    for li in tag.findAll('li'):
+                    }
-                        t = li.findAll('a',
+            features.append(article)
-                                attrs={'class':lambda x: x != 'thumb'},
+        feeds.append(('Features', features))
-                                href=lambda x: x and 'article.cfm' in x)
+        department = []
-                        if not t:
+        title = None
-                            continue
+        for li in soup.find(attrs={'class':'secondaryCol'}).findAll('li'):
-                        t = t[-1]
+            if 'department.cfm' in li.a['href']:
-                        title = self.tag_to_string(t)
+                if department:
-                        url = t['href']
+                    feeds.append((title, department))
-                        desc = ''
+                title = self.tag_to_string(li.a)
-                        p = li.find(attrs={'class':'dek'})
+                department = []
-                        if p is not None:
+            if 'article.cfm' in li.h3.a['href']:
-                            desc = self.tag_to_string(p)
+                article = {
-                        articles.append({'title':title, 'url':url,
+                        'url' : li.h3.a['href'],
-                            'description':desc, 'date':''})
+                        'title' : self.tag_to_string(li.h3.a),
-                        self.log('\t\tFound article:', title, '\n\t\tat', url)
+                        'date': '',
-                    if articles:
+                        'description': self.tag_to_string(li.p),
-                        feeds.append((current_section, articles))
+                    }
-                    current_section = None
+                department.append(article)
        if department:
            feeds.append((title, department))
        return feeds
    def postprocess_html(self, soup, first_fetch):
-        if soup is not None:
+        for item in soup.findAll('a'):
-            for span in soup.findAll('span', attrs={'class':'pagination'}):
+            if 'topic.cfm' in item['href']:
-                span.extract()
+                item.replaceWith(item.string)
            if not first_fetch:
                div = soup.find('div', attrs={'class':'headline'})
                if div:
                    div.extract()
        return soup
-    preprocess_regexps = [
+    extra_css = '''
-        (re.compile(r'Already a Digital subscriber.*Now</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                p{font-weight: normal; font-size:small}
-        (re.compile(r'If your institution has site license access, enter.*here</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                li{font-weight: normal; font-size:small}
-        (re.compile(r'to subscribe to our.*;.*\}', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
-        (re.compile(r'\)\(jQuery\);.*-->', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
-        ]
+                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -389,6 +389,7 @@ class HTMLPreProcessor(object):
            if is_pdftohtml:
                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
@ -425,7 +426,7 @@ class HTMLPreProcessor(object):
        for rule in rules + end_rules:
            html = rule[0].sub(rule[1], html)
-        if is_pdftohtml:
+        if is_pdftohtml and length > -1:
            # Dehyphenate
            dehyphenator = Dehyphenator()
            html = dehyphenator(html,'pdf', length)
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@ -819,7 +819,8 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                fname = err.filename if err.filename else 'file'
                return error_dialog(self, _('Permission denied'),
                        _('Could not open %s. Is it being used by another'
-                        ' program?')%fname, show=True)
+                        ' program?')%fname, det_msg=traceback.format_exc(),
                        show=True)
            raise
        self.save_state()
        QDialog.accept(self)
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -870,7 +870,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        path = self.format_abspath(index, format, index_is_id=index_is_id)
        if path is not None:
            f = open(path, mode)
-            ret = f if as_file else f.read()
+            try:
                ret = f if as_file else f.read()
            except IOError:
                f.seek(0)
                out = cStringIO.StringIO()
                shutil.copyfileobj(f, out)
                ret = out.getvalue()
            if not as_file:
                f.close()
            return ret
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@ -123,8 +123,6 @@ class ContentServer(object):
        return self.static('index.html')
    # Actually get content from the database {{{
    def get_cover(self, id, thumbnail=False):
        cover = self.db.cover(id, index_is_id=True, as_file=False)
--- a/src/calibre/library/server/mobile.py
+++ b/src/calibre/library/server/mobile.py
@ -19,6 +19,7 @@ from calibre.ebooks.metadata import fmt_sidx
 from calibre.constants import __appname__
 from calibre import human_readable
 from calibre.utils.date import utcfromtimestamp
 from calibre.utils.filenames import ascii_filename
 def CLASS(*args, **kwargs): # class is a reserved word in Python
    kwargs['class'] = ' '.join(args)
@ -111,11 +112,13 @@ def build_index(books, num, search, sort, order, start, total, url_base, CKEYS):
        data = TD()
        last = None
        for fmt in book['formats'].split(','):
            a = ascii_filename(book['authors'])
            t = ascii_filename(book['title'])
            s = SPAN(
                A(
                    fmt.lower(),
-                    href='/get/%s/%s-%s_%d.%s' % (fmt, book['authors'],
+                    href='/get/%s/%s-%s_%d.%s' % (fmt, a, t,
-                        book['title'], book['id'], fmt)
+                        book['id'], fmt)
                ),
                CLASS('button'))
            s.tail = u'\u202f' # &nbsp;
--- a/src/calibre/library/server/xml.py
+++ b/src/calibre/library/server/xml.py
@ -16,6 +16,7 @@ from calibre.library.server.utils import strftime, format_tag_string
 from calibre.ebooks.metadata import fmt_sidx
 from calibre.constants import preferred_encoding
 from calibre import isbytestring
 from calibre.utils.filenames import ascii_filename
 E = ElementMaker()
@ -92,6 +93,8 @@ class XMLServer(object):
                    y = format_tag_string(y, ',', ignore_max=True)
                kwargs[x] = serialize(y) if y else ''
            kwargs['safe_title'] = ascii_filename(kwargs['title'])
            c = kwargs.pop('comments')
            CFM = self.db.field_metadata
--- a/src/calibre/translations/calibre.pot
+++ b/src/calibre/translations/calibre.pot