Pull from trunk

This commit is contained in:
Kovid Goyal 2010-09-17 14:48:39 -06:00
commit d9a4eb3423
56 changed files with 32483 additions and 28038 deletions

View File

@@ -4,6 +4,91 @@
# for important features/bug fixes.
# Also, each release can have new and improved recipes.
- version: 0.7.19
date: 2010-09-17
new features:
- title: "The ability to perform search and replace via regular expressions in the Bulk Edit metadata dialog"
type: major
- title: "Add an option to have calibre automatically convert straight quotes to curly quotes. Also handles em/en-dashes and ellipses. Found under 'Look & Feel' in the conversion options"
type: major
tickets: [6808]
- title: "Greatly improve sorting performance on large libraries."
type: major
- title: "Drivers for the SONY PRS-350/PRS-650 and the Sovos E-reader"
- title: "Kobo driver: Add management of the I'm Reading list on Kobo via an Im_Reading tag in calibre. See http://www.mobileread.com/forums/showthread.php?t=98906 for details"
- title: "Conversion pipeline: Add an option to control how hard line breaks are removed during preprocessing. See the Structure Detection section in the conversion options"
- title: "In the Edit metadata dialog, indicate whether the author sort value matches the author value, using a background color"
- title: "Add an option to split the toolbar into two toolbars in Preferences->Interface->Look and Feel"
- title: "EPUB Output: Improved design of the 'jacket' page created by calibre when using the 'Insert metadata at start of book' option"
- title: "PDF Input: Improve line unwrapping, handling of hyphens/dashes and quotes. Also handle more specially encoded non ASCII characters"
bug fixes:
- title: "Fix regression in filename shortening that caused loss of filename extension"
- title: "Fix various regressions that could be triggered when using search restrictions and/or multi-sorts and connecting a device"
- title: "Database: Fix possible race condition in windows when changing title/author during move of book files, that could lead to old files not being deleted"
- title: "Conversion pipeline: Don't die if rescaling of image raises an exception, just ignore and continue"
- title: "Database: Update has_cover cache when setting/removing covers so that the search returns correct results. Also fix an exception that could occur when adding books with a db that has been upgraded from very old SQL."
- title: "Workaround for bug that affects some windows installs causing white backgrounds on default covers to be rendered as yellow"
- title: "Fix handling of non-ASCII chars when rendering series in default EPUB cover"
- title: "Fix --start-in-tray switch displays hidden windows in metacity, xfwm4 and compiz"
tickets: [6806]
- title: "Conversion pipeline: When setting margins on <body> explicitly set padding to 0 to override and existing padding in the input document"
- title: "CHM Input: Ignore missing image files in the input document"
tickets: [6773]
- title: "News download: Fix bug that could break some downloads in non ASCII locales"
- title: "TXT Output: When using preserve spaces, output tab characters as a sequence of four non-breaking spaces as some readers dont handle the 09 char code."
- title: "PDB Input: Fix bug in conversion of TOC in some PML files"
new recipes:
- title: "taz.de RSS"
author: Alexander Schremmer
- title: "Brand Eins"
author: Constantin Hofstetter
- title: "Winnipeg Free Press"
author: buyo
- title: "Buckmasters in the kitchen, The Walrus Magazine and Kansas City Star"
author: Tony Stegall
- title: "Europa Sur"
author: "Darko Miletic"
improved recipes:
- Harpers (free)
- Danas
- Novosti
- ESPN
- Taz Digiabo
- Slate
- AJC
- Infobae
- NSPM
- version: 0.7.18
date: 2010-09-10
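The headline conversion features in this release (curly-quote smartening and the tunable line-unwrap factor, wired up in the source diffs below) are ordinary conversion options. As a rough illustration of the kind of substitutions the quote conversion performs (this is a stand-in sketch, not calibre's actual implementation):

    # -*- coding: utf-8 -*-
    # Illustrative stand-in for the new smarten_punctuation option; the real
    # implementation lives in the conversion pipeline, not here.
    import re

    def smarten(text):
        text = text.replace('---', u'\u2014')            # em-dash
        text = text.replace('--', u'\u2013')             # en-dash
        text = text.replace('...', u'\u2026')            # ellipsis
        text = re.sub(r'(^|(?<=\s))"', u'\u201c', text)  # opening double quote
        text = re.sub(r'"', u'\u201d', text)             # the rest close
        text = re.sub(r"(^|(?<=\s))'", u'\u2018', text)  # opening single quote
        text = re.sub(r"'", u'\u2019', text)             # apostrophes/closers
        return text

    print smarten('"Hello," she said -- it\'s over...')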

Binary file not shown. (new image, 1.0 KiB)

View File

@@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.adventuregamers.com
'''
@@ -10,14 +8,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AdventureGamers(BasicNewsRecipe):
title = u'Adventure Gamers'
language = 'en'
language = 'en'
__author__ = 'Darko Miletic'
description = 'Adventure games portal'
description = 'Adventure games portal'
publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology'
language = 'en'
category = 'news, games, adventure, technology'
oldest_article = 10
delay = 10
max_articles_per_feed = 100
@@ -26,14 +21,25 @@ class AdventureGamers(BasicNewsRecipe):
remove_javascript = True
use_embedded_content = False
INDEX = u'http://www.adventuregamers.com'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
extra_css = """
.pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
.pageheader_title{font-size: xx-large; color: #394128}
.pageheader_byline{font-size: small; font-weight: bold; color: #394128}
.score_bg {display: inline; width: 100%; margin-bottom: 2em}
.score_column_1{ padding-left: 10px; font-size: small; width: 50%}
.score_column_2{ padding-left: 10px; font-size: small; width: 50%}
.score_column_3{ padding-left: 10px; font-size: small; width: 50%}
.score_header{font-size: large; color: #50544A}
.bodytext{display: block}
body{font-family: Helvetica,Arial,sans-serif}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(name='div', attrs={'class':'content_middle'})
@@ -43,14 +49,15 @@ class AdventureGamers(BasicNewsRecipe):
dict(name=['object','link','embed','form'])
,dict(name='div', attrs={'class':['related-stories','article_leadout','prev','next','both']})
]
remove_tags_after = [dict(name='div', attrs={'class':'toolbar_fat'})]
remove_attributes = ['width','height']
feeds = [(u'Articles', u'http://feeds2.feedburner.com/AdventureGamers')]
def get_article_url(self, article):
return article.get('guid', None)
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'class':'toolbar_fat_next'})
if pager:
@@ -59,19 +66,19 @@ class AdventureGamers(BasicNewsRecipe):
texttag = soup2.find('div', attrs={'class':'bodytext'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('div', attrs={'class':'floatright'}):
item.extract()
self.append_page(soup, soup.body, 3)
pager = soup.find('div',attrs={'class':'toolbar_fat'})
if pager:
pager.extract()
return soup
pager.extract()
return self.adeify_images(soup)
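The recipe above also illustrates a migration pattern that recurs throughout this commit: the per-backend html2lrf_options list and html2epub_options string are dropped in favour of the single conversion_options dict. A minimal skeleton of the modern form (illustrative metadata values):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title       = 'Example'
        description = 'An example portal'    # illustrative values
        publisher   = 'Example Publisher'
        category    = 'news'
        language    = 'en'

        # One dict, read by every output format, replaces the old
        # html2lrf_options/html2epub_options pair.
        conversion_options = {
                              'comment'   : description
                            , 'tags'      : category
                            , 'publisher' : publisher
                            , 'language'  : language
                            }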

View File

@@ -10,12 +10,31 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
keep_only_tags = [
dict(name='div', attrs={'id':['cxArticleContent']})
,dict(attrs={'id':['cxArticleText','cxArticleBodyText']})
dict(name='div', attrs={'class':['cxArticleHeader']})
,dict(attrs={'id':['cxArticleText']})
]
remove_tags = [
dict(name='div' , attrs={'class':'cxArticleList' })
,dict(name='div' , attrs={'class':'cxFeedTease' })
,dict(name='div' , attrs={'class':'cxElementEnlarge' })
,dict(name='div' , attrs={'id':'cxArticleTools' })
]
feeds = [
('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
# -------------------------------------------------------------------
@@ -23,7 +42,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
# read by simply removing the pound sign from it. I currently have it
# set to only get the Cobb area
# --------------------------------------------------------------------
('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
#('Atlanta & Fulton', 'http://www.ajc.com/section-rss.do?source=atlanta'),
#('Clayton', 'http://www.ajc.com/section-rss.do?source=clayton'),
#('DeKalb', 'http://www.ajc.com/section-rss.do?source=dekalb'),
#('Gwinnett', 'http://www.ajc.com/section-rss.do?source=gwinnett'),
@@ -41,7 +60,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
# but again
# You can enable which ever team you like by removing the pound sign
# ------------------------------------------------------------------------
('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
#('Sports News', 'http://www.ajc.com/genericList-rss.do?source=61510'),
#('Braves', 'http://www.ajc.com/genericList-rss.do?source=61457'),
('Falcons', 'http://www.ajc.com/genericList-rss.do?source=61458'),
#('Hawks', 'http://www.ajc.com/genericList-rss.do?source=61522'),
@@ -52,11 +71,16 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
]
def postprocess_html(self, soup, first):
for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
credit_tag.name ='p'
return soup
#def print_version(self, url):
# return url.partition('?')[0] +'?printArticle=y'
def print_version(self, url):
return url.partition('?')[0] +'?printArticle=y'

View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
__version__ = '0.95'
''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe
class BrandEins(BasicNewsRecipe):
title = u'Brand Eins'
__author__ = 'Constantin Hofstetter'
description = u'Wirtschaftsmagazin'
publisher ='brandeins.de'
category = 'politics, business, wirtschaft, Germany'
use_embedded_content = False
lang = 'de-DE'
no_stylesheets = True
encoding = 'utf-8'
language = 'de'
# 2 is the last full magazine (default)
# 1 is the newest (but not full)
# 3 is one before 2 etc.
which_ausgabe = 2
keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
'''
brandeins.de
'''
def postprocess_html(self, soup,first):
# Move the image of the sidebar right below the h3
first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
# first_h3.parent.insert(2, imgdiv)
first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
else:
first_h3.parent.insert(2, imgdiv)
# Now, remove the sidebar
soup.find(name='div', attrs={'id':'sidebar'}).extract()
# Remove the rating-image (stars) from the h3
for img in first_h3.findAll(name='img'):
img.extract()
# Mark the intro texts as italic
for div in soup.findAll(name='div', attrs={'class':'intro'}):
for p in div.findAll('p'):
content = self.tag_to_string(p)
new_p = "<p><i>"+ content +"</i></p>"
p.replaceWith(new_p)
return soup
def parse_index(self):
feeds = []
archive = "http://www.brandeins.de/archiv.html"
soup = self.index_to_soup(archive)
latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
url = pre_latest_issue.get('href', False)
# Get the title for the magazine - build it out of the title of the cover - take the issue and year
self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
url = 'http://brandeins.de/'+url
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
titles_and_articles = self.brand_eins_parse_latest_issue(url)
if titles_and_articles:
for title, articles in titles_and_articles:
feeds.append((title, articles))
return feeds
def brand_eins_parse_latest_issue(self, url):
soup = self.index_to_soup(url)
article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
titles_and_articles = []
current_articles = []
chapter_title = "Editorial"
self.log('Found Chapter:', chapter_title)
# Remove the last list of links (that's just the impressum and the 'gewinnspiel')
article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract()
for article_list in article_lists:
for chapter in article_list.findAll('ul'):
if len(chapter.findPreviousSiblings('h3')) >= 1:
new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
if new_chapter_title != chapter_title:
titles_and_articles.append([chapter_title, current_articles])
current_articles = []
self.log('Found Chapter:', new_chapter_title)
chapter_title = new_chapter_title
for li in chapter.findAll('li'):
a = li.find('a', href = True)
if a is None:
continue
title = self.tag_to_string(a)
url = a.get('href', False)
if not url or not title:
continue
url = 'http://brandeins.de/'+url
if len(a.parent.findNextSiblings('p')) >= 1:
description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
else:
description = ''
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
self.log('\t\t\t', description)
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
titles_and_articles.append([chapter_title, current_articles])
return titles_and_articles

View File

@@ -20,16 +20,27 @@ class Danas(BasicNewsRecipe):
encoding = 'utf-8'
masthead_url = 'http://www.danas.rs/images/basic/danas.gif'
language = 'sr'
remove_javascript = True
publication_type = 'newspaper'
remove_empty_feeds = True
extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
.article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif}
.nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif}
.antrfileText{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em;
margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small}
.antrfileNaslov{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em;
font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} """
.antrfileText{border-left: 2px solid #999999;
margin-left: 0.8em;
padding-left: 1.2em;
margin-bottom: 0;
margin-top: 0}
h2,.datum,.lokacija,.autor{font-size: small}
.antrfileNaslov{border-left: 2px solid #999999;
margin-left: 0.8em;
padding-left: 1.2em;
font-weight:bold;
margin-bottom: 0;
margin-top: 0}
img{margin-bottom: 0.8em}
"""
conversion_options = {
'comment' : description
@@ -44,8 +55,9 @@ class Danas(BasicNewsRecipe):
remove_tags = [
dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'id':'comments'})
,dict(name=['object','link','iframe'])
,dict(name=['object','link','iframe','meta'])
]
remove_attributes = ['w:st','st']
feeds = [
(u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27')
@@ -73,13 +85,22 @@ class Danas(BasicNewsRecipe):
,(u'Zvaka u pepeljari' , u'http://www.danas.rs/rss/rss.asp?column_id=56')
,(u'Vostani Serbie' , u'http://www.danas.rs/rss/rss.asp?column_id=57')
,(u'Med&Jad-a' , u'http://www.danas.rs/rss/rss.asp?column_id=58')
,(u'Svetlosti pozornice' , u'http://www.danas.rs/rss/rss.asp?column_id=59')
,(u'Svetlosti pozornice' , u'http://www.danas.rs/rss/rss.asp?column_id=59')
]
def preprocess_html(self, soup):
for tagn in ['st1:place','st1:city','st1:country-region','st1:state']:
for item in soup.body.findAll(tagn):
item.name='span'
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)
for item in soup.findAll('a'):
if item.has_key('name'):
item.extract()
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup
def print_version(self, url):
return url + '&action=print'

View File

@@ -8,6 +8,7 @@ espn.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import TemporaryFile
class ESPN(BasicNewsRecipe):
@@ -78,12 +79,19 @@ class ESPN(BasicNewsRecipe):
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False)
if self.username is not None and self.password is not None:
br.open('http://espn.com')#('http://espn.go.com/#myespn')
br.select_form(nr=1)
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit()
url = ('https://r.espn.go.com/members/v3_1/login')
raw = br.open(url).read()
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
with TemporaryFile(suffix='.htm') as fname:
with open(fname, 'wb') as f:
f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True)
return br
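The new login flow works around a page that mechanize cannot parse in place: the raw HTML is fetched, the conflicting sign-in form (the one containing id="regsigninbtn") is cut out with a regex, and the pruned page is written to a temporary file that the browser reloads via open_local_file() so the remaining login form can be selected. Condensed, the pattern looks like this (a sketch generalized from the diff above; the helper name is made up):

    import re
    from calibre.ptempfile import TemporaryFile

    def open_pruned(br, url, kill_re):
        # Fetch the page, strip markup that confuses the form parser,
        # then feed the cleaned page back to the browser from disk.
        raw = br.open(url).read()
        raw = re.sub(kill_re, '', raw)
        with TemporaryFile(suffix='.htm') as fname:
            with open(fname, 'wb') as f:
                f.write(raw)
            br.open_local_file(fname)
        return br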

View File

@@ -1,18 +1,14 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
harpers.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Harpers(BasicNewsRecipe):
title = u"Harper's Magazine"
__author__ = u'Darko Miletic'
language = 'en'
language = 'en'
description = u"Harper's Magazine: Founded June 1850."
publisher = "Harper's Magazine "
category = 'news, politics, USA'
@@ -21,26 +17,26 @@ class Harpers(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
extra_css = '''
h1{ font-family:georgia ; color:#111111; font-size:large;}
h1{ font-family:georgia ; color:#111111; font-size:large;}
.box-of-helpful{ font-family:arial ; font-size:x-small;}
p{font-family:georgia ;}
.caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}
.caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}
'''
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
remove_tags = [
dict(name='table', attrs={'class':['rcnt','rcnt topline']})
,dict(name=['link','object','embed'])
,dict(name=['link','object','embed','meta','base'])
]
remove_attributes = ['width','height']
feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
@@ -49,20 +45,13 @@ class Harpers(BasicNewsRecipe):
index = 'http://harpers.org/'
soup = self.index_to_soup(index)
link_item = soup.find(name = 'img',attrs= {'class':"cover"})
print link_item
if link_item:
cover_url = 'http://harpers.org' + link_item['src']
print cover_url
cover_url = 'http://harpers.org' + link_item['src']
return cover_url
def preprocess_html(self, soup):
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(1,mcharset)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(xmlns=True):
del item['xmlns']
return soup

View File

@@ -37,6 +37,16 @@ class Novosti(BasicNewsRecipe):
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(attrs={'class':['articleTitle','author','articleLead','articleBody']})]
remove_tags = [dict(name=['embed','object','iframe','base'])]
remove_tags = [dict(name=['embed','object','iframe','base','link','meta'])]
feeds = [(u'Vesti', u'http://www.novosti.rs/rss/rss-vesti')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll('span', attrs={'class':'author'}):
item.name='p'
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
return soup

View File

@@ -1,7 +1,8 @@
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
calibre recipe for slate.com
'''
@@ -10,13 +11,12 @@ import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
class PeriodicalNameHere(BasicNewsRecipe):
class Slate(BasicNewsRecipe):
# Method variables for customizing downloads
title = 'Slate'
description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
__author__ = 'GRiker and Sujata Raman'
max_articles_per_feed = 20
oldest_article = 7.0
__author__ = 'GRiker, Sujata Raman and Nick Redding'
max_articles_per_feed = 100
oldest_article = 14
recursions = 0
delay = 0
simultaneous_downloads = 5
@@ -27,6 +27,12 @@ class PeriodicalNameHere(BasicNewsRecipe):
encoding = None
language = 'en'
slate_complete = True
if slate_complete:
title = 'Slate (complete)'
else:
title = 'Slate (weekly)'
# Method variables for customizing feed parsing
summary_length = 250
use_embedded_content = None
@@ -42,26 +48,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
match_regexps = []
# The second entry is for 'Big Money', which comes from a different site, uses different markup
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body', 'story']}),
keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
dict(attrs={ 'id':['content']}) ]
# The second entry is for 'Big Money', which comes from a different site, uses different markup
remove_tags = [dict(attrs={ 'id':[
'add_comments_button',
'article_bottom_tools',
'article_bottom_tools_cntr',
'bizbox_links_bottom',
'BOXXLE',
'comments_button',
'comments-to-fray',
'fbog_article_bottom_cntr',
'fray_article_discussion', 'fray_article_links','bottom_sponsored_links','author_bio',
'insider_ad_wrapper',
'js_kit_cntr',
'recommend_tab',
'ris_links_wrapper',
'toolbox',
]}),
remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
'comments_button','add_comments_button','comments-to-fray','marriott_ad',
'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
@@ -72,16 +67,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
extra_css = '''
.h1_subhead{font-family:Arial; font-size:small; }
h1{font-family:Verdana; font-size:large; }
.byline {font-family:Georgia; margin-bottom: 0px; color: #660033;}
.dateline {font-family:Arial; font-size: smaller; height: 0pt; color:#666666;}
.byline {font-family:Georgia; margin-bottom: 0px; }
.dateline {font-family:Arial; font-size: smaller; height: 0pt;}
.imagewrapper {font-family:Verdana;font-size:x-small; }
.source {font-family:Verdana; font-size:x-small;}
.credit {font-family:Verdana; font-size: smaller;}
#article_body {font-family:Verdana; }
#content {font-family:Arial; }
.caption{font-family:Verdana;font-style:italic; font-size:x-small;}
h3{font-family:Arial; color:#666666; font-size:small}
a{color:#0066CC;}
h3{font-family:Arial; font-size:small}
'''
# Local variables to extend class
@@ -99,32 +93,59 @@ class PeriodicalNameHere(BasicNewsRecipe):
if isinstance(item, (NavigableString, CData)):
strings.append(item.string)
elif isinstance(item, Tag):
res = self.tag_to_string(item)
res = self.tag_to_string(item,use_alt=False)
if res:
strings.append(res)
return strings
def extract_sections(self):
def extract_named_sections(self):
soup = self.index_to_soup( self.baseURL )
soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
soup_nav_bar = soup.find(True, attrs={'id':'nav'})
briefing_nav = soup.find('li')
briefing_url = briefing_nav.a['href']
for section_nav in soup_nav_bar.findAll('li'):
section_name = self.tag_to_string(section_nav,use_alt=False)
self.section_dates.append(section_name)
soup = self.index_to_soup(briefing_url)
self.log("Briefing url = %s " % briefing_url)
section_lists = soup.findAll('ul','view_links_list')
sections = []
for section in section_lists :
sections.append(section)
return sections
def extract_dated_sections(self):
soup = self.index_to_soup( self.baseURL )
soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
if soup_top_stories:
self.section_dates.append("Top Stories")
self.log("SELECTION TOP STORIES %s" % "Top Stories")
soup = soup.find(True, attrs={'id':'toc_links_container'})
todays_section = soup.find(True, attrs={'class':'todaydateline'})
self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))
older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
for older_section in older_section_dates :
self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))
if soup_top_stories:
headline_stories = soup_top_stories.find('ul')
headline_stories = soup_top_stories
self.log("HAVE top_stories")
else:
headline_stories = None
self.log("NO top_stories")
section_lists = soup.findAll('ul')
# Prepend the headlines to the first section
if headline_stories:
section_lists[0].insert(0,headline_stories)
section_lists.insert(0,headline_stories)
sections = []
for section in section_lists :
@@ -133,9 +154,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
def extract_section_articles(self, sections_html) :
# Find the containers with section content
soup = self.index_to_soup(str(sections_html))
sections = soup.findAll('ul')
# Find the containers with section content
sections = sections_html
articles = {}
key = None
@@ -145,10 +165,25 @@ class PeriodicalNameHere(BasicNewsRecipe):
# Get the section name
if section.has_key('id') :
self.log("PROCESSING SECTION id = %s" % section['id'])
key = self.section_dates[i]
if key.startswith("Pod"):
continue
if key.startswith("Blog"):
continue
articles[key] = []
ans.append(key)
elif self.slate_complete:
key = self.section_dates[i]
if key.startswith("Pod"):
continue
if key.startswith("Blog"):
continue
self.log("PROCESSING SECTION name = %s" % key)
articles[key] = []
ans.append(key)
else :
self.log("SECTION %d HAS NO id" % i);
continue
# Get the section article_list
@@ -159,8 +194,10 @@ class PeriodicalNameHere(BasicNewsRecipe):
bylines = self.tag_to_strings(article)
url = article.a['href']
title = bylines[0]
full_title = self.tag_to_string(article)
full_title = self.tag_to_string(article,use_alt=False)
#self.log("ARTICLE TITLE%s" % title)
#self.log("ARTICLE FULL_TITLE%s" % full_title)
#self.log("URL %s" % url)
author = None
description = None
pubdate = None
@@ -191,7 +228,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
found_excluded = excluded.search(description)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose title contain excluded keywords
@@ -200,7 +237,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
#self.log("evaluating full_title: %s" % full_title)
found_excluded = excluded.search(full_title)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
# Skip articles whose author contain excluded keywords
@@ -208,7 +245,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
excluded = re.compile('|'.join(self.excludedAuthorKeywords))
found_excluded = excluded.search(author)
if found_excluded :
if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
continue
skip_this_article = False
@@ -216,6 +253,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
for article in articles[key] :
if article['url'] == url :
skip_this_article = True
self.log("SKIPPING DUP %s" % url)
break
if skip_this_article :
@@ -227,6 +265,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
articles[feed] = []
articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
author=author, content=''))
#self.log("KEY %s" % feed)
#self.log("APPENDED %s" % url)
# Promote 'newspapers' to top
for (i,article) in enumerate(articles[feed]) :
if article['description'] is not None :
@@ -235,32 +275,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
ans = self.remove_duplicates(ans)
return ans
def flatten_document(self, ans):
flat_articles = []
for (i,section) in enumerate(ans) :
#self.log("flattening section %s: " % section[0])
for article in section[1] :
#self.log("moving %s to flat_articles[]" % article['title'])
flat_articles.append(article)
flat_section = ['All Articles', flat_articles]
flat_ans = [flat_section]
return flat_ans
def remove_duplicates(self, ans):
# Return a stripped ans
for (i,section) in enumerate(ans) :
#self.log("section %s: " % section[0])
for article in section[1] :
#self.log("\t%s" % article['title'])
#self.log("\looking for %s" % article['url'])
for (j,subsequent_section) in enumerate(ans[i+1:]) :
for (k,subsequent_article) in enumerate(subsequent_section[1]) :
if article['url'] == subsequent_article['url'] :
#self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
del subsequent_section[1][k]
return ans
def print_version(self, url) :
@@ -268,13 +282,22 @@ class PeriodicalNameHere(BasicNewsRecipe):
# Class methods
def parse_index(self) :
sections = self.extract_sections()
if self.slate_complete:
sections = self.extract_named_sections()
else:
sections = self.extract_dated_sections()
section_list = self.extract_section_articles(sections)
section_list = self.flatten_document(section_list)
return section_list
def get_browser(self) :
return BasicNewsRecipe.get_browser()
def get_masthead_url(self):
masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(masthead)
except:
self.log("\nMasthead unavailable")
masthead = None
return masthead
def stripAnchors(self,soup):
body = soup.find('div',attrs={'id':['article_body','content']})
@@ -304,8 +327,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
excluded = re.compile('|'.join(self.excludedContentKeywords))
found_excluded = excluded.search(str(soup))
if found_excluded :
print "no allowed content found, removing article"
raise Exception('String error')
print "No allowed content found, removing article"
raise Exception('Rejected article')
# Articles from www.thebigmoney.com use different tagging for byline, dateline and body
head = soup.find('head')
@@ -338,7 +361,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
if dept_kicker is not None :
kicker_strings = self.tag_to_strings(dept_kicker)
#kicker = kicker_strings[2] + kicker_strings[3]
kicker = ''.join(kicker_strings[2:])
kicker = re.sub('\.','',kicker)
h3Tag = Tag(soup, "h3")
@@ -346,25 +368,11 @@ class PeriodicalNameHere(BasicNewsRecipe):
emTag.insert(0,NavigableString(kicker))
h3Tag.insert(0, emTag)
dept_kicker.replaceWith(h3Tag)
else:
self.log("No kicker--return null")
return None
# Change <h1> to <h2>
headline = soup.find("h1")
#tag = headline.find("span")
#tag.name = 'div'
if headline is not None :
h2tag = Tag(soup, "h2")
h2tag['class'] = "headline"
strs = self.tag_to_strings(headline)
result = ''
for (i,substr) in enumerate(strs) :
result += substr
if i < len(strs) -1 :
result += '<br />'
#h2tag.insert(0, result)
#headline.replaceWith(h2tag)
# Fix up the concatenated byline and dateline
# Fix up the concatenated byline and dateline
byline = soup.find(True,attrs={'class':'byline'})
if byline is not None :
bylineTag = Tag(soup,'div')

View File

@@ -8,8 +8,9 @@ __docformat__ = 'restructuredtext de'
'''
www.taz.de/digiabo
'''
import os, urllib2, zipfile, tempfile
import os, urllib2, zipfile
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class TazDigiabo(BasicNewsRecipe):
@@ -26,38 +27,39 @@ class TazDigiabo(BasicNewsRecipe):
}
def build_index(self):
if self.username is not None and self.password is not None:
domain = "http://www.taz.de"
domain = "http://www.taz.de"
url = domain + "/epub/"
url = domain + "/epub/"
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='TAZ-ABO',
uri=url,
user=self.username,
passwd=self.password)
opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(opener)
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='TAZ-ABO',
uri=url,
user=self.username,
passwd=self.password)
opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(opener)
try:
f = urllib2.urlopen(url)
except urllib2.HTTPError:
self.report_progress(0,_('Can\'t login to download issue'))
raise ValueError('Failed to login, check your username and'
' password')
try:
f = urllib2.urlopen(url)
except urllib2.HTTPError:
self.report_progress(0,_('Can\'t login to download issue'))
raise ValueError('Failed to login, check your username and'
' password')
tmp = tempfile.TemporaryFile()
self.report_progress(0,_('downloading epub'))
tmp.write(f.read())
tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0,_('downloading epub'))
tmp.write(f.read())
tmp.close()
zfile = zipfile.ZipFile(tmp, 'r')
self.report_progress(0,_('extracting epub'))
zfile = zipfile.ZipFile(tmp.name, 'r')
self.report_progress(0,_('extracting epub'))
zfile.extractall(self.output_dir)
zfile.extractall(self.output_dir)
tmp.close()
index = os.path.join(self.output_dir, 'content.opf')
tmp.close()
index = os.path.join(self.output_dir, 'content.opf')
self.report_progress(1,_('epub downloaded and extracted'))
self.report_progress(1,_('epub downloaded and extracted'))
return index
return index
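The switch from tempfile.TemporaryFile to calibre's PersistentTemporaryFile is what makes the zipfile.ZipFile(tmp.name, 'r') call safe: the epub must be reopened by path after tmp.close(), and an anonymous TemporaryFile has no reliable on-disk name (on Windows it may disappear the moment it is closed). Stripped of the login handling above, the resulting flow is roughly:

    import urllib2, zipfile
    from calibre.ptempfile import PersistentTemporaryFile

    def fetch_and_extract_epub(url, dest_dir):
        f = urllib2.urlopen(url)
        tmp = PersistentTemporaryFile(suffix='.epub')  # named file, survives close
        tmp.write(f.read())
        tmp.close()
        # Reopening by name works because the file persists after close.
        zipfile.ZipFile(tmp.name, 'r').extractall(dest_dir)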

View File

@@ -0,0 +1,24 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
from calibre.web.feeds.news import BasicNewsRecipe
class TazRSSRecipe(BasicNewsRecipe):
title = u'Taz.de (die tageszeitung) RSS Feed - German'
__author__ = 'Alexander Schremmer'
language = 'de'
lang = 'de-DE'
oldest_article = 7
max_articles_per_feed = 100
publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
conversion_options = {'publisher': publisher,
'language': lang,
}
feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')]
keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})]
remove_tags_after = dict(name='div', attrs={'class': 'rack'})
remove_tags = [dict(name=['div'], attrs={'class': 'rack'}),
dict(name=['div'], attrs={'class': 'artikelwerbung'}),
dict(name=['ul'], attrs={'class': 'toolbar'}),]
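Like the other new recipes in this commit, this one can be test-built from the command line once saved to a file, e.g. ebook-convert taz_rss.recipe output.epub --test (the file name here is assumed; --test downloads only a couple of articles per feed).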

View File

@@ -2,7 +2,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
__appname__ = 'calibre'
__version__ = '0.7.18'
__version__ = '0.7.19'
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
import re

View File

@@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
def preprocess_html(self, html):
def preprocess_html(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
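Every input plugin touched by this commit is updated to the new two-argument form, which is how the conversion options (including the new html_unwrap_factor) reach the preprocessor. A conforming override now looks like this (a sketch mirroring the HTML, LIT and LRF input plugins below; the class name is made up):

    from calibre.customize.conversion import InputFormatPlugin
    from calibre.ebooks.conversion.utils import PreProcessor

    class MyInput(InputFormatPlugin):
        # file_types, convert(), etc. elided

        def preprocess_html(self, options, html):
            # options carries the user's conversion settings, so the
            # preprocessor can honour html_unwrap_factor and friends.
            self.options = options
            preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
            return preprocessor(html)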

View File

@@ -5,15 +5,16 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Timothy Legge <timlegge at gmail.com> and Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
import os, time
import sqlite3 as sqlite
from calibre.devices.usbms.books import BookList
from calibre.devices.kobo.books import Book
from calibre.devices.kobo.books import ImageWrapper
from calibre.devices.mime import mime_type_ext
from calibre.devices.usbms.driver import USBMS
from calibre.devices.usbms.driver import USBMS, debug_print
from calibre import prints
from calibre.devices.usbms.books import CollectionsBookList
class KOBO(USBMS):
@@ -21,12 +22,15 @@ class KOBO(USBMS):
gui_name = 'Kobo Reader'
description = _('Communicate with the Kobo Reader')
author = 'Timothy Legge and Kovid Goyal'
version = (1, 0, 4)
version = (1, 0, 6)
supported_platforms = ['windows', 'osx', 'linux']
booklist_class = CollectionsBookList
# Ordered list of supported formats
FORMATS = ['epub', 'pdf']
CAN_SET_METADATA = True
VENDOR_ID = [0x2237]
PRODUCT_ID = [0x4161]
@@ -40,6 +44,12 @@ class KOBO(USBMS):
VIRTUAL_BOOK_EXTENSIONS = frozenset(['kobo'])
EXTRA_CUSTOMIZATION_MESSAGE = _('The Kobo supports only one collection '
'currently: the \"Im_Reading\" list. Create a tag called \"Im_Reading\" ')+\
'for automatic management'
EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(['tags'])
def initialize(self):
USBMS.initialize(self)
self.book_class = Book
@@ -63,6 +73,8 @@ class KOBO(USBMS):
self._card_b_prefix if oncard == 'cardb' \
else self._main_prefix
self.booklist_class.rebuild_collections = self.rebuild_collections
# get the metadata cache
bl = self.booklist_class(oncard, prefix, self.settings)
need_sync = self.parse_metadata_cache(bl, prefix, self.METADATA_CACHE)
@@ -85,9 +97,7 @@ class KOBO(USBMS):
playlist_map = {}
if readstatus == 1:
if lpath not in playlist_map:
playlist_map[lpath] = []
playlist_map[lpath].append("I\'m Reading")
playlist_map[lpath]= "Im_Reading"
path = self.normalize_path(path)
# print "Normalized FileName: " + path
@@ -104,14 +114,17 @@ class KOBO(USBMS):
if self.update_metadata_item(bl[idx]):
# print 'update_metadata_item returned true'
changed = True
bl[idx].device_collections = playlist_map.get(lpath, [])
if lpath in playlist_map and \
playlist_map[lpath] not in bl[idx].device_collections:
bl[idx].device_collections.append(playlist_map[lpath])
else:
if ContentType == '6':
book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
else:
book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
# print 'Update booklist'
book.device_collections = playlist_map.get(book.lpath, [])
book.device_collections = [playlist_map[lpath]] if lpath in playlist_map else []
if bl.add_book(book, replace_metadata=False):
changed = True
except: # Probably a path encoding error
@@ -398,3 +411,95 @@ class KOBO(USBMS):
size = os.stat(cls.normalize_path(os.path.join(prefix, lpath))).st_size
book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=size, other=mi)
return book
def get_device_paths(self):
paths, prefixes = {}, {}
for prefix, path, source_id in [
('main', 'metadata.calibre', 0),
('card_a', 'metadata.calibre', 1),
('card_b', 'metadata.calibre', 2)
]:
prefix = getattr(self, '_%s_prefix'%prefix)
if prefix is not None and os.path.exists(prefix):
paths[source_id] = os.path.join(prefix, *(path.split('/')))
return paths
def update_device_database_collections(self, booklists, collections_attributes):
# debug_print('Starting update_device_database_collections', collections_attributes)
# Force collections_attributes to be 'tags' as no other is currently supported
# debug_print('KOBO: overriding the provided collections_attributes:', collections_attributes)
collections_attributes = ['tags']
collections = booklists.get_collections(collections_attributes)
# debug_print('Collections', collections)
for category, books in collections.items():
if category == 'Im_Reading':
# Create a connection to the sqlite database
connection = sqlite.connect(self._main_prefix + '.kobo/KoboReader.sqlite')
cursor = connection.cursor()
# Reset Im_Reading list in the database
query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null'
try:
cursor.execute (query)
except:
debug_print('Database Exception: Unable to reset Im_Reading list')
raise
else:
# debug_print('Commit: Reset Im_Reading list')
connection.commit()
for book in books:
# debug_print('Title:', book.title, 'lpath:', book.path)
book.device_collections = ['Im_Reading']
extension = os.path.splitext(book.path)[1]
ContentType = self.get_content_type_from_extension(extension)
ContentID = self.contentid_from_path(book.path, ContentType)
datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())
t = (datelastread,ContentID,)
try:
cursor.execute('update content set ReadStatus=1,FirstTimeReading=\'false\',DateLastRead=? where BookID is Null and ContentID = ?', t)
except:
debug_print('Database Exception: Unable to create Im_Reading list')
raise
else:
connection.commit()
# debug_print('Database: Commit create Im_Reading list')
cursor.close()
connection.close()
# debug_print('Finished update_device_database_collections', collections_attributes)
def sync_booklists(self, booklists, end_session=True):
# debug_print('KOBO: started sync_booklists')
paths = self.get_device_paths()
blists = {}
for i in paths:
if booklists[i] is not None:
#debug_print('Booklist: ', i)
blists[i] = booklists[i]
opts = self.settings()
if opts.extra_customization:
collections = [x.lower().strip() for x in
opts.extra_customization.split(',')]
else:
collections = []
#debug_print('KOBO: collection fields:', collections)
for i, blist in blists.items():
self.update_device_database_collections(blist, collections)
USBMS.sync_booklists(self, booklists, end_session=end_session)
#debug_print('KOBO: finished sync_booklists')
def rebuild_collections(self, booklist, oncard):
collections_attributes = []
self.update_device_database_collections(booklist, collections_attributes)
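All of the collection management above boils down to two UPDATE statements against the reader's own database. When debugging a sync, the resulting I'm Reading list can be inspected with a read-only query built from the same schema (a sketch; the database path and column names are taken from the driver code above):

    import sqlite3

    def dump_im_reading(main_prefix):
        # Same database the driver writes: <device>/.kobo/KoboReader.sqlite
        connection = sqlite3.connect(main_prefix + '.kobo/KoboReader.sqlite')
        cursor = connection.cursor()
        cursor.execute('select ContentID, DateLastRead from content '
                       'where BookID is Null and ReadStatus = 1')
        for contentid, datelastread in cursor.fetchall():
            print contentid, datelastread
        connection.close()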

View File

@@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
'preprocess_html',
'preprocess_html', 'html_unwrap_factor',
]
),

View File

@@ -362,6 +362,15 @@ OptionRecommendation(name='preprocess_html',
)
),
OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
'default is 0.40, just below the median line length. This will unwrap typical books '
' with hard line breaks, but should be reduced if the line length is variable.'
)
),
OptionRecommendation(name='smarten_punctuation',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Convert plain quotes, dashes and ellipsis to their '

View File

@@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
# ¨
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
@@ -351,7 +350,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -399,7 +398,7 @@ class HTMLPreProcessor(object):
html = unidecoder.decode(html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
html = self.input_plugin_preprocess(self.extra_opts, html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)

View File

@@ -11,10 +11,11 @@ from calibre.utils.logging import default_log
class PreProcessor(object):
def __init__(self, log=None):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
self.extra_opts = extra_opts
def chapter_head(self, match):
chap = match.group('chap')
@@ -76,6 +77,32 @@ class PreProcessor(object):
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub('\g<text>', html)
html = separate_paragraphs_single_line(html)
html = preserve_spaces(html)
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ######
#
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html)
@@ -85,30 +112,27 @@ class PreProcessor(object):
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span tags
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
# Get rid of empty span, bold, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
if len(lines) > 1:
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40:
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html)
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
@@ -116,7 +140,7 @@ class PreProcessor(object):
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
@@ -125,10 +149,10 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
# Unwrap lines
###### Unwrap lines ######
#
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
@@ -147,27 +171,30 @@ class PreProcessor(object):
format = 'html'
# Calculate Length
length = line_length(format, html, 0.4)
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4))
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
#
# Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctation if the median length of all lines is less than 200
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
# Unwrap lines using punctation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
#self.log(html)
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
return html
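The html_unwrap_factor introduced above feeds line_length, which (roughly) sorts the lengths of all marked-up lines and returns the length at the requested percentile; the unwrap regex then joins lines longer than that threshold which end without terminating punctuation. An illustrative re-implementation of the percentile step (calibre's real line_length also normalizes markup before measuring):

    import re

    def approx_line_length(html, percent):
        # Measure every <p>...</p> line, sort, and return the length at
        # the requested percentile (0 < percent <= 1).
        lengths = [len(l.strip()) for l in
                   re.findall(r'(?<=<p>).*?(?=</p>)', html, re.DOTALL | re.IGNORECASE)]
        lengths = sorted(l for l in lengths if l > 0)
        if not lengths:
            return 0
        index = int(len(lengths) * percent) - 1
        return lengths[max(index, 0)]

    # At the default factor of 0.40 the threshold sits just below the median,
    # so a typical hard-wrapped book (most lines the same width) unwraps,
    # while lowering the factor makes unwrapping more conservative when
    # line lengths vary.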

View File

@@ -490,7 +490,8 @@ class HTMLInput(InputFormatPlugin):
return (None, None)
return (None, raw)
def preprocess_html(self, html):
preprocessor = PreProcessor(log=getattr(self, 'log', None))
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@@ -53,7 +53,8 @@ class LITInput(InputFormatPlugin):
pre.append(ne)
def preprocess_html(self, html):
preprocessor = PreProcessor(log=getattr(self, 'log', None))
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@@ -12,6 +12,7 @@ from copy import deepcopy
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type
class Canvas(etree.XSLTExtension):
@ -419,4 +420,10 @@ class LRFInput(InputFormatPlugin):
styles.write()
return os.path.abspath('content.opf')
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@@ -39,11 +39,11 @@ class MOBIInput(InputFormatPlugin):
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
def preprocess_html(self, html):
def preprocess_html(self, options, html):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
return html

View File

@@ -229,7 +229,7 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)

View File

@ -26,8 +26,10 @@ class StructureDetectionWidget(Widget, Ui_Form):
'remove_first_image',
'insert_metadata', 'page_breaks_before',
'preprocess_html', 'remove_header', 'header_regex',
- 'remove_footer', 'footer_regex']
+ 'remove_footer', 'footer_regex','html_unwrap_factor']
)
self.opt_html_unwrap_factor.setEnabled(False)
self.huf_label.setEnabled(False)
self.db, self.book_id = db, book_id
for x in ('pagebreak', 'rule', 'both', 'none'):
self.opt_chapter_mark.addItem(x)
@ -64,3 +66,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
_('The XPath expression %s is invalid.')%x.text).exec_()
return False
return True
def set_value_handler(self, g, val):
if val is None and g is self.opt_html_unwrap_factor:
g.setValue(0.0)
return True

View File

@ -14,10 +14,10 @@
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0" colspan="2">
<item row="0" column="1" colspan="2">
<widget class="XPathEdit" name="opt_chapter" native="true"/>
</item>
<item row="1" column="0">
<item row="1" column="0" colspan="2">
<widget class="QLabel" name="label">
<property name="text">
<string>Chapter &amp;mark:</string>
@ -27,31 +27,31 @@
</property>
</widget>
</item>
<item row="1" column="1">
<item row="1" column="2">
<widget class="QComboBox" name="opt_chapter_mark">
<property name="minimumContentsLength">
<number>20</number>
</property>
</widget>
</item>
<item row="2" column="0">
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remove_first_image">
<property name="text">
<string>Remove first &amp;image</string>
</property>
</widget>
</item>
<item row="4" column="0">
<item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_insert_metadata">
<property name="text">
<string>Insert &amp;metadata as page at start of book</string>
</property>
</widget>
</item>
<item row="10" column="0" colspan="2">
<item row="11" column="0" colspan="3">
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
</item>
<item row="11" column="0" colspan="2">
<item row="12" column="0" colspan="3">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -64,27 +64,66 @@
</property>
</spacer>
</item>
<item row="7" column="0">
<item row="8" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remove_footer">
<property name="text">
<string>Remove F&amp;ooter</string>
</property>
</widget>
</item>
<item row="5" column="0">
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remove_header">
<property name="text">
<string>Remove H&amp;eader</string>
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<item row="7" column="0" colspan="3">
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
</item>
<item row="8" column="0" colspan="2">
<item row="9" column="0" colspan="3">
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
</item>
<item row="3" column="0">
<item row="4" column="1">
<widget class="QLabel" name="huf_label">
<property name="text">
<string>Line &amp;un-wrap factor during preprocess:</string>
</property>
<property name="buddy">
<cstring>opt_html_unwrap_factor</cstring>
</property>
</widget>
</item>
<item row="4" column="2">
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
<property name="toolTip">
<string/>
</property>
<property name="maximum">
<double>1.000000000000000</double>
</property>
<property name="singleStep">
<double>0.050000000000000</double>
</property>
<property name="value">
<double>0.400000000000000</double>
</property>
</widget>
</item>
<item row="4" column="0">
<spacer name="horizontalSpacer">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>40</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preprocess_html">
<property name="text">
<string>&amp;Preprocess input file to possibly improve structure detection</string>
@ -108,5 +147,38 @@
</customwidget>
</customwidgets>
<resources/>
- <connections/>
+ <connections>
<connection>
<sender>opt_preprocess_html</sender>
<signal>toggled(bool)</signal>
<receiver>opt_html_unwrap_factor</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>328</x>
<y>87</y>
</hint>
<hint type="destinationlabel">
<x>481</x>
<y>113</y>
</hint>
</hints>
</connection>
<connection>
<sender>opt_preprocess_html</sender>
<signal>toggled(bool)</signal>
<receiver>huf_label</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>295</x>
<y>88</y>
</hint>
<hint type="destinationlabel">
<x>291</x>
<y>105</y>
</hint>
</hints>
</connection>
</connections>
</ui>
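For readers unfamiliar with Designer's <connections> section: the two connections added above are what keep the unwrap-factor controls disabled unless preprocessing is on. In PyQt4 code they would read roughly as follows (a sketch; in calibre the generated UI class makes these connections):

    # QCheckBox.toggled(bool) drives QWidget.setEnabled(bool), so the spin
    # box and its label follow the state of the "Preprocess input" checkbox.
    self.opt_preprocess_html.toggled.connect(self.opt_html_unwrap_factor.setEnabled)
    self.opt_preprocess_html.toggled.connect(self.huf_label.setEnabled)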

View File

@ -4,8 +4,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Dialog to edit metadata in bulk'''
from threading import Thread
import re
from PyQt4.Qt import QDialog, QGridLayout
from PyQt4 import QtGui
from calibre.gui2.dialogs.metadata_bulk_ui import Ui_MetadataBulkDialog
from calibre.gui2.dialogs.tag_editor import TagEditor
@ -83,7 +85,6 @@ class Worker(Thread):
w.commit(self.ids)
self.db.bulk_modify_tags(self.ids, add=add, remove=remove,
notify=False)
self.db.clean()
def run(self):
try:
@ -101,6 +102,13 @@ class Worker(Thread):
class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
s_r_functions = {
'' : lambda x: x,
_('Lower Case') : lambda x: x.lower(),
_('Upper Case') : lambda x: x.upper(),
_('Title Case') : lambda x: x.title(),
}
def __init__(self, window, rows, db):
QDialog.__init__(self, window)
Ui_MetadataBulkDialog.__init__(self)
@ -127,12 +135,189 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.series.currentIndexChanged[int].connect(self.series_changed)
self.series.editTextChanged.connect(self.series_changed)
self.tag_editor_button.clicked.connect(self.tag_editor)
if len(db.custom_column_label_map) == 0:
self.central_widget.tabBar().setVisible(False)
self.central_widget.removeTab(1)
else:
self.create_custom_column_editors()
self.prepare_search_and_replace()
self.exec_()
def prepare_search_and_replace(self):
self.search_for.initialize('bulk_edit_search_for')
self.replace_with.initialize('bulk_edit_replace_with')
self.test_text.initialize('bulk_edit_test_test')
fields = ['']
fm = self.db.field_metadata
for f in fm:
if (f in ['author_sort'] or (
fm[f]['datatype'] == 'text' or fm[f]['datatype'] == 'series')
and fm[f].get('search_terms', None)
and f not in ['formats', 'ondevice']):
fields.append(f)
fields.sort()
self.search_field.addItems(fields)
self.search_field.setMaxVisibleItems(min(len(fields), 20))
offset = 10
self.s_r_number_of_books = min(7, len(self.ids))
for i in range(1,self.s_r_number_of_books+1):
w = QtGui.QLabel(self.tabWidgetPage3)
w.setText(_('Book %d:')%i)
self.gridLayout1.addWidget(w, i+offset, 0, 1, 1)
w = QtGui.QLineEdit(self.tabWidgetPage3)
w.setReadOnly(True)
name = 'book_%d_text'%i
setattr(self, name, w)
w.setObjectName(name)
self.gridLayout1.addWidget(w, i+offset, 1, 1, 1)
w = QtGui.QLineEdit(self.tabWidgetPage3)
w.setReadOnly(True)
name = 'book_%d_result'%i
setattr(self, name, w)
w.setObjectName(name)
self.gridLayout1.addWidget(w, i+offset, 2, 1, 1)
self.s_r_heading.setText('<p>'+
_('Search and replace in text fields using '
'regular expressions. The search text is an '
'arbitrary Python-compatible regular expression. '
'The replacement text can contain backreferences '
'to parenthesized expressions in the pattern. '
'The search is not anchored, and can match and '
'replace multiple times on the same string. See '
'<a href="http://docs.python.org/library/re.html"> '
'this reference</a> '
'for more information, and in particular the \'sub\' '
'function.') + '<p>' + _(
'Note: <b>you can destroy your library</b> '
'using this feature. Changes are permanent. There '
'is no undo function. You are strongly encouraged '
'to back up your library before proceeding.'))
self.s_r_error = None
self.s_r_obj = None
self.replace_func.addItems(sorted(self.s_r_functions.keys()))
self.search_field.currentIndexChanged[str].connect(self.s_r_field_changed)
self.replace_func.currentIndexChanged[str].connect(self.s_r_paint_results)
self.search_for.editTextChanged[str].connect(self.s_r_paint_results)
self.replace_with.editTextChanged[str].connect(self.s_r_paint_results)
self.test_text.editTextChanged[str].connect(self.s_r_paint_results)
self.central_widget.setCurrentIndex(0)
def s_r_field_changed(self, txt):
txt = unicode(txt)
for i in range(0, self.s_r_number_of_books):
if txt:
fm = self.db.field_metadata[txt]
id = self.ids[i]
val = self.db.get_property(id, index_is_id=True,
loc=fm['rec_index'])
if val is None:
val = ''
if fm['is_multiple']:
val = [t.strip() for t in val.split(fm['is_multiple']) if t.strip()]
if val:
val.sort(cmp=lambda x,y: cmp(x.lower(), y.lower()))
val = val[0]
else:
val = ''
else:
val = ''
w = getattr(self, 'book_%d_text'%(i+1))
w.setText(val)
self.s_r_paint_results(None)
def s_r_set_colors(self):
if self.s_r_error is not None:
col = 'rgb(255, 0, 0, 20%)'
self.test_result.setText(self.s_r_error.message)
else:
col = 'rgb(0, 255, 0, 20%)'
self.test_result.setStyleSheet('QLineEdit { color: black; '
'background-color: %s; }'%col)
for i in range(0,self.s_r_number_of_books):
getattr(self, 'book_%d_result'%(i+1)).setText('')
def s_r_func(self, match):
rf = self.s_r_functions[unicode(self.replace_func.currentText())]
rv = unicode(self.replace_with.text())
val = match.expand(rv)
return rf(val)
def s_r_paint_results(self, txt):
self.s_r_error = None
self.s_r_set_colors()
try:
self.s_r_obj = re.compile(unicode(self.search_for.text()))
except re.error as e:
self.s_r_obj = None
self.s_r_error = e
self.s_r_set_colors()
return
try:
self.test_result.setText(self.s_r_obj.sub(self.s_r_func,
unicode(self.test_text.text())))
except re.error as e:
self.s_r_error = e
self.s_r_set_colors()
return
for i in range(0,self.s_r_number_of_books):
wt = getattr(self, 'book_%d_text'%(i+1))
wr = getattr(self, 'book_%d_result'%(i+1))
try:
wr.setText(self.s_r_obj.sub(self.s_r_func, unicode(wt.text())))
except re.error as e:
self.s_r_error = e
self.s_r_set_colors()
break
def do_search_replace(self):
field = unicode(self.search_field.currentText())
if not field or not self.s_r_obj:
return
fm = self.db.field_metadata[field]
def apply_pattern(val):
try:
return self.s_r_obj.sub(self.s_r_func, val)
except:
return val
for id in self.ids:
val = self.db.get_property(id, index_is_id=True,
loc=fm['rec_index'])
if val is None:
continue
if fm['is_multiple']:
res = []
for val in [t.strip() for t in val.split(fm['is_multiple'])]:
v = apply_pattern(val).strip()
if v:
res.append(v)
val = res
if fm['is_custom']:
# The standard tags and authors setters expect lists;
# all custom columns expect strings
val = fm['is_multiple'].join(val)
else:
val = apply_pattern(val)
if fm['is_custom']:
extra = self.db.get_custom_extra(id, label=fm['label'], index_is_id=True)
self.db.set_custom(id, val, label=fm['label'], extra=extra,
commit=False)
else:
if field == 'comments':
setter = self.db.set_comment
else:
setter = getattr(self.db, 'set_'+field)
setter(id, val, notify=False, commit=False)
self.db.commit()
def create_custom_column_editors(self):
w = self.central_widget.widget(1)
layout = QGridLayout()
@ -193,6 +378,11 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
if len(self.ids) < 1:
return QDialog.accept(self)
if self.s_r_error is not None:
error_dialog(self, _('Search/replace invalid'),
_('Search pattern is invalid: %s')%self.s_r_error.message,
show=True)
return False
self.changed = bool(self.ids)
# Cache values from GUI so that Qt widgets are not used in
# non GUI thread
@ -234,6 +424,10 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
return error_dialog(self, _('Failed'),
self.worker.error[0], det_msg=self.worker.error[1],
show=True)
self.do_search_replace()
self.db.clean()
return QDialog.accept(self)
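A standalone sketch (an editor's illustration) of the replace machinery above: the replacement string is expanded against the match's backreferences, then the selected post-replace function from s_r_functions is applied, mirroring s_r_func:

    import re

    title_case = lambda x: x.title()         # one of the s_r_functions
    pattern = re.compile(r'(\w+), (\w+)')    # e.g. swap "last, first"

    def repl(match):
        # match.expand() resolves backreferences, as s_r_func does above
        return title_case(match.expand(r'\2 \1'))

    print(pattern.sub(repl, 'dickens, charles'))   # -> Charles Dickens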

View File

@ -6,8 +6,8 @@
<rect>
<x>0</x>
<y>0</y>
- <width>526</width>
- <height>499</height>
+ <width>679</width>
+ <height>685</height>
</rect>
</property>
<property name="windowTitle">
@ -200,14 +200,15 @@
</item>
<item row="6" column="2">
<widget class="QCheckBox" name="remove_all_tags">
<property name="text">
<string>Remove all</string>
</property>
<property name="toolTip">
<string>Check this box to remove all tags from the books.</string>
</property>
<property name="text">
<string>Remove all</string>
</property>
</widget>
</item><item row="7" column="0">
</item>
<item row="7" column="0">
<widget class="QLabel" name="label_7">
<property name="text">
<string>&amp;Series:</string>
@ -294,6 +295,19 @@ Future conversion of these books will use the default settings.</string>
</property>
</widget>
</item>
<item row="12" column="0" colspan="3">
<spacer name="verticalSpacer_2">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
<widget class="QWidget" name="tab">
@ -301,6 +315,128 @@ Future conversion of these books will use the default settings.</string>
<string>&amp;Custom metadata</string>
</attribute>
</widget>
<widget class="QWidget" name="tabWidgetPage3">
<attribute name="title">
<string>&amp;Search and replace (experimental)</string>
</attribute>
<layout class="QGridLayout" name="gridLayout">
<property name="sizeConstraint">
<enum>QLayout::SetMinimumSize</enum>
</property>
<item row="1" column="0" colspan="3">
<widget class="QLabel" name="s_r_heading">
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="filler">
<property name="text">
<string/>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="xlabel_21">
<property name="text">
<string>Search &amp;field:</string>
</property>
<property name="buddy">
<cstring>search_field</cstring>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QLabel" name="xlabel_2">
<property name="text">
<string>&amp;Search for:</string>
</property>
<property name="buddy">
<cstring>search_for</cstring>
</property>
</widget>
</item>
<item row="3" column="2">
<widget class="QLabel" name="xlabel_4">
<property name="text">
<string>&amp;Replace with:</string>
</property>
<property name="buddy">
<cstring>replace_with</cstring>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QComboBox" name="search_field"/>
</item>
<item row="4" column="1">
<widget class="HistoryLineEdit" name="search_for"/>
</item>
<item row="4" column="2">
<widget class="HistoryLineEdit" name="replace_with"/>
</item>
<item row="5" column="1">
<widget class="QLabel" name="label_41">
<property name="text">
<string>Apply function &amp;after replace:</string>
</property>
<property name="buddy">
<cstring>replace_func</cstring>
</property>
</widget>
</item>
<item row="5" column="2">
<widget class="QComboBox" name="replace_func"/>
</item>
<item row="6" column="1">
<widget class="QLabel" name="xlabel_3">
<property name="text">
<string>Test &amp;text</string>
</property>
<property name="buddy">
<cstring>test_text</cstring>
</property>
</widget>
</item>
<item row="6" column="2">
<widget class="QLabel" name="label_5">
<property name="text">
<string>Test re&amp;sult</string>
</property>
<property name="buddy">
<cstring>test_result</cstring>
</property>
</widget>
</item>
<item row="7" column="0">
<widget class="QLabel" name="label_31">
<property name="text">
<string>Your test:</string>
</property>
</widget>
</item>
<item row="7" column="1">
<widget class="HistoryLineEdit" name="test_text"/>
</item>
<item row="7" column="2">
<widget class="QLineEdit" name="test_result"/>
</item>
<item row="20" column="1">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
</widget>
</item>
</layout>
@ -333,6 +469,11 @@ Future conversion of these books will use the default settings.</string>
<extends>QLineEdit</extends>
<header>widgets.h</header>
</customwidget>
<customwidget>
<class>HistoryLineEdit</class>
<extends>QLineEdit</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<tabstops>
<tabstop>authors</tabstop>
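The HistoryLineEdit <customwidget> registration above makes the form compiler instantiate calibre's promoted widget instead of a plain QLineEdit. Roughly (a sketch, assuming the 'widgets.h' header resolves to calibre.gui2.widgets as for the form's other custom widgets):

    from calibre.gui2.widgets import HistoryLineEdit

    # The generated Ui_MetadataBulkDialog does the equivalent of:
    self.search_for = HistoryLineEdit(self.tabWidgetPage3)
    self.search_for.setObjectName('search_for')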

View File

@ -6,10 +6,7 @@ The dialog used to edit meta information for a book as well as
add/remove formats
'''
- import os
- import re
- import time
- import traceback
+ import os, re, time, traceback, textwrap
from PyQt4.Qt import SIGNAL, QObject, Qt, QTimer, QThread, QDate, \
QPixmap, QListWidgetItem, QDialog, pyqtSignal
@ -331,6 +328,14 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
ResizableDialog.__init__(self, window)
self.bc_box.layout().setAlignment(self.cover, Qt.AlignCenter|Qt.AlignHCenter)
self.cancel_all = False
base = unicode(self.author_sort.toolTip())
self.ok_aus_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+
_(' The green color indicates that the current '
'author sort matches the current author'))
self.bad_aus_tooltip = '<p>'+textwrap.fill(base + '<br><br>'+
_(' The red color indicates that the current '
'author sort does not match the current author'))
if cancel_all:
self.__abort_button = self.button_box.addButton(self.button_box.Abort)
self.__abort_button.setToolTip(_('Abort the editing of all remaining books'))
@ -375,6 +380,10 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
self.remove_unused_series)
QObject.connect(self.auto_author_sort, SIGNAL('clicked()'),
self.deduce_author_sort)
self.connect(self.author_sort, SIGNAL('textChanged(const QString&)'),
self.author_sort_box_changed)
self.connect(self.authors, SIGNAL('editTextChanged(const QString&)'),
self.authors_box_changed)
self.connect(self.formats, SIGNAL('itemDoubleClicked(QListWidgetItem*)'),
self.show_format)
self.connect(self.formats, SIGNAL('delete_format()'), self.remove_format)
@ -467,6 +476,28 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
for c in range(2, len(ans[i].widgets), 2):
w.setTabOrder(ans[i].widgets[c-1], ans[i].widgets[c+1])
def authors_box_changed(self, txt):
aus = unicode(txt)
aus = re.sub(r'\s+et al\.$', '', aus)
aus = self.db.author_sort_from_authors(string_to_authors(aus))
self.mark_author_sort(normal=(unicode(self.author_sort.text()) == aus))
def author_sort_box_changed(self, txt):
au = unicode(self.authors.text())
au = re.sub(r'\s+et al\.$', '', au)
au = self.db.author_sort_from_authors(string_to_authors(au))
self.mark_author_sort(normal=(au == txt))
def mark_author_sort(self, normal=True):
if normal:
col = 'rgb(0, 255, 0, 20%)'
else:
col = 'rgb(255, 0, 0, 20%)'
self.author_sort.setStyleSheet('QLineEdit { color: black; '
'background-color: %s; }'%col)
tt = self.ok_aus_tooltip if normal else self.bad_aus_tooltip
self.author_sort.setToolTip(tt)
def validate_isbn(self, isbn):
isbn = unicode(isbn).strip()
if not isbn:
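Taken together, the two handlers above implement a symmetric check between the authors box and the author-sort box. A standalone sketch (the author_sort helper here is an invented stand-in; calibre itself uses db.author_sort_from_authors and string_to_authors):

    import re

    def author_sort(name):
        # Toy stand-in for calibre's author_sort_from_authors
        parts = name.split()
        return parts[-1] + ', ' + ' '.join(parts[:-1]) if len(parts) > 1 else name

    typed_authors = 'Charles Dickens et al.'
    typed_sort = 'Dickens, Charles'
    base = re.sub(r'\s+et al\.$', '', typed_authors)   # trailing "et al." is ignored
    matches = (author_sort(base) == typed_sort)        # True -> box turns green
    print(matches)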

View File

@ -151,14 +151,16 @@
<item>
<widget class="EnLineEdit" name="author_sort">
<property name="toolTip">
- <string>Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles.</string>
+ <string>Specify how the author(s) of this book should be sorted. For example Charles Dickens should be sorted as Dickens, Charles.
+ If the box is colored green, then the text matches the individual authors' sort strings. If it is colored red, then the authors and this text do not match.</string>
</property>
</widget>
</item>
<item>
<widget class="QToolButton" name="auto_author_sort">
<property name="toolTip">
- <string>Automatically create the author sort entry based on the current author entry</string>
+ <string>Automatically create the author sort entry based on the current author entry.
+ Using this button to create the author sort will change its color from red to green.</string>
</property>
<property name="text">
<string>...</string>

View File

@ -143,6 +143,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
SchemaUpgrade.__init__(self)
self.initialize_dynamic()
+ def get_property(self, idx, index_is_id=False, loc=-1):
+     row = self.data._data[idx] if index_is_id else self.data[idx]
+     if row is not None:
+         return row[loc]
def initialize_dynamic(self):
self.field_metadata = FieldMetadata() #Ensure we start with a clean copy
self.prefs = DBPrefs(self)
@ -324,17 +329,12 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.last_update_check = self.last_modified()
- def get_property(idx, index_is_id=False, loc=-1):
-     row = self.data._data[idx] if index_is_id else self.data[idx]
-     if row is not None:
-         return row[loc]
for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn',
'publisher', 'rating', 'series', 'series_index', 'tags',
'title', 'timestamp', 'uuid', 'pubdate', 'ondevice'):
- setattr(self, prop, functools.partial(get_property,
+ setattr(self, prop, functools.partial(self.get_property,
loc=self.FIELD_MAP['comments' if prop == 'comment' else prop]))
- setattr(self, 'title_sort', functools.partial(get_property,
+ setattr(self, 'title_sort', functools.partial(self.get_property,
loc=self.FIELD_MAP['sort']))
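In effect, the loop above turns each metadata field into a bound accessor on the database object. A sketch of what it produces (db and book_id are assumed stand-ins for a LibraryDatabase2 instance and a library id):

    # db.title becomes functools.partial(db.get_property, loc=db.FIELD_MAP['title']),
    # so the following two calls are equivalent:
    title = db.title(book_id, index_is_id=True)
    title = db.get_property(book_id, index_is_id=True, loc=db.FIELD_MAP['title'])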
def initialize_database(self):
@ -439,7 +439,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
if not f:
continue
stream = cStringIO.StringIO(f)
- self.add_format(id, format, stream, index_is_id=True, path=tpath)
+ self.add_format(id, format, stream, index_is_id=True,
+     path=tpath, notify=False)
self.conn.execute('UPDATE books SET path=? WHERE id=?', (path, id))
if commit:
self.conn.commit()
@ -1157,7 +1158,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
result.append(r)
return ' & '.join(result).replace('|', ',')
- def set_authors(self, id, authors, notify=True):
+ def set_authors(self, id, authors, notify=True, commit=True):
'''
`authors`: A list of authors.
'''
@ -1185,16 +1186,17 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
ss = self.author_sort_from_book(id, index_is_id=True)
self.conn.execute('UPDATE books SET author_sort=? WHERE id=?',
(ss, id))
- self.conn.commit()
+ if commit:
+     self.conn.commit()
self.data.set(id, self.FIELD_MAP['authors'],
','.join([a.replace(',', '|') for a in authors]),
row_is_id=True)
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
- self.set_path(id, True)
+ self.set_path(id, index_is_id=True, commit=commit)
if notify:
self.notify('metadata', [id])
- def set_title(self, id, title, notify=True):
+ def set_title(self, id, title, notify=True, commit=True):
if not title:
return
if not isinstance(title, unicode):
@ -1205,8 +1207,9 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.data.set(id, self.FIELD_MAP['sort'], title_sort(title), row_is_id=True)
else:
self.data.set(id, self.FIELD_MAP['sort'], title, row_is_id=True)
- self.set_path(id, True)
- self.conn.commit()
+ self.set_path(id, index_is_id=True, commit=commit)
+ if commit:
+     self.conn.commit()
if notify:
self.notify('metadata', [id])
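The new commit= and notify= parameters exist so that bulk operations, such as the search and replace dialog above, can batch many field updates into one transaction. A sketch of the pattern (db is a LibraryDatabase2 instance; changes is a hypothetical {book_id: (title, authors)} mapping):

    for book_id, (title, authors) in changes.items():
        # Defer both the SQLite commit and the GUI notification per book...
        db.set_title(book_id, title, notify=False, commit=False)
        db.set_authors(book_id, authors, notify=False, commit=False)
    # ...then flush everything in one transaction at the end.
    db.conn.commit()
    db.clean()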

View File

@ -329,6 +329,17 @@ There are a few more options in this section.
of as a separate cover. If you also specify a cover in |app|, then the converted book will have
two covers. This option will simply remove the first image from the source document, thereby
ensuring that the converted book has only one cover, the one specified in |app|.
:guilabel:`Preprocess input`
    This option activates various algorithms that try to detect and correct common problems in
    badly formatted input documents, such as hard line breaks and large blocks of text with no formatting.
    Turn this option on if your input document suffers from bad formatting, but be aware that in
    some cases it can lead to worse results, so use it with care.

:guilabel:`Line-unwrap factor`
    This option controls the algorithm |app| uses to remove hard line breaks. For example, a value of
    0.4 means that calibre will remove hard line breaks from the end of lines whose lengths are less
    than the lengths of 40% of the lines in the document.
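A worked illustration of that rule (an editor's sketch of the threshold, not calibre's exact implementation): with a factor of 0.4, the cut-off behaves like the 40th percentile of line lengths::

    lines = text.splitlines()                     # text: the document being converted
    lengths = sorted(len(l) for l in lines)
    threshold = lengths[int(0.4 * len(lengths))]  # unwrap factor 0.4
    # A hard break at the end of a line shorter than `threshold` is a
    # candidate for removal; raising the factor unwraps more aggressively.
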
Table of Contents
------------------

View File

@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
then the problem is probably a corrupted font cache. You can clear the cache by following these
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
- solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
+ solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
+ check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File
+ menu, choose "Validate fonts".
My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

File diff suppressed because it is too large (22 files)

View File

@ -10,6 +10,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.constants import numeric_version
from calibre import walk
class RecipeDisabled(Exception):
pass
@ -111,6 +112,10 @@ class RecipeInput(InputFormatPlugin):
if f.endswith('.opf'):
return os.path.abspath(f)
for f in walk('.'):
if f.endswith('.opf'):
return os.path.abspath(f)
def postprocess_book(self, oeb, opts, log):
if self.recipe_object is not None:
self.recipe_object.postprocess_book(oeb, opts, log)
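calibre.walk (imported above) recursively yields every file under a directory, which is what lets the new fallback find an OPF that is not at the top level. A sketch of the same search on its own:

    import os
    from calibre import walk

    # First .opf anywhere under the current directory, else None
    opf = next((os.path.abspath(f) for f in walk('.') if f.endswith('.opf')), None)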