Sync to ldolse heuristics branch.

2025-07-09 03:04:10 -04:00 · 2011-01-16 08:54:31 -05:00 · 2011-01-16 08:54:31 -05:00 · 51e7a555e1
commit 51e7a555e1
parent 9134d51377 e0d1de2ce8
28 changed files with 450 additions and 249 deletions
--- a/resources/recipes/mail_and_guardian.recipe
+++ b/resources/recipes/mail_and_guardian.recipe
@ -0,0 +1,32 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295081935(BasicNewsRecipe):
+     title          = u'Mail & Guardian ZA News'
+     __author__     = '77ja65'
+     language = 'en'
+     oldest_article = 7
+     max_articles_per_feed = 30
+     no_stylesheets = True
+     masthead_url          =  'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
+     remove_tags_after = [dict(id='content')]
+
+     feeds          = [
+         (u'National News', u'http://www.mg.co.za/rss/national'),
+         (u'Top Stories', u'http://www.mg.co.za/rss'),
+         (u'Africa News', u'http://www.mg.co.za/rss/africa'),
+         (u'Sport', u'http://www.mg.co.za/rss/sport'),
+         (u'Business', u'http://www.mg.co.za/rss/business'),
+         (u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
+         (u'World News', u'http://www.mg.co.za/rss/world')
+         ]
+
+     def print_version(self, url):
+           return url.replace('http://www.mg.co.za/article/',
+ 'http://www.mg.co.za/printformat/single/')
+
+     extra_css = '''
+                     h1{font-family:Arial,Helvetica,sans-serif; font-
+ weight:bold;font-size:large;}
+                     h2{font-family:Arial,Helvetica,sans-serif; font-
+ weight:normal;font-size:small;}
+                 '''
--- a/resources/recipes/sportsillustrated.recipe
+++ b/resources/recipes/sportsillustrated.recipe
@ -1,5 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from urllib import quote

 class SportsIllustratedRecipe(BasicNewsRecipe) :
@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
        #   expire : no idea what value to use
        # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js

-    def preprocess_html(self, soup):
+    '''def preprocess_html(self, soup):
        header = soup.find('div', attrs = {'class' : 'siv_artheader'})
        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
        body = homeMadeSoup.body
@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
            body.append(para)

        return homeMadeSoup
+        '''

--- a/resources/template-functions.json
+++ b/resources/template-functions.json
@ -0,0 +1,28 @@
+{
+    "contains": "def evaluate(self, formatter, kwargs, mi, locals,\n             val, test, value_if_present, value_if_not):\n    if re.search(test, val):\n        return value_if_present\n    else:\n        return value_if_not\n", 
+    "divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n    x = float(x if x else 0)\n    y = float(y if y else 0)\n    return unicode(x / y)\n", 
+    "uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n    return val.upper()\n", 
+    "strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n    i = 0\n    res = ''\n    for i in range(0, len(args)):\n        res += args[i]\n    return res\n", 
+    "substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n    return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n", 
+    "ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n    if val:\n        return val\n    else:\n        return value_if_empty\n", 
+    "field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n    return formatter.get_value(name, [], kwargs)\n", 
+    "capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n    return capitalize(val)\n", 
+    "list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n    if not val:\n        return ''\n    index = int(index)\n    val = val.split(sep)\n    try:\n        return val[index]\n    except:\n        return ''\n", 
+    "shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n             val, leading, center_string, trailing):\n    l = max(0, int(leading))\n    t = max(0, int(trailing))\n    if len(val) > l + len(center_string) + t:\n        return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n    else:\n        return val\n", 
+    "re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n    return re.sub(pattern, replacement, val)\n", 
+    "add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n    x = float(x if x else 0)\n    y = float(y if y else 0)\n    return unicode(x + y)\n", 
+    "lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n    if len(args) == 2: # here for backwards compatibility\n        if val:\n            return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n        else:\n            return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n    if (len(args) % 2) != 1:\n        raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n    i = 0\n    while i < len(args):\n        if i + 1 >= len(args):\n            return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n        if re.search(args[i], val):\n            return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n        i += 2\n", 
+    "template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n    template = template.replace('[[', '{').replace(']]', '}')\n    return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n", 
+    "print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n    print args\n    return None\n", 
+    "titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n    return titlecase(val)\n", 
+    "test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n    if val:\n        return value_if_set\n    else:\n        return value_not_set\n", 
+    "eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n    from formatter import eval_formatter\n    template = template.replace('[[', '{').replace(']]', '}')\n    return eval_formatter.safe_format(template, locals, 'EVAL', None)\n", 
+    "multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n    x = float(x if x else 0)\n    y = float(y if y else 0)\n    return unicode(x * y)\n", 
+    "subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n    x = float(x if x else 0)\n    y = float(y if y else 0)\n    return unicode(x - y)\n", 
+    "count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n    return unicode(len(val.split(sep)))\n", 
+    "lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n    return val.lower()\n", 
+    "assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n    locals[target] = value\n    return value\n", 
+    "switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n    if (len(args) % 2) != 1:\n        raise ValueError(_('switch requires an odd number of arguments'))\n    i = 0\n    while i < len(args):\n        if i + 1 >= len(args):\n            return args[i]\n        if re.search(args[i], val):\n            return args[i+1]\n        i += 2\n", 
+    "strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n    v = strcmp(x, y)\n    if v < 0:\n        return lt\n    if v == 0:\n        return eq\n    return gt\n", 
+    "cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n    x = float(x if x else 0)\n    y = float(y if y else 0)\n    if x < y:\n        return lt\n    if x == y:\n        return eq\n    return gt\n"
+}
--- a/setup/resources.py
+++ b/setup/resources.py
@ -84,6 +84,23 @@ class Resources(Command):

            cPickle.dump(complete, open(dest, 'wb'), -1)

+        self.info('\tCreating template-functions.json')
+        dest = self.j(self.RESOURCES, 'template-functions.json')
+        function_dict = {}
+        import inspect
+        from calibre.utils.formatter_functions import all_builtin_functions
+        for obj in all_builtin_functions:
+            eval_func = inspect.getmembers(obj,
+                    lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
+            try:
+                lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
+            except:
+                continue
+            lines = ''.join(lines)
+            function_dict[obj.name] = lines
+        import json
+        json.dump(function_dict, open(dest, 'wb'), indent=4)
+
    def clean(self):
        for x in ('scripts', 'recipes', 'ebook-convert-complete'):
            x = self.j(self.RESOURCES, x+'.pickle')
--- a/src/calibre/devices/sne/driver.py
+++ b/src/calibre/devices/sne/driver.py
@ -33,6 +33,6 @@ class SNE(USBMS):
    STORAGE_CARD_VOLUME_LABEL = 'SNE Storage Card'

    EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Books'
-    SUPPORTS_SUB_DIRS = True
+    SUPPORTS_SUB_DIRS = False


--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -174,13 +174,19 @@ class Dehyphenator(object):
    retain hyphens.
    '''

-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

    def dehyphenate(self, match):
        firsthalf = match.group('firstpart')
@ -191,31 +197,44 @@ class Dehyphenator(object):
            wraptags = ''
        hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
        dehyphenated = unicode(firsthalf) + unicode(secondhalf)
+        if self.suffixes.match(secondhalf) is None:
            lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + str(hyphenated))
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log("            Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                return firsthalf+u'\u2014'+wraptags+secondhalf

        else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            else:
-                #print "           returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + str(hyphenated))
                return hyphenated

    def __call__(self, html, format, length=1):
@ -228,7 +247,7 @@ class Dehyphenator(object):
        elif format == 'txt':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
@ -512,7 +531,7 @@ class HTMLPreProcessor(object):

        if is_pdftohtml and length > -1:
            # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -322,11 +322,11 @@ class HeuristicProcessor(object):
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Delete microsoft 'smart' tags
        html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
        self.deleted_nbsps = True
        return html

@ -376,17 +376,20 @@ class HeuristicProcessor(object):
        except:
            self.log("Can't get wordcount")

-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
            self.log("flow is too short, not running heuristics")
            return html

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)

+        if self.cleanup_required():
            ###### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix_nbsp_indents must run after this step
            if self.no_markup(html, 0.1):
                self.log("not enough paragraph markers, adding now")
                # markup using text processing
@ -397,6 +400,7 @@ class HeuristicProcessor(object):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
+            # fix_nbsp_indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        # ADE doesn't render <br />, change to empty paragraphs
@ -421,8 +425,6 @@ class HeuristicProcessor(object):
            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
            html = self.blankreg.sub('', html)

-        ###### Unwrap lines ######
-        if getattr(self.extra_opts, 'unwrap_lines', False):
        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
@ -440,6 +442,8 @@ class HeuristicProcessor(object):
        length = docanalysis.line_length(unwrap_factor)
        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
            
+        ###### Unwrap lines ######
+        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log("Unwrapping required, unwrapping Lines")
@ -447,15 +451,16 @@ class HeuristicProcessor(object):
                dehyphenator = Dehyphenator()
                html = dehyphenator(html,'html', length)
                html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html_cleanup', length)

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -285,7 +285,6 @@ class RTFInput(InputFormatPlugin):
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException, e:
-            raise
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

--- a/src/calibre/ebooks/rtf2xml/ParseRtf.py
+++ b/src/calibre/ebooks/rtf2xml/ParseRtf.py
@ -226,10 +226,6 @@ class ParseRtf:
        try:
            return_value = process_tokens_obj.process_tokens()
        except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
            #Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
            in_file = self.__temp_file,
@ -241,13 +237,16 @@ class ParseRtf:
            check_encoding_obj = check_encoding.CheckEncoding(
                    bug_handler = RtfInvalidCodeException,
                        )
-            enc = encode_obj.get_codepage()
-            if enc != 'mac_roman':
-                enc = 'cp' + enc
+            enc = 'cp' + encode_obj.get_codepage()
+            msg = 'Exception in token processing'
            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, str) \
                                    else self.__file.encode('utf-8')
                msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
            raise InvalidRtfException, msg
        delete_info_obj = delete_info.DeleteInfo(
            in_file = self.__temp_file,
--- a/src/calibre/ebooks/rtf2xml/default_encoding.py
+++ b/src/calibre/ebooks/rtf2xml/default_encoding.py
@ -74,9 +74,6 @@ class DefaultEncoding:
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
-        if self.__platform == 'Macintosh':
-            code_page = self.__code_page
-        else:
            code_page = 'ansicpg' + self.__code_page
        return self.__platform, code_page, self.__default_num

@ -94,49 +91,60 @@ class DefaultEncoding:

    def _encoding(self):
        with open(self.__file, 'r') as read_obj:
+            cpfound = False
            if not self.__fetchraw:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'mi<mk<rtfhed-end':
                        break
-                    if self.__token_info == 'cw<ri<ansi-codpg':
-                        #cw<ri<ansi-codpg<nu<10000
-                        self.__code_page = line[20:-1] if int(line[20:-1]) \
-                                            else '1252'
                    if self.__token_info == 'cw<ri<macintosh_':
                        self.__platform = 'Macintosh'
-                        self.__code_page = 'mac_roman'
                    elif self.__token_info == 'cw<ri<pc________':
                        self.__platform = 'IBMPC'
-                        self.__code_page = '437'
                    elif self.__token_info == 'cw<ri<pca_______':
                        self.__platform = 'OS/2'
-                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<ansi-codpg' \
+                        and int(line[20:-1]):
+                            self.__code_page = line[20:-1]
                    if self.__token_info == 'cw<ri<deflt-font':
                        self.__default_num = line[20:-1]
+                        cpfound = True
                        #cw<ri<deflt-font<nu<0
+                if self.__platform != 'Windows' and \
+                        not cpfound:
+                    if self.__platform == 'Macintosh':
+                       self.__code_page = '10000'
+                    elif self.__platform == 'IBMPC':
+                        self.__code_page = '437'
+                    elif self.__platform == 'OS/2':
+                        self.__code_page = '850'
            else:
                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+
                for line in read_obj:
+                    if fenc.search(line):
+                        enc = fenc.search(line).group(1)
                    if fenccp.search(line):
                        cp = fenccp.search(line).group(1)
                        if not int(cp):
                            self.__code_page = cp
+                        cpfound = True
                        break
-                    if fenc.search(line):
-                        enc = fenc.search(line).group(1)
+                if self.__platform != 'Windows' and \
+                        not cpfound:
                    if enc == 'mac':
-                            self.__code_page = 'mac_roman'
+                        self.__code_page = '10000'
                    elif enc == 'pc':
                        self.__code_page = '437'
                    elif enc == 'pca':
                        self.__code_page = '850'

-# if __name__ == '__main__':
-    # encode_obj = DefaultEncoding(
-            # in_file = sys.argv[1],
-            # bug_handler = Exception,
-            # check_raw = True,
-            # )
-    # print encode_obj.get_codepage()
+if __name__ == '__main__':
+    import sys
+    encode_obj = DefaultEncoding(
+            in_file = sys.argv[1],
+            bug_handler = Exception,
+            check_raw = True,
+            )
+    print encode_obj.get_codepage()
--- a/src/calibre/ebooks/rtf2xml/delete_info.py
+++ b/src/calibre/ebooks/rtf2xml/delete_info.py
@ -20,7 +20,7 @@ import sys, os, tempfile
 from calibre.ebooks.rtf2xml import copy

 class DeleteInfo:
-    """Delelet unecessary destination groups"""
+    """Delete unnecessary destination groups"""
    def __init__(self,
            in_file ,
            bug_handler,
@ -31,17 +31,14 @@ class DeleteInfo:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
+        self.__run_level = run_level
+        self.__initiate_allow()
        self.__bracket_count= 0
        self.__ob_count = 0
        self.__cb_count = 0
-        # self.__after_asterisk = False
-        # self.__delete = 0
-        self.__initiate_allow()
        self.__ob = 0
        self.__write_cb = False
-        self.__run_level = run_level
        self.__found_delete = False
-        # self.__list = False

    def __initiate_allow(self):
        """
@ -57,6 +54,8 @@ class DeleteInfo:
                            'cw<an<annotation',
                            'cw<cm<comment___',
                            'cw<it<lovr-table',
+                            # info table
+                            'cw<di<company___',
                            # 'cw<ls<list______',
                        )
        self.__not_allowable = (
@ -116,7 +115,6 @@ class DeleteInfo:
        """
        # Test for {\*}, in which case don't enter
        # delete state
-        # self.__after_asterisk = False # only enter this function once
        self.__found_delete = True
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
@ -128,7 +126,7 @@ class DeleteInfo:
                # not sure what happens here!
                # believe I have a '{\*}
                if self.__run_level > 3:
-                    msg = 'flag problem\n'
+                    msg = 'Flag problem\n'
                    raise self.__bug_handler, msg
                return True
        elif self.__token_info in self.__allowable :
@ -173,8 +171,8 @@ class DeleteInfo:
        Return True for all control words.
        Return False otherwise.
        """
-        if self.__delete_count == self.__cb_count and self.__token_info ==\
-            'cb<nu<clos-brack':
+        if self.__delete_count == self.__cb_count and \
+                self.__token_info == 'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = False
@ -186,32 +184,24 @@ class DeleteInfo:
            return False

    def delete_info(self):
-        """Main method for handling other methods. Read one line in at
+        """Main method for handling other methods. Read one line at
        a time, and determine whether to print the line based on the state."""
        with open(self.__file, 'r') as read_obj:
            with open(self.__write_to, 'w') as self.__write_obj:
                for line in read_obj:
                    #ob<nu<open-brack<0001
-                    to_print = True
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
+                    # Get action to perform
                    action = self.__state_dict.get(self.__state)
                    if not action:
-                        sys.stderr.write(_('No action in dictionary state is "%s" \n')
+                        sys.stderr.write('No action in dictionary state is "%s" \n'
                                % self.__state)
-                    to_print = action(line)
-                    # if self.__after_asterisk:
-                        # to_print = self.__asterisk_func(line)
-                    # elif self.__list:
-                        # self.__in_list_func(line)
-                    # elif self.__delete:
-                        # to_print = self.__delete_func(line)
-                    # else:
-                        # to_print = self.__default_func(line)
-                    if to_print:
+                    # Print if allowed by action
+                    if action(line):
                        self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
--- a/src/calibre/ebooks/rtf2xml/info.py
+++ b/src/calibre/ebooks/rtf2xml/info.py
@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, tempfile
+import sys, os, tempfile, re
+
 from calibre.ebooks.rtf2xml import copy
+
 class Info:
    """
    Make tags for document-information
@ -42,12 +44,14 @@ class Info:
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__text_string = ''
        self.__state = 'before_info_table'
+        self.rmspace = re.compile(r'\s+')
        self.__state_dict = {
        'before_info_table': self.__before_info_table_func,
        'after_info_table': self.__after_info_table_func,
@ -58,27 +62,49 @@ class Info:
        self.__info_table_dict = {
        'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
        'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
+        'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+        'cw<di<manager___'  : (self.__found_tag_with_text_func, 'manager'),
+        'cw<di<company___'  : (self.__found_tag_with_text_func, 'company'),
        'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
+        'cw<di<category__'  : (self.__found_tag_with_text_func, 'category'),
        'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
        'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
-        'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+        'cw<di<linkbase__'  : (self.__found_tag_with_text_func, 'hyperlink-base'),
+        
        'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
        'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
-        'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
+        'cw<di<edit-time_'  : (self.__found_tag_with_tokens_func, 'editing-time'),
+        'cw<di<print-time'  : (self.__found_tag_with_tokens_func, 'printing-time'),
+        'cw<di<backuptime'  : (self.__found_tag_with_tokens_func, 'backup-time'),
+        
        'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
        'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
+        'cw<di<numofchrws'  : (self.__single_field_func, 'number-of-characters-without-space'),
        'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
+        'cw<di<version___'  : (self.__single_field_func, 'version'),
+        'cw<di<intern-ver'  : (self.__single_field_func, 'internal-version-number'),
+        'cw<di<internalID'  : (self.__single_field_func, 'internal-id-number'),
        }
        self.__token_dict = {
        'year______'        : 'year',
        'month_____'        : 'month',
        'day_______'        : 'day',
        'minute____'        : 'minute',
+        'second____'        : 'second',
        'revis-time'        : 'revision-time',
+        'create-tim'        : 'creation-time',
+        'edit-time_'        : 'editing-time',
+        'print-time'        : 'printing-time',
+        'backuptime'        : 'backup-time',
        'num-of-wor'        : 'number-of-words',
        'num-of-chr'        : 'number-of-characters',
+        'numofchrws'        : 'number-of-characters-without-space',
        'num-of-pag'        : 'number-of-pages',
+        'version___'        : 'version',
+        'intern-ver'        : 'internal-version-number',
+        'internalID'        : 'internal-id-number',
        }
+
    def __before_info_table_func(self, line):
        """
        Required:
@ -92,6 +118,7 @@ class Info:
        if self.__token_info == 'mi<mk<doc-in-beg':
            self.__state = 'in_info_table'
        self.__write_obj.write(line)
+
    def __in_info_table_func(self, line):
        """
        Requires:
@ -112,6 +139,7 @@ class Info:
                action(line, tag)
            else:
                self.__write_obj.write(line)
+
    def __found_tag_with_text_func(self, line, tag):
        """
        Requires:
@ -126,6 +154,7 @@ class Info:
        """
        self.__tag = tag
        self.__state = 'collect_text'
+
    def __collect_text_func(self, line):
        """
        Requires:
@ -139,6 +168,8 @@ class Info:
        """
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
+            #Don't print empty tags
+            if len(self.rmspace.sub('',self.__text_string)):
                self.__write_obj.write(
                    'mi<tg<open______<%s\n'
                    'tx<nu<__________<%s\n'
@ -147,6 +178,7 @@ class Info:
            self.__text_string = ''
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]
+
    def __found_tag_with_tokens_func(self, line, tag):
        """
        Requires:
@ -163,6 +195,7 @@ class Info:
        self.__state = 'collect_tokens'
        self.__text_string = 'mi<tg<empty-att_<%s' % tag
        #mi<tg<empty-att_<page-definition<margin>33\n
+
    def __collect_tokens_func(self, line):
        """
        Requires:
@ -194,18 +227,19 @@ class Info:
            att = line[6:16]
            value = line[20:-1]
            att_changed = self.__token_dict.get(att)
-            if att_changed == None:
+            if att_changed is None:
                if self.__run_level > 3:
-                    msg = 'no dictionary match for %s\n' % att
+                    msg = 'No dictionary match for %s\n' % att
                    raise self.__bug_handler, msg
            else:
                self.__text_string += '<%s>%s' % (att_changed, value)
+
    def __single_field_func(self, line, tag):
        value = line[20:-1]
        self.__write_obj.write(
-        'mi<tg<empty-att_<%s'
-        '<%s>%s\n' % (tag, tag, value)
+        'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
        )
+
    def __after_info_table_func(self, line):
        """
        Requires:
@ -217,6 +251,7 @@ class Info:
            the file.
        """
        self.__write_obj.write(line)
+
    def fix_info(self):
        """
        Requires:
@ -234,20 +269,15 @@ class Info:
            information table, simply write the line to the output file.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'wb') as self.__write_obj:
+                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module styles.py\n')
+                    if action is None:
+                        sys.stderr.write('No matching state in module styles.py\n')
                        sys.stderr.write(self.__state + '\n')
                    action(line)
-        read_obj.close()
-        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "info.data")
--- a/src/calibre/ebooks/rtf2xml/process_tokens.py
+++ b/src/calibre/ebooks/rtf2xml/process_tokens.py
@ -70,7 +70,7 @@ class ProcessTokens:
        ';'                  :	('mc', ';', self.ms_sub_func),
        # this must be wrong
        '-'                  :	('mc', '-', self.ms_sub_func),
-        'line'               :  ('mi', 'hardline-break', self.hardline_func), #calibre
+        'line'               :  ('mi', 'hardline-break', self.direct_conv_func), #calibre
        # misc => ml
        '*'                  :	('ml', 'asterisk__', self.default_func),
        ':'                  :	('ml', 'colon_____', self.default_func),
@ -78,7 +78,6 @@ class ProcessTokens:
        'backslash'          :	('nu', '\\', self.text_func),
        'ob'                 :	('nu', '{', self.text_func),
        'cb'                 :	('nu', '}', self.text_func),
-        #'line'               :  ('nu', ' ', self.text_func), calibre
        # paragraph formatting => pf
        'page'               :  ('pf', 'page-break', self.default_func),
        'par'                :	('pf', 'par-end___', self.default_func),
@ -231,11 +230,15 @@ class ProcessTokens:
        'trhdr'              :  ('tb', 'row-header', self.default_func),
        # preamble => pr
        # document information => di
+        # TODO integrate \userprops
        'info'               :	('di', 'doc-info__', self.default_func),
+        'title'              :	('di', 'title_____', self.default_func),
        'author'             :	('di', 'author____', self.default_func),
        'operator'           :	('di', 'operator__', self.default_func),
-        'title'              :	('di', 'title_____', self.default_func),
+        'manager'            :	('di', 'manager___', self.default_func),
+        'company'            :	('di', 'company___', self.default_func),
        'keywords'           :  ('di', 'keywords__', self.default_func),
+        'category'           :  ('di', 'category__', self.default_func),
        'doccomm'            :  ('di', 'doc-notes_', self.default_func),
        'comment'            :  ('di', 'doc-notes_', self.default_func),
        'subject'            :  ('di', 'subject___', self.default_func),
@ -244,11 +247,19 @@ class ProcessTokens:
        'mo'                 :	('di', 'month_____', self.default_func),
        'dy'                 :	('di', 'day_______', self.default_func),
        'min'                :	('di', 'minute____', self.default_func),
+        'sec'                :	('di', 'second____', self.default_func),
        'revtim'             :	('di', 'revis-time', self.default_func),
+        'edmins'             :	('di', 'edit-time_', self.default_func),
+        'printim'            :	('di', 'print-time', self.default_func),
+        'buptim'             :	('di', 'backuptime', self.default_func),
        'nofwords'           :	('di', 'num-of-wor', self.default_func),
        'nofchars'           :	('di', 'num-of-chr', self.default_func),
+        'nofcharsws'         :	('di', 'numofchrws', self.default_func),
        'nofpages'           :	('di', 'num-of-pag', self.default_func),
-        'edmins'             :	('di', 'edit-time_', self.default_func),
+        'version'            :	('di', 'version___', self.default_func),
+        'vern'               :	('di', 'intern-ver', self.default_func),
+        'hlinkbase'          :	('di', 'linkbase__', self.default_func),
+        'id'                 :	('di', 'internalID', self.default_func),
        # headers and footers => hf
        'headerf'            :	('hf', 'head-first', self.default_func),
        'headerl'            :	('hf', 'head-left_', self.default_func),
@ -605,7 +616,7 @@ class ProcessTokens:
    def ms_sub_func(self, pre, token, num):
        return 'tx<mc<__________<%s\n' % token

-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
        return 'mi<tg<empty_____<%s\n' % token

    def default_func(self, pre, token, num):
--- a/src/calibre/ebooks/rtf2xml/tokenize.py
+++ b/src/calibre/ebooks/rtf2xml/tokenize.py
@ -27,11 +27,13 @@ class Tokenize:
            bug_handler,
            copy = None,
            run_level = 1,
+            # out_file = None,
        ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
+        # self.__out_file = out_file
        self.__compile_expressions()
        #variables
        self.__uc_char = 0
@ -113,6 +115,8 @@ class Tokenize:

    def __sub_reg_split(self,input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub('\n\\par \n', input_file)
        input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
        #remove \n in bin data
@ -153,8 +157,6 @@ class Tokenize:
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "}": "\\}",
-            # this is for older RTF
-            r'\\$': '\\par ',
            }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        #add ;? in case of char following \u
@ -168,10 +170,12 @@ class Tokenize:
        #why keep backslash whereas \is replaced before?
        #remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
        #self.__remove_line = re.compile(r'\n+')
        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@ -199,7 +203,24 @@ class Tokenize:
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+            # self.__file = self.__out_file
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        
        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+    # if len(args) < 1:
+        # print 'No file'
+        # return
+    # file = 'data_tokens.txt'
+    # if len(args) == 3:
+        # file = args[2]
+    # to = Tokenize(args[1], Exception, out_file = file)
+    # to.tokenize()
+
+
+# if __name__ == '__main__':
+    # sys.exit(main())
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
                    log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

            # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
            txt = dehyphenator(txt,'txt', length)

            # We don't check for block because the processor assumes block.
@ -137,11 +137,6 @@ class TXTInput(InputFormatPlugin):
                setattr(options, 'format_scene_breaks', True)
                setattr(options, 'dehyphenate', True)

-        # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'txt_cleanup', length)
-        html = dehyphenator(html,'html_cleanup', length)
-
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
--- a/src/calibre/gui2/init.py
+++ b/src/calibre/gui2/init.py
@ -505,7 +505,7 @@ class FileDialog(QObject):
        self.selected_files = []
        if mode == QFileDialog.AnyFile:
            f = unicode(QFileDialog.getSaveFileName(parent, title, initial_dir, ftext, ""))
-            if f and os.path.exists(f):
+            if f:
                self.selected_files.append(f)
        elif mode == QFileDialog.ExistingFile:
            f = unicode(QFileDialog.getOpenFileName(parent, title, initial_dir, ftext, ""))
--- a/src/calibre/gui2/actions/catalog.py
+++ b/src/calibre/gui2/actions/catalog.py
@ -28,7 +28,7 @@ class GenerateCatalogAction(InterfaceAction):

        if not ids:
            return error_dialog(self.gui, _('No books selected'),
-                    _('No books selected to generate catalog for'),
+                    _('No books selected for catalog generation'),
                    show=True)

 		db = self.gui.library_view.model().db
@ -55,9 +55,9 @@ class GenerateCatalogAction(InterfaceAction):

    def catalog_generated(self, job):
        if job.result:
-            # Search terms nulled catalog results
-            return error_dialog(self.gui, _('No books found'),
-                    _("No books to catalog\nCheck job details"),
+            # Error during catalog generation
+            return error_dialog(self.gui, _('Catalog generation terminated'),
+                    job.result,
                    show=True)
        if job.failed:
            return self.gui.job_exception(job)
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@ -94,7 +94,7 @@ class EditMetadataAction(InterfaceAction):
            get_social_metadata = config['get_social_metadata']
        else:
            get_social_metadata = set_social_metadata
-        from calibre.gui2.metadata import DoDownload
+        from calibre.gui2.metadata.bulk_download import DoDownload
        if set_social_metadata is not None and set_social_metadata:
            x = _('social metadata')
        else:
--- a/src/calibre/gui2/metadata/init.py
+++ b/src/calibre/gui2/metadata/init.py
@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
--- a/src/calibre/gui2/metadata/bulk_download.py
+++ b/src/calibre/gui2/metadata/bulk_download.py
--- a/src/calibre/gui2/tag_view.py
+++ b/src/calibre/gui2/tag_view.py
@ -730,7 +730,7 @@ class TagsModel(QAbstractItemModel): # {{{
            else:
                collapse_model = 'partition'
                collapse_template = tweaks['categories_collapsed_popularity_template']
-        collapse_letter = None
+        collapse_letter = collapse_letter_sk = None

        for i, r in enumerate(self.row_map):
            if self.hidden_categories and self.categories[i] in self.hidden_categories:
@ -782,8 +782,17 @@ class TagsModel(QAbstractItemModel): # {{{
                        ts = tag.sort
                        if not ts:
                            ts = ' '
-                        if upper(ts[0]) != collapse_letter:
+                        try:
+                            sk = sort_key(ts)[0]
+                        except:
+                            sk = ts[0]
+
+                        if sk != collapse_letter_sk:
                            collapse_letter = upper(ts[0])
+                            try:
+                                collapse_letter_sk = sort_key(collapse_letter)[0]
+                            except:
+                                collapse_letter_sk = collapse_letter
                            sub_cat = TagTreeItem(parent=category,
                                     data = collapse_letter,
                                     category_icon = category_node.icon,
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@ -386,11 +386,13 @@ class LineEditECM(object):
        action_lower_case = case_menu.addAction(_('Lower Case'))
        action_swap_case = case_menu.addAction(_('Swap Case'))
        action_title_case = case_menu.addAction(_('Title Case'))
+        action_capitalize = case_menu.addAction(_('Capitalize'))

        self.connect(action_upper_case, SIGNAL('triggered()'), self.upper_case)
        self.connect(action_lower_case, SIGNAL('triggered()'), self.lower_case)
        self.connect(action_swap_case, SIGNAL('triggered()'), self.swap_case)
        self.connect(action_title_case, SIGNAL('triggered()'), self.title_case)
+        self.connect(action_capitalize, SIGNAL('triggered()'), self.capitalize)

        menu.addMenu(case_menu)
        menu.exec_(event.globalPos())
@ -408,6 +410,10 @@ class LineEditECM(object):
        from calibre.utils.titlecase import titlecase
        self.setText(titlecase(unicode(self.text())))

+    def capitalize(self):
+        from calibre.utils.icu import capitalize
+        self.setText(capitalize(unicode(self.text())))
+

 class EnLineEdit(LineEditECM, QLineEdit):

--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -1144,7 +1144,9 @@ class EPUB_MOBI(CatalogPlugin):
            def error(self):
                def fget(self):
                    return self.__error
-                return property(fget=fget)
+                def fset(self, val):
+                    self.__error = val
+                return property(fget=fget,fset=fset)
            @dynamic_property
            def generateForKindle(self):
                def fget(self):
@ -1411,6 +1413,88 @@ class EPUB_MOBI(CatalogPlugin):
                except:
                    pass

+        def fetchBooksByAuthor(self):
+            '''
+            Generate a list of titles sorted by author from the database
+            return = Success
+            '''
+
+            self.updateProgressFullStep("Sorting database")
+
+            '''
+            # Sort titles case-insensitive, by author
+            self.booksByAuthor = sorted(self.booksByTitle,
+                                 key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
+            '''
+
+            self.booksByAuthor = list(self.booksByTitle)
+            self.booksByAuthor.sort(self.author_compare)
+
+            if False and self.verbose:
+                self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
+                self.opts.log.info(" %-30s %-20s %s" % ('title', 'series', 'series_index'))
+                for title in self.booksByAuthor:
+                    self.opts.log.info((u" %-30s %-20s%5s " % \
+                                        (title['title'][:30],
+                                         title['series'][:20] if title['series'] else '',
+                                         title['series_index'],
+                                         )).encode('utf-8'))
+                raise SystemExit
+
+            # Build the unique_authors set from existing data
+            authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
+
+            # authors[] contains a list of all book authors, with multiple entries for multiple books by author
+            #        authors[]: (([0]:friendly  [1]:sort))
+            # unique_authors[]: (([0]:friendly  [1]:sort  [2]:book_count))
+            books_by_current_author = 0
+            current_author = authors[0]
+            multiple_authors = False
+            unique_authors = []
+            for (i,author) in enumerate(authors):
+                if author != current_author:
+                    # Note that current_author and author are tuples: (friendly, sort)
+                    multiple_authors = True
+
+                if author != current_author and i:
+                    # Warn, exit if friendly matches previous, but sort doesn't
+                    if author[0] == current_author[0]:
+                        error_msg = _('''
+\n*** Metadata error ***
+Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.
+Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
+then rebuild the catalog.\n''').format(author[0])
+
+                        self.opts.log.warn(error_msg)
+                        self.error = error_msg
+                        return False
+
+                    # New author, save the previous author/sort/count
+                    unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                           books_by_current_author))
+                    current_author = author
+                    books_by_current_author = 1
+                elif i==0 and len(authors) == 1:
+                    # Allow for single-book lists
+                    unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                           books_by_current_author))
+                else:
+                    books_by_current_author += 1
+            else:
+                # Add final author to list or single-author dataset
+                if (current_author == author and len(authors) > 1) or not multiple_authors:
+                    unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                           books_by_current_author))
+
+            if False and self.verbose:
+                self.opts.log.info("\nfetchBooksByauthor(): %d unique authors" % len(unique_authors))
+                for author in unique_authors:
+                    self.opts.log.info((u" %-50s %-25s %2d" % (author[0][0:45], author[1][0:20],
+                       author[2])).encode('utf-8'))
+
+            self.authors = unique_authors
+            return True
+
        def fetchBooksByTitle(self):

            self.updateProgressFullStep("Fetching database")
@ -1562,90 +1646,9 @@ class EPUB_MOBI(CatalogPlugin):
                                                               title['title_sort'][0:40])).decode('mac-roman'))
                return True
            else:
+                self.error = _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options.")
                return False

-        def fetchBooksByAuthor(self):
-            '''
-            Generate a list of titles sorted by author from the database
-            return = Success
-            '''
-
-            self.updateProgressFullStep("Sorting database")
-
-            '''
-            # Sort titles case-insensitive, by author
-            self.booksByAuthor = sorted(self.booksByTitle,
-                                 key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
-            '''
-
-            self.booksByAuthor = list(self.booksByTitle)
-            self.booksByAuthor.sort(self.author_compare)
-
-            if False and self.verbose:
-                self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
-                self.opts.log.info(" %-30s %-20s %s" % ('title', 'series', 'series_index'))
-                for title in self.booksByAuthor:
-                    self.opts.log.info((u" %-30s %-20s%5s " % \
-                                        (title['title'][:30],
-                                         title['series'][:20] if title['series'] else '',
-                                         title['series_index'],
-                                         )).encode('utf-8'))
-                raise SystemExit
-
-            # Build the unique_authors set from existing data
-            authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
-
-            # authors[] contains a list of all book authors, with multiple entries for multiple books by author
-            #        authors[]: (([0]:friendly  [1]:sort))
-            # unique_authors[]: (([0]:friendly  [1]:sort  [2]:book_count))
-            books_by_current_author = 0
-            current_author = authors[0]
-            multiple_authors = False
-            unique_authors = []
-            for (i,author) in enumerate(authors):
-                if author != current_author:
-                    # Note that current_author and author are tuples: (friendly, sort)
-                    multiple_authors = True
-
-                if author != current_author and i:
-                    # Warn, exit if friendly matches previous, but sort doesn't
-                    if author[0] == current_author[0]:
-                        error_msg = _('''
-\n*** Metadata error ***
-Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.
-Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
-then rebuild the catalog.
-*** Terminating catalog generation ***\n''').format(author[0])
-
-                        self.opts.log.warn(error_msg)
-                        return False
-
-                    # New author, save the previous author/sort/count
-                    unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                           books_by_current_author))
-                    current_author = author
-                    books_by_current_author = 1
-                elif i==0 and len(authors) == 1:
-                    # Allow for single-book lists
-                    unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                           books_by_current_author))
-                else:
-                    books_by_current_author += 1
-            else:
-                # Add final author to list or single-author dataset
-                if (current_author == author and len(authors) > 1) or not multiple_authors:
-                    unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                           books_by_current_author))
-
-            if False and self.verbose:
-                self.opts.log.info("\nfetchBooksByauthor(): %d unique authors" % len(unique_authors))
-                for author in unique_authors:
-                    self.opts.log.info((u" %-50s %-25s %2d" % (author[0][0:45], author[1][0:20],
-                       author[2])).encode('utf-8'))
-
-            self.authors = unique_authors
-            return True
-
        def fetchBookmarks(self):
            '''
            Collect bookmarks for catalog entries
@ -5069,6 +5072,8 @@ then rebuild the catalog.
                            abort_after_input_dump=False)
            plumber.merge_ui_recommendations(recommendations)
            plumber.run()
-            return 0
+            # returns to gui2.actions.catalog:catalog_generated()
+            return None
        else:
-            return 1
+            # returns to gui2.actions.catalog:catalog_generated()
+            return catalog.error
--- a/src/calibre/library/cli.py
+++ b/src/calibre/library/cli.py
@ -693,8 +693,12 @@ def command_catalog(args, dbpath):
                            }

    with plugin:
-        plugin.run(args[1], opts, get_db(dbpath, opts))
-    return 0
+        ret = plugin.run(args[1], opts, get_db(dbpath, opts))
+    if ret is None:
+        ret = 0
+    else:
+        ret = 1
+    return ret

 # end of GR additions

--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -690,11 +690,14 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        mi = Metadata(None)

        aut_list = row[fm['au_map']]
-        aut_list = [p.split(':::') for p in aut_list.split(':#:')]
+        if aut_list:
+            aut_list = [p.split(':::') for p in aut_list.split(':#:') if p]
+        else:
+            aut_list = []
        aum = []
        aus = {}
        for (author, author_sort) in aut_list:
-            aum.append(author)
+            aum.append(author.replace('|', ','))
            aus[author] = author_sort.replace('|', ',')
        mi.title       = row[fm['title']]
        mi.authors     = aum
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -437,6 +437,15 @@ My antivirus program claims |app| is a virus/trojan?

 Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.

+How do I back up |app|?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The most important thing to back up is the |app| library folder, which contains all your books and metadata. This is the folder you chose for your |app| library when you ran |app| for the first time. You can get the path to the library folder by clicking the |app| icon on the main toolbar. You must back up this complete folder with all its files and sub-folders.
+
+You can switch |app| to using a backed-up library folder by simply clicking the |app| icon on the toolbar and choosing your backup library folder.
+
+If you want to back up the |app| configuration/plugins, you have to back up the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore.
+
 How do I use purchased EPUB books with |app|?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Most purchased EPUB books have `DRM <http://wiki.mobileread.com/wiki/DRM>`_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your e-book reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" e-book. The e-book file will be stored in the folder "My Digital Editions", from where you can add it to |app|.
--- a/src/calibre/utils/formatter_functions.py
+++ b/src/calibre/utils/formatter_functions.py
@ -77,7 +77,7 @@ class FormatterFunction(object):
                                        exc_traceback)[-2:]).replace('\n', '')
            return _('Exception ' + info)

-
+all_builtin_functions = []
 class BuiltinFormatterFunction(FormatterFunction):
    def __init__(self):
        formatter_functions.register_builtin(self)
@ -88,6 +88,7 @@ class BuiltinFormatterFunction(FormatterFunction):
        except:
            lines = []
        self.program_text = ''.join(lines)
+        all_builtin_functions.append(self)

 class BuiltinStrcmp(BuiltinFormatterFunction):
    name = 'strcmp'
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@ -80,7 +80,7 @@ def icu_case_sensitive_strcmp(collator, a, b):

 def icu_capitalize(s):
    s = lower(s)
-    return s.replace(s[0], upper(s[0]), 1)
+    return s.replace(s[0], upper(s[0]), 1) if s else s

 load_icu()
 load_collator()