Sync to ldolse heuristics branch.

commit 51e7a555e1

resources/recipes/mail_and_guardian.recipe | 32 (new file)
--- /dev/null
+++ b/resources/recipes/mail_and_guardian.recipe
@@ -0,0 +1,32 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295081935(BasicNewsRecipe):
+    title          = u'Mail & Guardian ZA News'
+    __author__     = '77ja65'
+    language       = 'en'
+    oldest_article = 7
+    max_articles_per_feed = 30
+    no_stylesheets = True
+    masthead_url   = 'http://c1608832.cdn.cloudfiles.rackspacecloud.com/mg_logo.gif'
+    remove_tags_after = [dict(id='content')]
+
+    feeds = [
+        (u'National News', u'http://www.mg.co.za/rss/national'),
+        (u'Top Stories', u'http://www.mg.co.za/rss'),
+        (u'Africa News', u'http://www.mg.co.za/rss/africa'),
+        (u'Sport', u'http://www.mg.co.za/rss/sport'),
+        (u'Business', u'http://www.mg.co.za/rss/business'),
+        (u'And In Other News', u'http://www.mg.co.za/rss/and-in-other-news'),
+        (u'World News', u'http://www.mg.co.za/rss/world')
+    ]
+
+    def print_version(self, url):
+        return url.replace('http://www.mg.co.za/article/',
+                'http://www.mg.co.za/printformat/single/')
+
+    extra_css = '''
+        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+    '''
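
A quick illustration of what the recipe's print_version hook does; the article slug below is made up for the example (Python 2, as in the codebase of the time):

    url = 'http://www.mg.co.za/article/2011-01-17-example-story'
    print url.replace('http://www.mg.co.za/article/',
            'http://www.mg.co.za/printformat/single/')
    # http://www.mg.co.za/printformat/single/2011-01-17-example-story
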
Sports Illustrated recipe:
@@ -1,5 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from urllib import quote
 
 class SportsIllustratedRecipe(BasicNewsRecipe) :
@@ -91,7 +91,7 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
         # expire : no idea what value to use
         # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js
 
-    def preprocess_html(self, soup):
+    '''def preprocess_html(self, soup):
         header = soup.find('div', attrs = {'class' : 'siv_artheader'})
         homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
         body = homeMadeSoup.body
@@ -115,4 +115,5 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
             body.append(para)
 
         return homeMadeSoup
+        '''
 
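
Note on the change above: wrapping the whole method in a triple-quoted string turns it into an inert string literal, so the class no longer defines preprocess_html at all. A minimal sketch of the same trick (hypothetical class):

    class Demo(object):
        '''def preprocess_html(self, soup):
            ...body kept for reference, never executed...
        '''

    print hasattr(Demo(), 'preprocess_html')   # False
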
resources/template-functions.json | 28 (new file)

--- /dev/null
+++ b/resources/template-functions.json
@@ -0,0 +1,28 @@
+{
+    "contains": "def evaluate(self, formatter, kwargs, mi, locals,\n val, test, value_if_present, value_if_not):\n if re.search(test, val):\n return value_if_present\n else:\n return value_if_not\n",
+    "divide": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x / y)\n",
+    "uppercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.upper()\n",
+    "strcat": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n i = 0\n res = ''\n for i in range(0, len(args)):\n res += args[i]\n return res\n",
+    "substr": "def evaluate(self, formatter, kwargs, mi, locals, str_, start_, end_):\n return str_[int(start_): len(str_) if int(end_) == 0 else int(end_)]\n",
+    "ifempty": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_empty):\n if val:\n return val\n else:\n return value_if_empty\n",
+    "field": "def evaluate(self, formatter, kwargs, mi, locals, name):\n return formatter.get_value(name, [], kwargs)\n",
+    "capitalize": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return capitalize(val)\n",
+    "list_item": "def evaluate(self, formatter, kwargs, mi, locals, val, index, sep):\n if not val:\n return ''\n index = int(index)\n val = val.split(sep)\n try:\n return val[index]\n except:\n return ''\n",
+    "shorten": "def evaluate(self, formatter, kwargs, mi, locals,\n val, leading, center_string, trailing):\n l = max(0, int(leading))\n t = max(0, int(trailing))\n if len(val) > l + len(center_string) + t:\n return val[0:l] + center_string + ('' if t == 0 else val[-t:])\n else:\n return val\n",
+    "re": "def evaluate(self, formatter, kwargs, mi, locals, val, pattern, replacement):\n return re.sub(pattern, replacement, val)\n",
+    "add": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x + y)\n",
+    "lookup": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if len(args) == 2: # here for backwards compatibility\n if val:\n return formatter.vformat('{'+args[0].strip()+'}', [], kwargs)\n else:\n return formatter.vformat('{'+args[1].strip()+'}', [], kwargs)\n if (len(args) % 2) != 1:\n raise ValueError(_('lookup requires either 2 or an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return formatter.vformat('{' + args[i].strip() + '}', [], kwargs)\n if re.search(args[i], val):\n return formatter.vformat('{'+args[i+1].strip() + '}', [], kwargs)\n i += 2\n",
+    "template": "def evaluate(self, formatter, kwargs, mi, locals, template):\n template = template.replace('[[', '{').replace(']]', '}')\n return formatter.safe_format(template, kwargs, 'TEMPLATE', mi)\n",
+    "print": "def evaluate(self, formatter, kwargs, mi, locals, *args):\n print args\n return None\n",
+    "titlecase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return titlecase(val)\n",
+    "test": "def evaluate(self, formatter, kwargs, mi, locals, val, value_if_set, value_not_set):\n if val:\n return value_if_set\n else:\n return value_not_set\n",
+    "eval": "def evaluate(self, formatter, kwargs, mi, locals, template):\n from formatter import eval_formatter\n template = template.replace('[[', '{').replace(']]', '}')\n return eval_formatter.safe_format(template, locals, 'EVAL', None)\n",
+    "multiply": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x * y)\n",
+    "subtract": "def evaluate(self, formatter, kwargs, mi, locals, x, y):\n x = float(x if x else 0)\n y = float(y if y else 0)\n return unicode(x - y)\n",
+    "count": "def evaluate(self, formatter, kwargs, mi, locals, val, sep):\n return unicode(len(val.split(sep)))\n",
+    "lowercase": "def evaluate(self, formatter, kwargs, mi, locals, val):\n return val.lower()\n",
+    "assign": "def evaluate(self, formatter, kwargs, mi, locals, target, value):\n locals[target] = value\n return value\n",
+    "switch": "def evaluate(self, formatter, kwargs, mi, locals, val, *args):\n if (len(args) % 2) != 1:\n raise ValueError(_('switch requires an odd number of arguments'))\n i = 0\n while i < len(args):\n if i + 1 >= len(args):\n return args[i]\n if re.search(args[i], val):\n return args[i+1]\n i += 2\n",
+    "strcmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n v = strcmp(x, y)\n if v < 0:\n return lt\n if v == 0:\n return eq\n return gt\n",
+    "cmp": "def evaluate(self, formatter, kwargs, mi, locals, x, y, lt, eq, gt):\n x = float(x if x else 0)\n y = float(y if y else 0)\n if x < y:\n return lt\n if x == y:\n return eq\n return gt\n"
+}
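
Each value in this file is the dedented source of a builtin template function's evaluate() method, generated by the setup step in the next hunk. A sketch (Python 2; the file path is illustrative) of loading one entry and calling it standalone:

    import json, re

    funcs = json.load(open('resources/template-functions.json'))
    ns = {'re': re}                    # re is needed by some of the entries
    exec funcs['count'] in ns          # defines evaluate() in ns
    print ns['evaluate'](None, None, None, None, None, 'a,b,c', ',')   # u'3'
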
Resources command (setup):
@@ -84,6 +84,23 @@ class Resources(Command):
 
         cPickle.dump(complete, open(dest, 'wb'), -1)
 
+        self.info('\tCreating template-functions.json')
+        dest = self.j(self.RESOURCES, 'template-functions.json')
+        function_dict = {}
+        import inspect
+        from calibre.utils.formatter_functions import all_builtin_functions
+        for obj in all_builtin_functions:
+            eval_func = inspect.getmembers(obj,
+                    lambda x: inspect.ismethod(x) and x.__name__ == 'evaluate')
+            try:
+                lines = [l[4:] for l in inspect.getsourcelines(eval_func[0][1])[0]]
+            except:
+                continue
+            lines = ''.join(lines)
+            function_dict[obj.name] = lines
+        import json
+        json.dump(function_dict, open(dest, 'wb'), indent=4)
+
     def clean(self):
         for x in ('scripts', 'recipes', 'ebook-convert-complete'):
             x = self.j(self.RESOURCES, x+'.pickle')
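
The l[4:] slice above strips the one level of class-body indentation from each source line, so the stored string is a top-level def. A self-contained sketch (Python 2; run as a script so inspect can read the source; the Upper class is made up):

    import inspect

    class Upper(object):
        name = 'uppercase'
        def evaluate(self, formatter, kwargs, mi, locals, val):
            return val.upper()

    lines = [l[4:] for l in inspect.getsourcelines(Upper.evaluate)[0]]
    print ''.join(lines)
    # def evaluate(self, formatter, kwargs, mi, locals, val):
    #     return val.upper()
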
SNE device driver:
@@ -33,6 +33,6 @@ class SNE(USBMS):
     STORAGE_CARD_VOLUME_LABEL = 'SNE Storage Card'
 
     EBOOK_DIR_MAIN = EBOOK_DIR_CARD_A = 'Books'
-    SUPPORTS_SUB_DIRS = True
+    SUPPORTS_SUB_DIRS = False
 
 
Conversion preprocessing (Dehyphenator, HTMLPreProcessor):
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''
 
-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returned hyphenated word: " + str(hyphenated))
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log(" Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                return firsthalf+u'\u2014'+wraptags+secondhalf
 
        else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log(" returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            else:
-                #print " returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log(" returned hyphenated word: " + str(hyphenated))
                return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
@@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
 
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
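
The reworked suffix/prefix handling above is easier to see in isolation. A simplified standalone sketch (Python 2; much shorter word lists than the real regexes):

    import re

    suffixes = re.compile(r"^(ing|ed|ly|s)$", re.IGNORECASE)
    removesuffixes = re.compile(r"(ing|ed|ly|s)$", re.IGNORECASE)
    prefixes = re.compile(r"^(dis|re|un|in|ex)$", re.IGNORECASE)
    removeprefix = re.compile(r"^(dis|re|un|in|ex)", re.IGNORECASE)

    def lookup_word(firsthalf, secondhalf):
        dehyphenated = firsthalf + secondhalf
        # strip a suffix only if the line break was not already at the suffix
        if suffixes.match(secondhalf) is None:
            lookupword = removesuffixes.sub('', dehyphenated)
        else:
            lookupword = dehyphenated
        # strip a prefix only if the break was not right after it
        if len(firsthalf) > 3 and prefixes.match(firsthalf) is None:
            lookupword = removeprefix.sub('', lookupword)
        return lookupword

    print lookup_word('cele', 'brated')   # 'celebrat': stem searched in the text
    print lookup_word('unwrapp', 'ed')    # 'wrapped': suffix kept, prefix stripped
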
Heuristic processing (HeuristicProcessor):
@@ -322,11 +322,11 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         self.deleted_nbsps = True
         return html
 
@@ -376,27 +376,31 @@ class HeuristicProcessor(object):
         except:
             self.log("Can't get wordcount")
 
-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # markup using text processing
-            html = self.markup_pre(html)
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
@@ -420,26 +424,26 @@ class HeuristicProcessor(object):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+
         ###### Unwrap lines ######
         if getattr(self.extra_opts, 'unwrap_lines', False):
-            # Determine line ending type
-            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-            # that lines can be un-wrapped across page boundaries
-            format = self.analyze_line_endings(html)
-
-            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-            # more of the lines break in the same region of the document then unwrapping is required
-            docanalysis = DocAnalysis(format, html)
-            hardbreaks = docanalysis.line_histogram(.50)
-            self.log("Hard line breaks check returned "+unicode(hardbreaks))
-
-            # Calculate Length
-            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-            length = docanalysis.line_length(unwrap_factor)
-            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
                 self.log("Unwrapping required, unwrapping Lines")
@@ -447,15 +451,16 @@ class HeuristicProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html', length)
                 html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                 html = dehyphenator(html,'html_cleanup', length)
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
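
DocAnalysis.line_histogram itself is not shown in this diff; as a rough sketch of the idea the check relies on (my simplification, not calibre's implementation), hard line breaks show up as most line lengths clustering near one column:

    def line_histogram(lines, percent=0.50, buckets=20):
        lengths = [len(l) for l in lines if l.strip()]
        if not lengths:
            return False
        maxlen = max(lengths)
        counts = [0] * buckets
        for n in lengths:
            counts[min(n * buckets // maxlen, buckets - 1)] += 1
        # hard line breaks produce one dominant bucket
        return max(counts) >= percent * len(lengths)

    wrapped = ['x' * 60] * 9 + ['x' * 12]
    print line_histogram(wrapped)   # True: most lines wrap near the same column
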
RTF input plugin (RTFInput):
@@ -285,7 +285,6 @@ class RTFInput(InputFormatPlugin):
         try:
             xml = self.generate_xml(stream.name)
         except RtfInvalidCodeException, e:
-            raise
             raise ValueError(_('This RTF file has a feature calibre does not '
                 'support. Convert it to HTML first and then try it.\n%s')%e)
 
rtf2xml ParseRtf:
@@ -226,10 +226,6 @@ class ParseRtf:
         try:
             return_value = process_tokens_obj.process_tokens()
         except InvalidRtfException, msg:
-            try:
-                os.remove(self.__temp_file)
-            except OSError:
-                pass
             #Check to see if the file is correctly encoded
             encode_obj = default_encoding.DefaultEncoding(
                 in_file = self.__temp_file,
@@ -241,14 +237,17 @@ class ParseRtf:
             check_encoding_obj = check_encoding.CheckEncoding(
                 bug_handler = RtfInvalidCodeException,
                 )
-            enc = encode_obj.get_codepage()
-            if enc != 'mac_roman':
-                enc = 'cp' + enc
+            enc = 'cp' + encode_obj.get_codepage()
+            msg = 'Exception in token processing'
             if check_encoding_obj.check_encoding(self.__file, enc):
                 file_name = self.__file if isinstance(self.__file, str) \
                     else self.__file.encode('utf-8')
                 msg = 'File %s does not appear to be correctly encoded.\n' % file_name
-            raise InvalidRtfException, msg
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
+            raise InvalidRtfException, msg
         delete_info_obj = delete_info.DeleteInfo(
             in_file = self.__temp_file,
             copy = self.__copy,
rtf2xml DefaultEncoding:
@@ -74,9 +74,6 @@ class DefaultEncoding:
         if not self.__datafetched:
             self._encoding()
             self.__datafetched = True
-        if self.__platform == 'Macintosh':
-            code_page = self.__code_page
-        else:
-            code_page = 'ansicpg' + self.__code_page
+        code_page = 'ansicpg' + self.__code_page
         return self.__platform, code_page, self.__default_num
 
@@ -94,49 +91,60 @@ class DefaultEncoding:
 
     def _encoding(self):
        with open(self.__file, 'r') as read_obj:
+            cpfound = False
            if not self.__fetchraw:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'mi<mk<rtfhed-end':
                        break
-                    if self.__token_info == 'cw<ri<ansi-codpg':
-                        #cw<ri<ansi-codpg<nu<10000
-                        self.__code_page = line[20:-1] if int(line[20:-1]) \
-                            else '1252'
                    if self.__token_info == 'cw<ri<macintosh_':
                        self.__platform = 'Macintosh'
-                        self.__code_page = 'mac_roman'
                    elif self.__token_info == 'cw<ri<pc________':
                        self.__platform = 'IBMPC'
-                        self.__code_page = '437'
                    elif self.__token_info == 'cw<ri<pca_______':
                        self.__platform = 'OS/2'
-                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<ansi-codpg' \
+                        and int(line[20:-1]):
+                        self.__code_page = line[20:-1]
                    if self.__token_info == 'cw<ri<deflt-font':
                        self.__default_num = line[20:-1]
+                        cpfound = True
                        #cw<ri<deflt-font<nu<0
+                if self.__platform != 'Windows' and \
+                    not cpfound:
+                    if self.__platform == 'Macintosh':
+                        self.__code_page = '10000'
+                    elif self.__platform == 'IBMPC':
+                        self.__code_page = '437'
+                    elif self.__platform == 'OS/2':
+                        self.__code_page = '850'
            else:
                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+
                for line in read_obj:
+                    if fenc.search(line):
+                        enc = fenc.search(line).group(1)
                    if fenccp.search(line):
                        cp = fenccp.search(line).group(1)
                        if not int(cp):
                            self.__code_page = cp
+                            cpfound = True
                            break
-                    if fenc.search(line):
-                        enc = fenc.search(line).group(1)
+                if self.__platform != 'Windows' and \
+                    not cpfound:
                    if enc == 'mac':
-                        self.__code_page = 'mac_roman'
+                        self.__code_page = '10000'
                    elif enc == 'pc':
                        self.__code_page = '437'
                    elif enc == 'pca':
                        self.__code_page = '850'
 
-# if __name__ == '__main__':
-#     encode_obj = DefaultEncoding(
-#         in_file = sys.argv[1],
-#         bug_handler = Exception,
-#         check_raw = True,
-#         )
-#     print encode_obj.get_codepage()
+if __name__ == '__main__':
+    import sys
+    encode_obj = DefaultEncoding(
+        in_file = sys.argv[1],
+        bug_handler = Exception,
+        check_raw = True,
+        )
+    print encode_obj.get_codepage()
|
|||||||
from calibre.ebooks.rtf2xml import copy
|
from calibre.ebooks.rtf2xml import copy
|
||||||
|
|
||||||
class DeleteInfo:
|
class DeleteInfo:
|
||||||
"""Delelet unecessary destination groups"""
|
"""Delete unecessary destination groups"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
in_file ,
|
in_file ,
|
||||||
bug_handler,
|
bug_handler,
|
||||||
@ -31,17 +31,14 @@ class DeleteInfo:
|
|||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
self.__run_level = run_level
|
||||||
|
self.__initiate_allow()
|
||||||
self.__bracket_count= 0
|
self.__bracket_count= 0
|
||||||
self.__ob_count = 0
|
self.__ob_count = 0
|
||||||
self.__cb_count = 0
|
self.__cb_count = 0
|
||||||
# self.__after_asterisk = False
|
|
||||||
# self.__delete = 0
|
|
||||||
self.__initiate_allow()
|
|
||||||
self.__ob = 0
|
self.__ob = 0
|
||||||
self.__write_cb = False
|
self.__write_cb = False
|
||||||
self.__run_level = run_level
|
|
||||||
self.__found_delete = False
|
self.__found_delete = False
|
||||||
# self.__list = False
|
|
||||||
|
|
||||||
def __initiate_allow(self):
|
def __initiate_allow(self):
|
||||||
"""
|
"""
|
||||||
@ -57,6 +54,8 @@ class DeleteInfo:
|
|||||||
'cw<an<annotation',
|
'cw<an<annotation',
|
||||||
'cw<cm<comment___',
|
'cw<cm<comment___',
|
||||||
'cw<it<lovr-table',
|
'cw<it<lovr-table',
|
||||||
|
# info table
|
||||||
|
'cw<di<company___',
|
||||||
# 'cw<ls<list______',
|
# 'cw<ls<list______',
|
||||||
)
|
)
|
||||||
self.__not_allowable = (
|
self.__not_allowable = (
|
||||||
@ -116,7 +115,6 @@ class DeleteInfo:
|
|||||||
"""
|
"""
|
||||||
# Test for {\*}, in which case don't enter
|
# Test for {\*}, in which case don't enter
|
||||||
# delete state
|
# delete state
|
||||||
# self.__after_asterisk = False # only enter this function once
|
|
||||||
self.__found_delete = True
|
self.__found_delete = True
|
||||||
if self.__token_info == 'cb<nu<clos-brack':
|
if self.__token_info == 'cb<nu<clos-brack':
|
||||||
if self.__delete_count == self.__cb_count:
|
if self.__delete_count == self.__cb_count:
|
||||||
@ -128,7 +126,7 @@ class DeleteInfo:
|
|||||||
# not sure what happens here!
|
# not sure what happens here!
|
||||||
# believe I have a '{\*}
|
# believe I have a '{\*}
|
||||||
if self.__run_level > 3:
|
if self.__run_level > 3:
|
||||||
msg = 'flag problem\n'
|
msg = 'Flag problem\n'
|
||||||
raise self.__bug_handler, msg
|
raise self.__bug_handler, msg
|
||||||
return True
|
return True
|
||||||
elif self.__token_info in self.__allowable :
|
elif self.__token_info in self.__allowable :
|
||||||
@ -173,8 +171,8 @@ class DeleteInfo:
|
|||||||
Return True for all control words.
|
Return True for all control words.
|
||||||
Return False otherwise.
|
Return False otherwise.
|
||||||
"""
|
"""
|
||||||
if self.__delete_count == self.__cb_count and self.__token_info ==\
|
if self.__delete_count == self.__cb_count and \
|
||||||
'cb<nu<clos-brack':
|
self.__token_info == 'cb<nu<clos-brack':
|
||||||
self.__state = 'default'
|
self.__state = 'default'
|
||||||
if self.__write_cb:
|
if self.__write_cb:
|
||||||
self.__write_cb = False
|
self.__write_cb = False
|
||||||
@ -186,32 +184,24 @@ class DeleteInfo:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def delete_info(self):
|
def delete_info(self):
|
||||||
"""Main method for handling other methods. Read one line in at
|
"""Main method for handling other methods. Read one line at
|
||||||
a time, and determine whether to print the line based on the state."""
|
a time, and determine whether to print the line based on the state."""
|
||||||
with open(self.__file, 'r') as read_obj:
|
with open(self.__file, 'r') as read_obj:
|
||||||
with open(self.__write_to, 'w') as self.__write_obj:
|
with open(self.__write_to, 'w') as self.__write_obj:
|
||||||
for line in read_obj:
|
for line in read_obj:
|
||||||
#ob<nu<open-brack<0001
|
#ob<nu<open-brack<0001
|
||||||
to_print = True
|
|
||||||
self.__token_info = line[:16]
|
self.__token_info = line[:16]
|
||||||
if self.__token_info == 'ob<nu<open-brack':
|
if self.__token_info == 'ob<nu<open-brack':
|
||||||
self.__ob_count = line[-5:-1]
|
self.__ob_count = line[-5:-1]
|
||||||
if self.__token_info == 'cb<nu<clos-brack':
|
if self.__token_info == 'cb<nu<clos-brack':
|
||||||
self.__cb_count = line[-5:-1]
|
self.__cb_count = line[-5:-1]
|
||||||
|
# Get action to perform
|
||||||
action = self.__state_dict.get(self.__state)
|
action = self.__state_dict.get(self.__state)
|
||||||
if not action:
|
if not action:
|
||||||
sys.stderr.write(_('No action in dictionary state is "%s" \n')
|
sys.stderr.write('No action in dictionary state is "%s" \n'
|
||||||
% self.__state)
|
% self.__state)
|
||||||
to_print = action(line)
|
# Print if allowed by action
|
||||||
# if self.__after_asterisk:
|
if action(line):
|
||||||
# to_print = self.__asterisk_func(line)
|
|
||||||
# elif self.__list:
|
|
||||||
# self.__in_list_func(line)
|
|
||||||
# elif self.__delete:
|
|
||||||
# to_print = self.__delete_func(line)
|
|
||||||
# else:
|
|
||||||
# to_print = self.__default_func(line)
|
|
||||||
if to_print:
|
|
||||||
self.__write_obj.write(line)
|
self.__write_obj.write(line)
|
||||||
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
copy_obj = copy.Copy(bug_handler = self.__bug_handler)
|
||||||
if self.__copy:
|
if self.__copy:
|
||||||
|
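
The rewritten loop folds the old to_print flag into the handler's return value. A minimal sketch of the same state-dispatch pattern (hypothetical handlers):

    state_dict = {
        'default': lambda line: True,     # copy the line through
        'delete':  lambda line: False,    # swallow it
    }
    state = 'default'
    out = []
    for line in ['keep me\n', 'and me\n']:
        action = state_dict.get(state)
        if action(line):                  # print only if the handler allows it
            out.append(line)
    print ''.join(out),
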
rtf2xml Info:
@@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import sys, os, tempfile
+import sys, os, tempfile, re
+
 from calibre.ebooks.rtf2xml import copy
+
 class Info:
     """
     Make tags for document-information
@@ -42,12 +44,14 @@ class Info:
         self.__copy = copy
         self.__run_level = run_level
         self.__write_to = tempfile.mktemp()
+
     def __initiate_values(self):
         """
         Initiate all values.
         """
         self.__text_string = ''
         self.__state = 'before_info_table'
+        self.rmspace = re.compile(r'\s+')
         self.__state_dict = {
             'before_info_table': self.__before_info_table_func,
             'after_info_table': self.__after_info_table_func,
@@ -58,27 +62,49 @@ class Info:
         self.__info_table_dict = {
             'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
             'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
+            'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
+            'cw<di<manager___' : (self.__found_tag_with_text_func, 'manager'),
+            'cw<di<company___' : (self.__found_tag_with_text_func, 'company'),
             'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
+            'cw<di<category__' : (self.__found_tag_with_text_func, 'category'),
             'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
             'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
-            'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
+            'cw<di<linkbase__' : (self.__found_tag_with_text_func, 'hyperlink-base'),
+
             'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
             'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
-            'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'),
+            'cw<di<edit-time_' : (self.__found_tag_with_tokens_func, 'editing-time'),
+            'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'),
+            'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'),
+
             'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
             'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
+            'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'),
             'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
+            'cw<di<version___' : (self.__single_field_func, 'version'),
+            'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'),
+            'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'),
             }
         self.__token_dict = {
             'year______' : 'year',
             'month_____' : 'month',
             'day_______' : 'day',
             'minute____' : 'minute',
+            'second____' : 'second',
             'revis-time' : 'revision-time',
+            'create-tim' : 'creation-time',
+            'edit-time_' : 'editing-time',
+            'print-time' : 'printing-time',
+            'backuptime' : 'backup-time',
             'num-of-wor' : 'number-of-words',
             'num-of-chr' : 'number-of-characters',
+            'numofchrws' : 'number-of-characters-without-space',
             'num-of-pag' : 'number-of-pages',
+            'version___' : 'version',
+            'intern-ver' : 'internal-version-number',
+            'internalID' : 'internal-id-number',
             }
 
     def __before_info_table_func(self, line):
         """
         Required:
@@ -92,6 +118,7 @@ class Info:
         if self.__token_info == 'mi<mk<doc-in-beg':
             self.__state = 'in_info_table'
         self.__write_obj.write(line)
+
     def __in_info_table_func(self, line):
         """
         Requires:
@@ -112,6 +139,7 @@ class Info:
             action(line, tag)
         else:
             self.__write_obj.write(line)
+
     def __found_tag_with_text_func(self, line, tag):
         """
         Requires:
@@ -126,6 +154,7 @@ class Info:
         """
         self.__tag = tag
         self.__state = 'collect_text'
+
     def __collect_text_func(self, line):
         """
         Requires:
@@ -139,14 +168,17 @@ class Info:
         """
         if self.__token_info == 'mi<mk<docinf-end':
             self.__state = 'in_info_table'
-            self.__write_obj.write(
-            'mi<tg<open______<%s\n'
-            'tx<nu<__________<%s\n'
-            'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
-            )
+            #Don't print empty tags
+            if len(self.rmspace.sub('',self.__text_string)):
+                self.__write_obj.write(
+                'mi<tg<open______<%s\n'
+                'tx<nu<__________<%s\n'
+                'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
+                )
             self.__text_string = ''
         elif line[0:2] == 'tx':
             self.__text_string += line[17:-1]
+
     def __found_tag_with_tokens_func(self, line, tag):
         """
         Requires:
@@ -163,6 +195,7 @@ class Info:
         self.__state = 'collect_tokens'
         self.__text_string = 'mi<tg<empty-att_<%s' % tag
         #mi<tg<empty-att_<page-definition<margin>33\n
+
     def __collect_tokens_func(self, line):
         """
         Requires:
@@ -194,18 +227,19 @@ class Info:
         att = line[6:16]
         value = line[20:-1]
         att_changed = self.__token_dict.get(att)
-        if att_changed == None:
+        if att_changed is None:
             if self.__run_level > 3:
-                msg = 'no dictionary match for %s\n' % att
+                msg = 'No dictionary match for %s\n' % att
                 raise self.__bug_handler, msg
         else:
             self.__text_string += '<%s>%s' % (att_changed, value)
+
     def __single_field_func(self, line, tag):
         value = line[20:-1]
         self.__write_obj.write(
-            'mi<tg<empty-att_<%s'
-            '<%s>%s\n' % (tag, tag, value)
+            'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
             )
+
     def __after_info_table_func(self, line):
         """
         Requires:
@@ -217,6 +251,7 @@ class Info:
         the file.
         """
         self.__write_obj.write(line)
+
     def fix_info(self):
         """
         Requires:
@@ -234,20 +269,15 @@ class Info:
         information table, simply write the line to the output file.
         """
         self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('no no matching state in module styles.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'wb') as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module styles.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "info.data")
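
The new rmspace guard above skips writing a tag when the collected text is only whitespace; a trivial check (Python 2):

    import re
    rmspace = re.compile(r'\s+')
    for text in ['Calibre', '   ', '']:
        print bool(len(rmspace.sub('', text)))   # True, False, False
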
rtf2xml ProcessTokens:
@@ -70,7 +70,7 @@ class ProcessTokens:
         ';' : ('mc', ';', self.ms_sub_func),
         # this must be wrong
         '-' : ('mc', '-', self.ms_sub_func),
-        'line' : ('mi', 'hardline-break', self.hardline_func), #calibre
+        'line' : ('mi', 'hardline-break', self.direct_conv_func), #calibre
         # misc => ml
         '*' : ('ml', 'asterisk__', self.default_func),
         ':' : ('ml', 'colon_____', self.default_func),
@@ -78,7 +78,6 @@ class ProcessTokens:
         'backslash' : ('nu', '\\', self.text_func),
         'ob' : ('nu', '{', self.text_func),
         'cb' : ('nu', '}', self.text_func),
-        #'line' : ('nu', ' ', self.text_func), calibre
         # paragraph formatting => pf
         'page' : ('pf', 'page-break', self.default_func),
         'par' : ('pf', 'par-end___', self.default_func),
@@ -231,11 +230,15 @@ class ProcessTokens:
         'trhdr' : ('tb', 'row-header', self.default_func),
         # preamble => pr
         # document information => di
+        # TODO integrate \userprops
         'info' : ('di', 'doc-info__', self.default_func),
+        'title' : ('di', 'title_____', self.default_func),
         'author' : ('di', 'author____', self.default_func),
         'operator' : ('di', 'operator__', self.default_func),
-        'title' : ('di', 'title_____', self.default_func),
+        'manager' : ('di', 'manager___', self.default_func),
+        'company' : ('di', 'company___', self.default_func),
         'keywords' : ('di', 'keywords__', self.default_func),
+        'category' : ('di', 'category__', self.default_func),
         'doccomm' : ('di', 'doc-notes_', self.default_func),
         'comment' : ('di', 'doc-notes_', self.default_func),
         'subject' : ('di', 'subject___', self.default_func),
@@ -244,11 +247,19 @@ class ProcessTokens:
         'mo' : ('di', 'month_____', self.default_func),
         'dy' : ('di', 'day_______', self.default_func),
         'min' : ('di', 'minute____', self.default_func),
+        'sec' : ('di', 'second____', self.default_func),
         'revtim' : ('di', 'revis-time', self.default_func),
+        'edmins' : ('di', 'edit-time_', self.default_func),
+        'printim' : ('di', 'print-time', self.default_func),
+        'buptim' : ('di', 'backuptime', self.default_func),
         'nofwords' : ('di', 'num-of-wor', self.default_func),
         'nofchars' : ('di', 'num-of-chr', self.default_func),
+        'nofcharsws' : ('di', 'numofchrws', self.default_func),
         'nofpages' : ('di', 'num-of-pag', self.default_func),
-        'edmins' : ('di', 'edit-time_', self.default_func),
+        'version' : ('di', 'version___', self.default_func),
+        'vern' : ('di', 'intern-ver', self.default_func),
+        'hlinkbase' : ('di', 'linkbase__', self.default_func),
+        'id' : ('di', 'internalID', self.default_func),
         # headers and footers => hf
         'headerf' : ('hf', 'head-first', self.default_func),
         'headerl' : ('hf', 'head-left_', self.default_func),
@@ -605,7 +616,7 @@ class ProcessTokens:
     def ms_sub_func(self, pre, token, num):
         return 'tx<mc<__________<%s\n' % token
 
-    def hardline_func(self, pre, token, num):
+    def direct_conv_func(self, pre, token, num):
         return 'mi<tg<empty_____<%s\n' % token
 
     def default_func(self, pre, token, num):
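
direct_conv_func (renamed from hardline_func) emits a single empty-tag token, so for example \line becomes a hardline-break element in one step; standalone (Python 2):

    def direct_conv_func(pre, token, num):
        return 'mi<tg<empty_____<%s\n' % token

    print direct_conv_func('mi', 'hardline-break', None),
    # mi<tg<empty_____<hardline-break
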
@ -27,11 +27,13 @@ class Tokenize:
|
|||||||
bug_handler,
|
bug_handler,
|
||||||
copy = None,
|
copy = None,
|
||||||
run_level = 1,
|
run_level = 1,
|
||||||
):
|
# out_file = None,
|
||||||
|
):
|
||||||
self.__file = in_file
|
self.__file = in_file
|
||||||
self.__bug_handler = bug_handler
|
self.__bug_handler = bug_handler
|
||||||
self.__copy = copy
|
self.__copy = copy
|
||||||
self.__write_to = tempfile.mktemp()
|
self.__write_to = tempfile.mktemp()
|
||||||
|
# self.__out_file = out_file
|
||||||
self.__compile_expressions()
|
self.__compile_expressions()
|
||||||
#variables
|
#variables
|
||||||
self.__uc_char = 0
|
self.__uc_char = 0
|
||||||
@ -113,6 +115,8 @@ class Tokenize:
|
|||||||
|
|
||||||
def __sub_reg_split(self,input_file):
|
def __sub_reg_split(self,input_file):
|
||||||
input_file = self.__replace_spchar.mreplace(input_file)
|
input_file = self.__replace_spchar.mreplace(input_file)
|
||||||
|
# this is for older RTF
|
||||||
|
input_file = self.__par_exp.sub('\n\\par \n', input_file)
|
||||||
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
|
||||||
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
|
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
|
||||||
#remove \n in bin data
|
#remove \n in bin data
|
||||||
@ -127,7 +131,7 @@ class Tokenize:
|
|||||||
# this is for older RTF
|
# this is for older RTF
|
||||||
#line = re.sub(self.__par_exp, '\\par ', line)
|
#line = re.sub(self.__par_exp, '\\par ', line)
|
||||||
#return filter(lambda x: len(x) > 0, \
|
#return filter(lambda x: len(x) > 0, \
|
||||||
#(self.__remove_line.sub('', x) for x in tokens))
|
#(self.__remove_line.sub('', x) for x in tokens))
|
||||||
|
|
||||||
def __compile_expressions(self):
|
def __compile_expressions(self):
|
||||||
SIMPLE_RPL = {
|
SIMPLE_RPL = {
|
||||||
@ -153,8 +157,6 @@ class Tokenize:
|
|||||||
# put a backslash in front of to eliminate special cases and
|
# put a backslash in front of to eliminate special cases and
|
||||||
# make processing easier
|
# make processing easier
|
||||||
"}": "\\}",
|
"}": "\\}",
|
||||||
# this is for older RTF
|
|
||||||
r'\\$': '\\par ',
|
|
||||||
}
|
}
|
||||||
self.__replace_spchar = MReplace(SIMPLE_RPL)
|
self.__replace_spchar = MReplace(SIMPLE_RPL)
|
||||||
#add ;? in case of char following \u
|
#add ;? in case of char following \u
|
||||||
@ -168,10 +170,12 @@ class Tokenize:
         #why keep backslash whereas \is replaced before?
         #remove \n from endline char
         self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #this is for old RTF
+        self.__par_exp = re.compile(r'\\\n+')
+        # self.__par_exp = re.compile(r'\\$')
         #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
         #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
         #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
-        #self.__par_exp = re.compile(r'\\$')
         #self.__remove_line = re.compile(r'\n+')
         #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
         ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
@ -199,7 +203,24 @@ class Tokenize:
         copy_obj = copy.Copy(bug_handler = self.__bug_handler)
         if self.__copy:
             copy_obj.copy_file(self.__write_to, "tokenize.data")
+        # if self.__out_file:
+        #     self.__file = self.__out_file
         copy_obj.rename(self.__write_to, self.__file)
         os.remove(self.__write_to)

     #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+#     if len(args) < 1:
+#         print 'No file'
+#         return
+#     file = 'data_tokens.txt'
+#     if len(args) == 3:
+#         file = args[2]
+#     to = Tokenize(args[1], Exception, out_file = file)
+#     to.tokenize()
+
+# if __name__ == '__main__':
+#     sys.exit(main())
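Editor's note: the Tokenize hunks above move old-RTF paragraph handling (a lone backslash at end of line) out of the SIMPLE_RPL table and into a dedicated __par_exp regex that runs before the split. A minimal, self-contained sketch of that replace-then-split strategy follows; the mreplace helper here is a hypothetical stand-in for calibre's MReplace, while the regexes are copied from the diff.

    import re

    SIMPLE_RPL = {"}": "\\}"}   # escape specials up front, as the hunk's comment says

    def mreplace(text, table):
        # hypothetical stand-in for calibre's MReplace: one regex pass over all keys
        pat = re.compile("|".join(re.escape(k) for k in table))
        return pat.sub(lambda m: table[m.group(0)], text)

    par_exp = re.compile(r'\\\n+')   # old-style RTF: backslash followed by newline(s)
    splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")

    def split_tokens(rtf):
        rtf = mreplace(rtf, SIMPLE_RPL)
        # '\\\\par' in the template yields a literal backslash before 'par'
        rtf = par_exp.sub('\n\\\\par \n', rtf)
        return [t for t in splitexp.split(rtf) if t]

    print(split_tokens('hello\\b0 world\\\n\nbye'))
    # -> ['hello', '\\b0 ', 'world', '\n', '\\par ', '\n', 'bye']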
@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
             log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

         # Dehyphenate
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         txt = dehyphenator(txt,'txt', length)

         # We don't check for block because the processor assumes block.
@ -137,11 +137,6 @@ class TXTInput(InputFormatPlugin):
         setattr(options, 'format_scene_breaks', True)
         setattr(options, 'dehyphenate', True)

-        # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'txt_cleanup', length)
-        html = dehyphenator(html,'html_cleanup', length)
-
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
         for opt in html_input.options:
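Editor's note: the TXT input change passes the conversion's verbosity and log through to the Dehyphenator and drops the redundant cleanup passes. As background, a toy dehyphenator (not calibre's, which also checks candidate joins against words seen elsewhere in the document) can be sketched as:

    import re

    def dehyphenate(text):
        # toy version: rejoin word fragments split across a line break by a hyphen
        return re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

    print(dehyphenate('an exam-\nple of hyphen-\nated text'))
    # -> 'an example of hyphenated text'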
@ -505,7 +505,7 @@ class FileDialog(QObject):
         self.selected_files = []
         if mode == QFileDialog.AnyFile:
             f = unicode(QFileDialog.getSaveFileName(parent, title, initial_dir, ftext, ""))
-            if f and os.path.exists(f):
+            if f:
                 self.selected_files.append(f)
         elif mode == QFileDialog.ExistingFile:
             f = unicode(QFileDialog.getOpenFileName(parent, title, initial_dir, ftext, ""))
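Editor's note: the os.path.exists(f) test was a bug for a save dialog, since the user is usually naming a file that does not exist yet, so valid selections were silently dropped. The guard reduces to:

    f = '/tmp/new-catalog.epub'   # hypothetical result of a save dialog
    selected_files = []
    if f:   # the old test, os.path.exists(f), rejects targets that do not exist yet
        selected_files.append(f)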
@ -28,7 +28,7 @@ class GenerateCatalogAction(InterfaceAction):

         if not ids:
             return error_dialog(self.gui, _('No books selected'),
-                    _('No books selected to generate catalog for'),
+                    _('No books selected for catalog generation'),
                     show=True)

         db = self.gui.library_view.model().db
@ -55,9 +55,9 @@ class GenerateCatalogAction(InterfaceAction):

     def catalog_generated(self, job):
         if job.result:
-            # Search terms nulled catalog results
-            return error_dialog(self.gui, _('No books found'),
-                    _("No books to catalog\nCheck job details"),
+            # Error during catalog generation
+            return error_dialog(self.gui, _('Catalog generation terminated'),
+                    job.result,
                     show=True)
         if job.failed:
             return self.gui.job_exception(job)
@ -94,7 +94,7 @@ class EditMetadataAction(InterfaceAction):
             get_social_metadata = config['get_social_metadata']
         else:
             get_social_metadata = set_social_metadata
-        from calibre.gui2.metadata import DoDownload
+        from calibre.gui2.metadata.bulk_download import DoDownload
         if set_social_metadata is not None and set_social_metadata:
             x = _('social metadata')
         else:
9
src/calibre/gui2/metadata/__init__.py
Normal file
@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+
@ -730,7 +730,7 @@ class TagsModel(QAbstractItemModel): # {{{
             else:
                 collapse_model = 'partition'
                 collapse_template = tweaks['categories_collapsed_popularity_template']
-        collapse_letter = None
+        collapse_letter = collapse_letter_sk = None

         for i, r in enumerate(self.row_map):
             if self.hidden_categories and self.categories[i] in self.hidden_categories:
@ -782,8 +782,17 @@ class TagsModel(QAbstractItemModel): # {{{
                     ts = tag.sort
                     if not ts:
                         ts = ' '
-                    if upper(ts[0]) != collapse_letter:
+                    try:
+                        sk = sort_key(ts)[0]
+                    except:
+                        sk = ts[0]
+
+                    if sk != collapse_letter_sk:
                         collapse_letter = upper(ts[0])
+                        try:
+                            collapse_letter_sk = sort_key(collapse_letter)[0]
+                        except:
+                            collapse_letter_sk = collapse_letter
                         sub_cat = TagTreeItem(parent=category,
                                 data = collapse_letter,
                                 category_icon = category_node.icon,
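Editor's note: the point of this change is that partition letters are now compared by the first unit of a collation key rather than by the raw first character, so tags differing only in accents or case land under the same letter node. A standalone sketch of the idea; sort_key stands in for calibre.utils.icu.sort_key, and the bytes fallback is an assumption so the sketch runs outside calibre (it does no accent folding, which is exactly why ICU is preferred).

    from itertools import groupby

    try:
        from calibre.utils.icu import sort_key   # available inside calibre
    except ImportError:
        sort_key = lambda s: s.encode('utf-8')   # crude fallback for this sketch

    def bucket(ts):
        try:
            return sort_key(ts)[0]   # first unit of the collation key
        except Exception:
            return ts[0]             # plain first character as a last resort

    tags = sorted([u'Éclair', u'apple', u'Echo'], key=sort_key)
    groups = [(k, list(g)) for k, g in groupby(tags, key=bucket)]
    print(groups)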
@ -386,11 +386,13 @@ class LineEditECM(object):
         action_lower_case = case_menu.addAction(_('Lower Case'))
         action_swap_case = case_menu.addAction(_('Swap Case'))
         action_title_case = case_menu.addAction(_('Title Case'))
+        action_capitalize = case_menu.addAction(_('Capitalize'))

         self.connect(action_upper_case, SIGNAL('triggered()'), self.upper_case)
         self.connect(action_lower_case, SIGNAL('triggered()'), self.lower_case)
         self.connect(action_swap_case, SIGNAL('triggered()'), self.swap_case)
         self.connect(action_title_case, SIGNAL('triggered()'), self.title_case)
+        self.connect(action_capitalize, SIGNAL('triggered()'), self.capitalize)

         menu.addMenu(case_menu)
         menu.exec_(event.globalPos())
@ -408,6 +410,10 @@ class LineEditECM(object):
         from calibre.utils.titlecase import titlecase
         self.setText(titlecase(unicode(self.text())))

+    def capitalize(self):
+        from calibre.utils.icu import capitalize
+        self.setText(capitalize(unicode(self.text())))
+

 class EnLineEdit(LineEditECM, QLineEdit):

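Editor's note: the two hunks follow the widget's existing pattern, adding a QAction to the context menu's case submenu and wiring its old-style triggered() signal to a new handler. A minimal model of that pattern, assuming the PyQt4-era API used throughout this file:

    from PyQt4.QtCore import SIGNAL

    def add_case_action(widget, case_menu, label, handler):
        # same shape as the action_capitalize lines above
        action = case_menu.addAction(label)
        widget.connect(action, SIGNAL('triggered()'), handler)
        return action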
@ -1144,7 +1144,9 @@ class EPUB_MOBI(CatalogPlugin):
     def error(self):
         def fget(self):
             return self.__error
-        return property(fget=fget)
+        def fset(self, val):
+            self.__error = val
+        return property(fget=fget,fset=fset)

     @dynamic_property
     def generateForKindle(self):
         def fget(self):
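Editor's note: the error property goes from read-only to read/write so lower-level code can record a message for the GUI to display. Outside calibre's @dynamic_property decorator, the same change is the standard property pattern:

    class Catalog(object):
        def __init__(self):
            self.__error = None

        @property
        def error(self):
            return self.__error

        @error.setter          # the new fset in the diff
        def error(self, val):
            self.__error = val

    c = Catalog()
    c.error = 'catalog generation failed'   # now allowed; raised AttributeError before
    print(c.error)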
@ -1411,6 +1413,88 @@ class EPUB_MOBI(CatalogPlugin):
             except:
                 pass

+    def fetchBooksByAuthor(self):
+        '''
+        Generate a list of titles sorted by author from the database
+        return = Success
+        '''
+
+        self.updateProgressFullStep("Sorting database")
+
+        '''
+        # Sort titles case-insensitive, by author
+        self.booksByAuthor = sorted(self.booksByTitle,
+                         key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
+        '''
+
+        self.booksByAuthor = list(self.booksByTitle)
+        self.booksByAuthor.sort(self.author_compare)
+
+        if False and self.verbose:
+            self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
+            self.opts.log.info(" %-30s %-20s %s" % ('title', 'series', 'series_index'))
+            for title in self.booksByAuthor:
+                self.opts.log.info((u" %-30s %-20s%5s " % \
+                                   (title['title'][:30],
+                                    title['series'][:20] if title['series'] else '',
+                                    title['series_index'],
+                                    )).encode('utf-8'))
+            raise SystemExit
+
+        # Build the unique_authors set from existing data
+        authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
+
+        # authors[] contains a list of all book authors, with multiple entries for multiple books by author
+        #   authors[]: (([0]:friendly  [1]:sort))
+        # unique_authors[]: (([0]:friendly  [1]:sort  [2]:book_count))
+        books_by_current_author = 0
+        current_author = authors[0]
+        multiple_authors = False
+        unique_authors = []
+        for (i,author) in enumerate(authors):
+            if author != current_author:
+                # Note that current_author and author are tuples: (friendly, sort)
+                multiple_authors = True
+
+            if author != current_author and i:
+                # Warn, exit if friendly matches previous, but sort doesn't
+                if author[0] == current_author[0]:
+                    error_msg = _('''
+\n*** Metadata error ***
+Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.
+Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
+then rebuild the catalog.\n''').format(author[0])
+
+                    self.opts.log.warn(error_msg)
+                    self.error = error_msg
+                    return False
+
+                # New author, save the previous author/sort/count
+                unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                       books_by_current_author))
+                current_author = author
+                books_by_current_author = 1
+            elif i==0 and len(authors) == 1:
+                # Allow for single-book lists
+                unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                       books_by_current_author))
+            else:
+                books_by_current_author += 1
+        else:
+            # Add final author to list or single-author dataset
+            if (current_author == author and len(authors) > 1) or not multiple_authors:
+                unique_authors.append((current_author[0], icu_title(current_author[1]),
+                                       books_by_current_author))
+
+        if False and self.verbose:
+            self.opts.log.info("\nfetchBooksByauthor(): %d unique authors" % len(unique_authors))
+            for author in unique_authors:
+                self.opts.log.info((u" %-50s %-25s %2d" % (author[0][0:45], author[1][0:20],
+                                                           author[2])).encode('utf-8'))
+
+        self.authors = unique_authors
+        return True
+
     def fetchBooksByTitle(self):

         self.updateProgressFullStep("Fetching database")
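Editor's note: fetchBooksByAuthor (moved up from below, and changed to report errors through self.error instead of a log-only warning) is essentially a run-length pass over an author-sorted list. The same bookkeeping, sketched with itertools.groupby as a hypothetical helper rather than the plugin's own code:

    from itertools import groupby

    def unique_authors(authors):
        # authors: (friendly, sort) tuples, already ordered by author
        out = [(k[0], k[1], len(list(g))) for k, g in groupby(authors)]
        # two different sort values for one friendly name is the metadata
        # error the diff reports via self.error
        friendly = [a[0] for a in out]
        if len(friendly) != len(set(friendly)):
            raise ValueError("Inconsistent Author Sort values")
        return out

    print(unique_authors([('A. Adams', 'Adams, A.')] * 2 +
                         [('B. Brown', 'Brown, B.')]))
    # -> [('A. Adams', 'Adams, A.', 2), ('B. Brown', 'Brown, B.', 1)]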
@ -1562,90 +1646,9 @@ class EPUB_MOBI(CatalogPlugin):
                                       title['title_sort'][0:40])).decode('mac-roman'))
             return True
         else:
+            self.error = _("No books found to catalog.\nCheck 'Excluded books' criteria in E-book options.")
             return False

-    def fetchBooksByAuthor(self):
-        '''
-        Generate a list of titles sorted by author from the database
-        return = Success
-        '''
-
-        self.updateProgressFullStep("Sorting database")
-
-        '''
-        # Sort titles case-insensitive, by author
-        self.booksByAuthor = sorted(self.booksByTitle,
-                         key=lambda x:(x['author_sort'].upper(), x['author_sort'].upper()))
-        '''
-
-        self.booksByAuthor = list(self.booksByTitle)
-        self.booksByAuthor.sort(self.author_compare)
-
-        if False and self.verbose:
-            self.opts.log.info("fetchBooksByAuthor(): %d books" % len(self.booksByAuthor))
-            self.opts.log.info(" %-30s %-20s %s" % ('title', 'series', 'series_index'))
-            for title in self.booksByAuthor:
-                self.opts.log.info((u" %-30s %-20s%5s " % \
-                                   (title['title'][:30],
-                                    title['series'][:20] if title['series'] else '',
-                                    title['series_index'],
-                                    )).encode('utf-8'))
-            raise SystemExit
-
-        # Build the unique_authors set from existing data
-        authors = [(record['author'], record['author_sort'].capitalize()) for record in self.booksByAuthor]
-
-        # authors[] contains a list of all book authors, with multiple entries for multiple books by author
-        #   authors[]: (([0]:friendly  [1]:sort))
-        # unique_authors[]: (([0]:friendly  [1]:sort  [2]:book_count))
-        books_by_current_author = 0
-        current_author = authors[0]
-        multiple_authors = False
-        unique_authors = []
-        for (i,author) in enumerate(authors):
-            if author != current_author:
-                # Note that current_author and author are tuples: (friendly, sort)
-                multiple_authors = True
-
-            if author != current_author and i:
-                # Warn, exit if friendly matches previous, but sort doesn't
-                if author[0] == current_author[0]:
-                    error_msg = _('''
-\n*** Metadata error ***
-Inconsistent Author Sort values for Author '{0}', unable to continue building catalog.
-Select all books by '{0}', apply correct Author Sort value in Edit Metadata dialog,
-then rebuild the catalog.
-*** Terminating catalog generation ***\n''').format(author[0])
-
-                    self.opts.log.warn(error_msg)
-                    return False
-
-                # New author, save the previous author/sort/count
-                unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                       books_by_current_author))
-                current_author = author
-                books_by_current_author = 1
-            elif i==0 and len(authors) == 1:
-                # Allow for single-book lists
-                unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                       books_by_current_author))
-            else:
-                books_by_current_author += 1
-        else:
-            # Add final author to list or single-author dataset
-            if (current_author == author and len(authors) > 1) or not multiple_authors:
-                unique_authors.append((current_author[0], icu_title(current_author[1]),
-                                       books_by_current_author))
-
-        if False and self.verbose:
-            self.opts.log.info("\nfetchBooksByauthor(): %d unique authors" % len(unique_authors))
-            for author in unique_authors:
-                self.opts.log.info((u" %-50s %-25s %2d" % (author[0][0:45], author[1][0:20],
-                                                           author[2])).encode('utf-8'))
-
-        self.authors = unique_authors
-        return True
-
     def fetchBookmarks(self):
         '''
         Collect bookmarks for catalog entries
@ -5069,6 +5072,8 @@ then rebuild the catalog.
             abort_after_input_dump=False)
         plumber.merge_ui_recommendations(recommendations)
         plumber.run()
-        return 0
+        # returns to gui2.actions.catalog:catalog_generated()
+        return None
     else:
-        return 1
+        # returns to gui2.actions.catalog:catalog_generated()
+        return catalog.error
@ -693,8 +693,12 @@ def command_catalog(args, dbpath):
     }

     with plugin:
-        plugin.run(args[1], opts, get_db(dbpath, opts))
-    return 0
+        ret = plugin.run(args[1], opts, get_db(dbpath, opts))
+    if ret is None:
+        ret = 0
+    else:
+        ret = 1
+    return ret

 # end of GR additions

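Editor's note: together with the plugin change above, the CLI contract is now that run() returns None on success and an error message otherwise, which command_catalog folds back into a process exit code:

    def exit_code(ret):
        # None -> 0 (success); any error value (e.g. a message string) -> 1
        return 0 if ret is None else 1

    assert exit_code(None) == 0
    assert exit_code('No books found to catalog.') == 1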
@ -690,11 +690,14 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
             mi = Metadata(None)

         aut_list = row[fm['au_map']]
-        aut_list = [p.split(':::') for p in aut_list.split(':#:')]
+        if aut_list:
+            aut_list = [p.split(':::') for p in aut_list.split(':#:') if p]
+        else:
+            aut_list = []
         aum = []
         aus = {}
         for (author, author_sort) in aut_list:
-            aum.append(author)
+            aum.append(author.replace('|', ','))
             aus[author] = author_sort.replace('|', ',')
         mi.title = row[fm['title']]
         mi.authors = aum
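Editor's note: the au_map column packs authors as 'name:::sort' pairs joined by ':#:', with commas stored as '|'; the fix guards against an empty column and skips empty segments that would crash the tuple unpack. In isolation:

    def parse_au_map(raw):
        # raw is the packed au_map column; may be None or empty
        if raw:
            pairs = [p.split(':::') for p in raw.split(':#:') if p]
        else:
            pairs = []
        aum, aus = [], {}
        for author, author_sort in pairs:
            aum.append(author.replace('|', ','))
            aus[author] = author_sort.replace('|', ',')
        return aum, aus

    # a trailing separator would otherwise produce an empty pair and crash
    print(parse_au_map('Tolkien| J. R. R.:::Tolkien| J. R. R.:#:'))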
@ -437,6 +437,15 @@ My antivirus program claims |app| is a virus/trojan?

 Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.

+How do I backup |app|?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The most important thing to backup is the |app| library folder, that contains all your books and metadata. This is the folder you chose for your |app| library when you ran |app| for the first time. You can get the path to the library folder by clicking the |app| icon on the main toolbar. You must backup this complete folder with all its files and sub-folders.
+
+You can switch |app| to using a backed up library folder by simply clicking the |app| icon on the toolbar and choosing your backup library folder.
+
+If you want to backup the |app| configuration/plugins, you have to backup the config directory. You can find this config directory via :guilabel:`Preferences->Miscellaneous`. Note that restoring configuration directories is not officially supported, but should work in most cases. Just copy the contents of the backup directory into the current configuration directory to restore.
+
 How do I use purchased EPUB books with |app|?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Most purchased EPUB books have `DRM <http://wiki.mobileread.com/wiki/DRM>`_. This prevents |app| from opening them. You can still use |app| to store and transfer them to your e-book reader. First, you must authorize your reader on a windows machine with Adobe Digital Editions. Once this is done, EPUB books transferred with |app| will work fine on your reader. When you purchase an epub book from a website, you will get an ".acsm" file. This file should be opened with Adobe Digital Editions, which will then download the actual ".epub" e-book. The e-book file will be stored in the folder "My Digital Editions", from where you can add it to |app|.
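Editor's note: a hedged illustration of the backup advice added above; the paths are hypothetical, and the library folder is whatever you chose on first run.

    import shutil

    # copy the whole library folder, sub-folders included
    # (shutil.copytree requires that the destination not exist yet)
    shutil.copytree('/home/user/Calibre Library', '/mnt/backup/Calibre Library')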
@ -77,7 +77,7 @@ class FormatterFunction(object):
                 exc_traceback)[-2:]).replace('\n', '')
             return _('Exception ' + info)

+all_builtin_functions = []
 class BuiltinFormatterFunction(FormatterFunction):
     def __init__(self):
         formatter_functions.register_builtin(self)
@ -88,6 +88,7 @@ class BuiltinFormatterFunction(FormatterFunction):
         except:
             lines = []
         self.program_text = ''.join(lines)
+        all_builtin_functions.append(self)

 class BuiltinStrcmp(BuiltinFormatterFunction):
     name = 'strcmp'
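Editor's note: the new module-level list is a simple construction-time registry, so every built-in formatter function records itself as it is instantiated and other code can enumerate them. Reduced to its essentials:

    all_builtin_functions = []

    class BuiltinFormatterFunction(object):
        def __init__(self):
            # record each built-in as it is constructed
            all_builtin_functions.append(self)

    class BuiltinStrcmp(BuiltinFormatterFunction):
        name = 'strcmp'

    BuiltinStrcmp()
    print(len(all_builtin_functions))   # -> 1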
@ -80,7 +80,7 @@ def icu_case_sensitive_strcmp(collator, a, b):

 def icu_capitalize(s):
     s = lower(s)
-    return s.replace(s[0], upper(s[0]), 1)
+    return s.replace(s[0], upper(s[0]), 1) if s else s

 load_icu()
 load_collator()
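Editor's note: the one-line fix guards the empty string, since s[0] raises IndexError on '', so icu_capitalize('') now returns '' instead of crashing. Behaviourally, as a plain-Python model without ICU's locale-aware lower/upper:

    def capitalize(s):
        s = s.lower()
        # replace the first occurrence of the first character, i.e. position 0
        return s.replace(s[0], s[0].upper(), 1) if s else s

    assert capitalize('') == ''
    assert capitalize('hELLO') == 'Hello'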