Merge from trunk

2026-06-04 21:15:24 -04:00 · 2011-01-13 12:42:28 +00:00
parent 969965f7c1 f48c31c493
commit 3cdc088f0d
43 changed files with 2233 additions and 1526 deletions
@@ -8,12 +8,13 @@ __docformat__ = 'restructuredtext en'
 globeandmail.com
 '''

+import re
+
 from calibre.web.feeds.news import BasicNewsRecipe

 class AdvancedUserRecipe1287083651(BasicNewsRecipe):
    title          = u'Globe & Mail'
-    __license__   = 'GPL v3'
-    __author__ = 'Szing'
+    __author__ = 'Kovid Goyal'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
@@ -38,24 +39,19 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
      (u'Sports', u'http://www.theglobeandmail.com/auto/?service=rss')
    ]

-    keep_only_tags = [
-      dict(name='h1'),
-      dict(name='h2', attrs={'id':'articletitle'}),
-      dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
-      dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
-      dict(name='id', attrs={'class':'article'}),
-      dict(name='table', attrs={'class':'todays-market'}),
-      dict(name='header', attrs={'id':'leadheader'})
-    ]
+    preprocess_regexps = [
+        (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
+        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
+        ]

+    remove_tags_before = dict(name='h1')
    remove_tags = [
-      dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
-    ]
-
-    #this has to be here or the text in the article appears twice.
-    remove_tags_after = [dict(id='article')]
+            dict(name='div', attrs={'id':['ShareArticles', 'topStories']}),
+            dict(href=lambda x: x and 'tracking=' in x),
+            {'class':['articleTools', 'pagination', 'Ads', 'topad',
+                'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

    #Use the mobile version rather than the web version
    def print_version(self, url):
-        return url + '&service=mobile'
+        # Drop the query string (if any) and request the mobile rendering.
+        # partition() is used rather than rpartition() so that a URL without
+        # any '?' is kept whole instead of collapsing to the empty string.
+        return url.partition('?')[0] + '?service=mobile'

@@ -4,7 +4,6 @@ __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 msnbc.msn.com
 '''

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class MsNBC(BasicNewsRecipe):
@@ -19,7 +18,7 @@ class MsNBC(BasicNewsRecipe):
    publisher              = 'msnbc.com'
    category               = 'news, USA, world'
    language               = 'en'
-    extra_css              = """ 
+    extra_css              = """
                                body{ font-family: Georgia,Times,serif }
                                .hide{display: none}
                                .caption{font-family: Arial,sans-serif; font-size: x-small}
@@ -44,7 +43,7 @@ class MsNBC(BasicNewsRecipe):
                     ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
                   ]
    remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
-    
+
    remove_tags      = [
                          dict(name=['iframe','object','link','embed','meta','table'])
                         ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
@@ -70,7 +69,7 @@ class MsNBC(BasicNewsRecipe):
            if item.has_key('id') and item['id'].startswith('vine-'):
               item.extract()
            if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
-               item.extract()            
+               item.extract()
        for item in soup.body.findAll('img'):
            if not item.has_key('alt'):
               item['alt'] = 'image'
@@ -83,6 +82,6 @@ class MsNBC(BasicNewsRecipe):
        for alink in soup.findAll('a'):
            if alink.string is not None:
               tstr = alink.string
-               alink.replaceWith(tstr)    
+               alink.replaceWith(tstr)
        return soup

@@ -287,7 +287,7 @@
                <xsl:value-of select="count(preceding::rtf:footnote) + 1"/>
                <xsl:text>]</xsl:text>
            </xsl:when>
-            <xsl:when test="(@superscript = 'true')">
+            <xsl:when test="(@superscript)">
                <xsl:element name="sup">
                    <xsl:element name="span">
                        <xsl:attribute name="class">
@@ -297,7 +297,7 @@
                    </xsl:element>
                </xsl:element>
            </xsl:when>
-            <xsl:when test="(@underscript = 'true')">
+            <xsl:when test="(@underscript or @subscript)">
                <xsl:element name="sub">
                    <xsl:element name="span">
                        <xsl:attribute name="class">
@@ -459,6 +459,18 @@ def force_unicode(obj, enc=preferred_encoding):
                        obj = obj.decode('utf-8')
    return obj

+def as_unicode(obj, enc=preferred_encoding):
+    """
+    Return obj as a unicode string.
+
+    Objects that are not byte strings are converted with unicode(), falling
+    back to str() and finally repr() when the previous conversion raises.
+    The result is then passed through force_unicode() using enc.
+    """
+    if not isbytestring(obj):
+        try:
+            obj = unicode(obj)
+        except Exception:
+            # Exception (not a bare except) so that KeyboardInterrupt and
+            # SystemExit are not swallowed by the fallback chain.
+            try:
+                obj = str(obj)
+            except Exception:
+                obj = repr(obj)
+    return force_unicode(obj, enc=enc)
+
+

 def human_readable(size):
    """ Convert a size in bytes into a human readable form """
@@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
 title_pat    = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
 author_pat   = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
 comment_pat  = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
-category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)

 def get_document_info(stream):
    """
@@ -82,61 +83,73 @@ def decode(raw, codec):

 def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
-    title, author, comment, category = None, None, None, None
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
-    if title_match:
+    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
+    else:
+        title = _('Unknown')
    author_match = author_pat.search(block)
-    if author_match:
+    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
-    comment_match = comment_pat.search(block)
-    if comment_match:
-        comment = decode(comment_match.group(1).strip(), cpg)
-    category_match = category_pat.search(block)
-    if category_match:
-        category = decode(category_match.group(1).strip(), cpg)
-    mi = MetaInformation(title, author)
+    else:
+        author = None
+    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)
-    mi.comments = comment
-    mi.category = category
+
+    comment_match = comment_pat.search(block)
+    if comment_match is not None:
+        comment = decode(comment_match.group(1).strip(), cpg)
+        mi.comments = comment
+    tags_match = tags_pat.search(block)
+    if tags_match is not None:
+        tags = decode(tags_match.group(1).strip(), cpg)
+        # The \category group stores every tag in one comma separated string
+        # (create_metadata and set_metadata write it with ', '.join), so
+        # split it back into a list of non-empty, stripped tags.
+        mi.tags = [t.strip() for t in tags.split(',') if t.strip()]
+    publisher_match = publisher_pat.search(block)
+    if publisher_match is not None:
+        publisher = decode(publisher_match.group(1).strip(), cpg)
+        mi.publisher = publisher
+
    return mi

-
 def create_metadata(stream, options):
-    md = r'{\info'
+    md = [r'{\info']
    if options.title:
        title = options.title.encode('ascii', 'ignore')
-        md += r'{\title %s}'%(title,)
+        md.append(r'{\title %s}'%(title,))
    if options.authors:
        au = options.authors
        if not isinstance(au, basestring):
            au = u', '.join(au)
        author = au.encode('ascii', 'ignore')
-        md += r'{\author %s}'%(author,)
-    if options.get('category', None):
-        category = options.category.encode('ascii', 'ignore')
-        md += r'{\category %s}'%(category,)
+        md.append(r'{\author %s}'%(author,))
    comp = options.comment if hasattr(options, 'comment') else options.comments
    if comp:
        comment = comp.encode('ascii', 'ignore')
-        md += r'{\subject %s}'%(comment,)
-    if len(md) > 6:
-        md += '}'
+        md.append(r'{\subject %s}'%(comment,))
+    if options.publisher:
+        publisher = options.publisher.encode('ascii', 'ignore')
+        md.append(r'{\manager %s}'%(publisher,))
+    if options.tags:
+        tags = u', '.join(options.tags)
+        tags = tags.encode('ascii', 'ignore')
+        md.append(r'{\category %s}'%(tags,))
+    if len(md) > 1:
+        md.append('}')
        stream.seek(0)
        src   = stream.read()
-        ans = src[:6] + md + src[6:]
+        # Every item in md is an ASCII byte string, so join them as bytes.
+        # A unicode joiner would turn the result into unicode and force an
+        # implicit ASCII decode of src, which fails on 8-bit data.
+        ans = src[:6] + ''.join(md) + src[6:]
        stream.seek(0)
        stream.write(ans)

@@ -156,7 +169,7 @@ def set_metadata(stream, options):

        base_pat = r'\{\\name(.*?)(?<!\\)\}'
        title = options.title
-        if title != None:
+        if title is not None:
            title = title.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
            if pat.search(src):
@@ -164,7 +177,7 @@ def set_metadata(stream, options):
            else:
                src = add_metadata_item(src, 'title', title)
        comment = options.comments
-        if comment != None:
+        if comment is not None:
            comment = comment.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
            if pat.search(src):
@@ -172,7 +185,7 @@ def set_metadata(stream, options):
            else:
                src = add_metadata_item(src, 'subject', comment)
        author = options.authors
-        if author != None:
+        if author is not None:
            author =  ', '.join(author)
            author = author.encode('ascii', 'ignore')
            pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
@@ -180,14 +193,23 @@ def set_metadata(stream, options):
                src = pat.sub(r'{\\author ' + author + r'}', src)
            else:
                src = add_metadata_item(src, 'author', author)
-        category = options.get('category', None)
-        if category != None:
-            category = category.encode('ascii', 'replace')
+        tags = options.tags
+        if tags is not None:
+            tags =  ', '.join(tags)
+            tags = tags.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
            if pat.search(src):
-                src = pat.sub(r'{\\category ' + category + r'}', src)
+                src = pat.sub(r'{\\category ' + tags + r'}', src)
            else:
-                src = add_metadata_item(src, 'category', category)
+                src = add_metadata_item(src, 'category', tags)
+        publisher = options.publisher
+        if publisher is not None:
+            publisher = publisher.encode('ascii', 'replace')
+            pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
+            if pat.search(src):
+                src = pat.sub(r'{\\manager ' + publisher + r'}', src)
+            else:
+                src = add_metadata_item(src, 'manager', publisher)
        stream.seek(pos + olen)
        after = stream.read()
        stream.seek(pos)
@@ -77,7 +77,15 @@ class RTFInput(InputFormatPlugin):

    def generate_xml(self, stream):
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
-        ofile = 'out.xml'
+        ofile = 'dataxml.xml'
+        run_lev, debug_dir = 1, None
+        if getattr(self.opts, 'debug_pipeline', None) is not None:
+            # Debug output is best effort: the parser is only switched to
+            # debug mode when the directory could actually be created.
+            # (Previously os.mkdir() was called with debug_dir still None,
+            # so it always failed and debug mode was never enabled.)
+            try:
+                os.mkdir('rtfdebug')
+            except OSError:
+                pass
+            else:
+                debug_dir = 'rtfdebug'
+                run_lev = 4
        parser = ParseRtf(
            in_file    = stream,
            out_file   = ofile,
@@ -115,43 +123,45 @@ class RTFInput(InputFormatPlugin):

            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs = 1,
+
+            #debug
+            deb_dir = debug_dir,
+            run_level = run_lev,
        )
        parser.parse_rtf()
-        ans = open('out.xml').read()
-        os.remove('out.xml')
-        return ans
+        with open(ofile, 'rb') as f:
+            return f.read()

    def extract_images(self, picts):
+        import imghdr
        self.log('Extracting images...')

+        with open(picts, 'rb') as f:
+            raw = f.read()
+        picts = filter(len, re.findall(r'\{\\pict([^}]+)\}', raw))
+        hex = re.compile(r'[^a-fA-F0-9]')
+        encs = [hex.sub('', pict) for pict in picts]
+
        count = 0
-        raw = open(picts, 'rb').read()
-        starts = []
-        for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
-            starts.append(match.start(1))
-
        imap = {}
-
-        for start in starts:
-            pos, bc = start, 1
-            while bc > 0:
-                if raw[pos] == '}': bc -= 1
-                elif raw[pos] == '{': bc += 1
-                pos += 1
-            pict = raw[start:pos+1]
-            enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
+        for enc in encs:
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = enc.decode('hex')
+            fmt = imghdr.what(None, data)
+            if fmt is None:
+                fmt = 'wmf'
            count += 1
-            name = (('%4d'%count).replace(' ', '0'))+'.wmf'
-            open(name, 'wb').write(data)
+            name = '%04d.%s' % (count, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
            imap[count] = name
            #open(name+'.hex', 'wb').write(enc)
        return self.convert_images(imap)

    def convert_images(self, imap):
-        for count, val in imap.items():
+        self.default_img = None
+        for count, val in imap.iteritems():
            try:
                imap[count] = self.convert_image(val)
            except:
@@ -159,6 +169,8 @@ class RTFInput(InputFormatPlugin):
        return imap

    def convert_image(self, name):
+        if not name.endswith('.wmf'):
+            return name
        try:
            return self.rasterize_wmf(name)
        except:
@@ -167,21 +179,22 @@ class RTFInput(InputFormatPlugin):

    def replace_wmf(self, name):
        from calibre.ebooks import calibre_cover
-        data = calibre_cover('Conversion of WMF images is not supported',
+        if self.default_img is None:
+            self.default_img = calibre_cover('Conversion of WMF images is not supported',
            'Use Microsoft Word or OpenOffice to save this RTF file'
            ' as HTML and convert that in calibre.', title_size=36,
            author_size=20)
        name = name.replace('.wmf', '.jpg')
        with open(name, 'wb') as f:
-            f.write(data)
+            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
+        # Read the file called name, pass its bytes through wmf_unwrap() and
+        # write the result next to it with '.wmf' replaced by '.png'.
+        # Returns the new file name.
+        # NOTE(review): presumably wmf_unwrap() yields PNG data extracted
+        # from the WMF wrapper -- confirm against calibre.utils.wmf.parse.
-        from calibre.utils.wmf import extract_raster_image
+        from calibre.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
-        data = extract_raster_image(data)
-        name = name.replace('.wmf', '.jpg')
+        data = wmf_unwrap(data)
+        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name
@@ -212,27 +225,27 @@ class RTFInput(InputFormatPlugin):
        css += '\n'+'\n'.join(font_size_classes)
        css += '\n' +'\n'.join(color_classes)

-        for cls, val in border_styles.items():
+        for cls, val in border_styles.iteritems():
            css += '\n\n.%s {\n%s\n}'%(cls, val)

        with open('styles.css', 'ab') as f:
            f.write(css)

-    def preprocess(self, fname):
-        self.log('\tPreprocessing to convert unicode characters')
-        try:
-            data = open(fname, 'rb').read()
-            from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
-            tokenizer = RtfTokenizer(data)
-            tokens = RtfTokenParser(tokenizer.tokens)
-            data = tokens.toRTF()
-            fname = 'preprocessed.rtf'
-            with open(fname, 'wb') as f:
-                f.write(data)
-        except:
-            self.log.exception(
-            'Failed to preprocess RTF to convert unicode sequences, ignoring...')
-        return fname
+    # def preprocess(self, fname):
+        # self.log('\tPreprocessing to convert unicode characters')
+        # try:
+            # data = open(fname, 'rb').read()
+            # from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+            # tokenizer = RtfTokenizer(data)
+            # tokens = RtfTokenParser(tokenizer.tokens)
+            # data = tokens.toRTF()
+            # fname = 'preprocessed.rtf'
+            # with open(fname, 'wb') as f:
+                # f.write(data)
+        # except:
+            # self.log.exception(
+            # 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+        # return fname

    def convert_borders(self, doc):
        border_styles = []
@@ -269,17 +282,14 @@ class RTFInput(InputFormatPlugin):
        self.log = log
        self.log('Converting RTF to XML...')
        #Name of the preprocesssed RTF file
-        fname = self.preprocess(stream.name)
+        # fname = self.preprocess(stream.name)
        try:
-            xml = self.generate_xml(fname)
+            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException, e:
+            # Report unsupported RTF constructs with a user-facing message;
+            # a bare re-raise here would make the ValueError unreachable.
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

-        '''dataxml = open('dataxml.xml', 'w')
-        dataxml.write(xml)
-        dataxml.close'''
-
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            imap = {}
@@ -17,7 +17,8 @@
 #########################################################################
 # $Revision: 1.41 $
 # $Date: 2006/03/24 23:50:07 $
-import sys,os
+import sys, os
+
 from calibre.ebooks.rtf2xml import headings_to_sections, \
    line_endings, footnote, fields_small, default_encoding, \
    make_lists, preamble_div, header, colors, group_borders, \
@@ -90,7 +91,6 @@ class ParseRtf:
                out_file = '',
                out_dir = None,
                dtd = '',
-                #debug = 0, #why? calibre
                deb_dir = None,
                convert_symbol = None,
                convert_wingdings = None,
@@ -107,6 +107,7 @@ class ParseRtf:
                no_dtd = 0,
                char_data = '',
                ):
+
        """
        Requires:
        'file' --file to parse
@@ -119,12 +120,11 @@ class ParseRtf:
            script tries to output to directory where is script is exectued.)
            'deb_dir' --debug directory. If a debug_dir is provided, the script
            will copy each run through as a file to examine in the debug_dir
-            'perl_script'--use perl to make tokens. This runs just a bit faster.
-            (I will probably phase this out.)
            'check_brackets' -- make sure the brackets match up after each run
            through a file. Only for debugging.
        Returns: Nothing
        """
+
        self.__file = in_file
        self.__out_file = out_file
        self.__out_dir = out_dir
@@ -132,7 +132,7 @@ class ParseRtf:
        self.__dtd_path = dtd
        self.__check_file(in_file,"file_to_parse")
        self.__char_data = char_data
-        self.__debug_dir = deb_dir #self.__debug_dir = debug calibre
+        self.__debug_dir = deb_dir
        self.__check_dir(self.__temp_dir)
        self.__copy = self.__check_dir(self.__debug_dir)
        self.__convert_caps = convert_caps
@@ -155,25 +155,24 @@ class ParseRtf:
        if hasattr(the_file, 'read'): return
        if the_file == None:
            if type == "file_to_parse":
-                message = "You must provide a file for the script to work"
-            msg = message
+                msg = "\nYou must provide a file for the script to work"
            raise RtfInvalidCodeException, msg
        elif os.path.exists(the_file):
            pass # do nothing
        else:
-            message = "The file '%s' cannot be found" % the_file
-            msg = message
+            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException, msg
+
    def __check_dir(self, the_dir):
        """Check to see if directory exists"""
        if not the_dir :
            return
        dir_exists = os.path.isdir(the_dir)
        if not dir_exists:
-            message = "%s is not a directory" % the_dir
-            msg = message
+            msg = "\n%s is not a directory" % the_dir
            raise RtfInvalidCodeException, msg
        return 1
+
    def parse_rtf(self):
        """
        Parse the file by calling on other classes.
@@ -194,13 +193,14 @@ class ParseRtf:
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
-        # new as of 2005-08-02. Do I want this?
+        # Function to check if bracket are well handled
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets\
            (file = self.__temp_file,
                bug_handler = RtfInvalidCodeException,
                    )
-        # convert Macintosh line endings to Unix line endings
+        #convert Macintosh and Windows line endings to Unix line endings
+        #why do this if you don't wb after?
        line_obj = line_endings.FixLineEndings(
                in_file = self.__temp_file,
                bug_handler = RtfInvalidCodeException,
@@ -208,13 +208,13 @@ class ParseRtf:
                run_level = self.__run_level,
                replace_illegals = self.__replace_illegals,
                )
-        return_value = line_obj.fix_endings()
+        return_value = line_obj.fix_endings() #calibre return what?
        self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(
                bug_handler = RtfInvalidCodeException,
                in_file = self.__temp_file,
                copy = self.__copy,
-                run_level = self.__run_level,)
+                run_level = self.__run_level)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file = self.__temp_file,
@@ -230,12 +230,25 @@ class ParseRtf:
                os.remove(self.__temp_file)
            except OSError:
                pass
+            #Check to see if the file is correctly encoded
+            encode_obj = default_encoding.DefaultEncoding(
+            in_file = self.__temp_file,
+            run_level = self.__run_level,
+            bug_handler = RtfInvalidCodeException,
+            check_raw = True,
+            )
+            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
-                bug_handler = RtfInvalidCodeException,
-                    )
-            check_encoding_obj.check_encoding(self.__file)
-            sys.stderr.write('File "%s" does not appear to be RTF.\n' % self.__file if isinstance(self.__file, str) else self.__file.encode('utf-8'))
-            raise InvalidRtfException, msg
+                    bug_handler = RtfInvalidCodeException,
+                        )
+            enc = encode_obj.get_codepage()
+            if enc != 'mac_roman':
+                enc = 'cp' + enc
+            if check_encoding_obj.check_encoding(self.__file, enc):
+                file_name = self.__file if isinstance(self.__file, str) \
+                                    else self.__file.encode('utf-8')
+                msg = 'File %s does not appear to be correctly encoded.\n' % file_name
+                raise InvalidRtfException, msg
        delete_info_obj = delete_info.DeleteInfo(
            in_file = self.__temp_file,
            copy = self.__copy,
@@ -508,6 +521,7 @@ class ParseRtf:
                indent = self.__indent,
                run_level = self.__run_level,
                no_dtd = self.__no_dtd,
+                encoding = encode_obj.get_codepage(),
                bug_handler = RtfInvalidCodeException,
                )
        tags_obj.convert_to_tags()
@@ -520,35 +534,28 @@ class ParseRtf:
        output_obj.output()
        os.remove(self.__temp_file)
        return self.__exit_level
+
    def __bracket_match(self, file_name):
        if self.__run_level > 2:
            good_br, msg =  self.__check_brack_obj.check_brackets()
            if good_br:
                pass
-                # sys.stderr.write( msg + ' in ' + file_name + "\n")
+                #sys.stderr.write( msg + ' in ' + file_name + "\n")
            else:
-                msg += msg +  " in file '" + file_name + "'\n"
+                msg = '%s in file %s\n' % (msg, file_name)
                raise RtfInvalidCodeException, msg
+
    def __return_code(self, num):
-        if num == None:
-            return
-        if int(num) > self.__exit_level:
-            self.__exit_level = num
+        # Remember the highest return code reported by any processing step;
+        # steps that return None do not affect the exit level.
+        if num is None:
+            return
+        if int(num) > self.__exit_level:
+            self.__exit_level = num
+
+
    def __make_temp_file(self,file):
        """Make a temporary file to parse"""
        write_file="rtf_write_file"
-        read_obj = file if hasattr(file, 'read') else open(file,'r')
-        write_obj = open(write_file, 'w')
-        line = "dummy"
-        while line:
-            line = read_obj.read(1000)
-            write_obj.write(line )
-        write_obj.close()
+        # file is either an object with a read() method or a path. Only a
+        # file opened here is closed here; a stream supplied by the caller
+        # is left open for the caller to manage.
+        opened_here = not hasattr(file, 'read')
+        read_obj = open(file, 'r') if opened_here else file
+        try:
+            with open(write_file, 'wb') as write_obj:
+                for line in read_obj:
+                    write_obj.write(line)
+        finally:
+            if opened_here:
+                read_obj.close()
        return write_file
-    """
-mi<tg<open______<style-sheet\n
-mi<tg<close_____<style-sheet\n
-mi<tg<open-att__<footnote<num>1\n
-mi<tg<empty-att_<page-definition<margin>33\n
-mi<tg<empty_____<para\n
-"""
@@ -24,38 +24,38 @@ class CheckBrackets:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__open_bracket_num = []
+
    def open_brack(self, line):
        num = line[-5:-1]
        self.__open_bracket_num.append(num)
        self.__bracket_count += 1
+
    def close_brack(self, line):
        num = line[-5:-1]
-        ##self.__open_bracket_num.append(num)
        try:
            last_num = self.__open_bracket_num.pop()
        except:
-            return 0
+            return False
        if num != last_num:
-            return 0
+            return False
        self.__bracket_count -= 1
-        return 1
+        return True
+
    def check_brackets(self):
-        read_obj = open(self.__file, 'r')
-        line = 'dummy'
        line_count = 0
-        while line:
-            line_count += 1
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.open_brack(line)
-            if self.__token_info == 'cb<nu<clos-brack':
-                right_count = self.close_brack(line)
-                if not right_count:
-                    return (0, "closed bracket doesn't match, line %s" % line_count)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                line_count += 1
+                self.__token_info = line[:16]
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.open_brack(line)
+                if self.__token_info == 'cb<nu<clos-brack':
+                    if not self.close_brack(line):
+                        return (False, "closed bracket doesn't match, line %s" % line_count)
+
        if self.__bracket_count != 0:
-            msg = 'At end of file open and closed brackets don\'t match\n'
-            msg = msg + 'total number of brackets is %s' % self.__bracket_count
-            return (0, msg)
-        return (1, "brackets match!")
+            msg = ('At end of file open and closed brackets don\'t match\n' \
+                        'total number of brackets is %s') % self.__bracket_count
+            return (False, msg)
+        return (True, "Brackets match!")
+
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 import sys
+
 class CheckEncoding:
+
    def __init__(self, bug_handler):
        self.__bug_handler = bug_handler
+
    def __get_position_error(self, line, encoding, line_num):
        char_position = 0
        for char in line:
@@ -12,21 +15,23 @@ class CheckEncoding:
            except UnicodeError, msg:
                sys.stderr.write('line: %s char: %s\n' %  (line_num, char_position))
                sys.stderr.write(str(msg) + '\n')
-    def check_encoding(self, path, encoding='us-ascii'):
-        read_obj = open(path, 'r')
-        line_to_read = 1
+
+    def check_encoding(self, path, encoding='us-ascii', verbose=True):
        line_num = 0
-        while line_to_read:
-            line_num += 1
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            try:
-                line.decode(encoding)
-            except UnicodeError:
-                if len(line) < 1000:
-                    self.__get_position_error(line, encoding, line_num)
-                else:
-                    sys.stderr.write('line: %d has bad encoding\n'%line_num)
+        with open(path, 'r') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                try:
+                    line.decode(encoding)
+                except UnicodeError:
+                    if verbose:
+                        if len(line) < 1000:
+                            self.__get_position_error(line, encoding, line_num)
+                        else:
+                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
+                    return True
+        return False
+
 if __name__ == '__main__':
    check_encoding_obj = CheckEncoding()
    check_encoding_obj.check_encoding(sys.argv[1])
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class CombineBorders:
    """Combine borders in RTF tokens to make later processing easier"""
    def __init__(self,
@@ -32,28 +34,31 @@ class CombineBorders:
        self.__state = 'default'
        self.__bord_pos = 'default'
        self.__bord_att = []
+
    def found_bd(self, line):
        #cw<bd<bor-t-r-vi
        self.__state = 'border'
        self.__bord_pos = line[6:16]
+
    def __default_func(self, line):
        #cw<bd<bor-t-r-vi
        if self.__first_five == 'cw<bd':
            self.found_bd(line)
            return ''
        return line
+
    def end_border(self, line, write_obj):
-        joiner = "|"
-        border_string = joiner.join(self.__bord_att)
+        border_string = "|".join(self.__bord_att)
        self.__bord_att = []
        write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
-        border_string))
+                                                border_string))
        self.__state = 'default'
        self.__bord_string = ''
        if self.__first_five == 'cw<bd':
            self. found_bd(line)
        else:
            write_obj.write(line)
+
    def add_to_border_desc(self, line):
        #cw<bt<bdr-hair__<nu<true
        #cw<bt<bdr-linew<nu<0.50
@@ -65,26 +70,22 @@ class CombineBorders:
        else:
            num = ':' + num
        self.__bord_att.append(border_desc + num)
+
    def __border_func(self, line, write_obj):
        if self.__first_five != 'cw<bt':
            self.end_border(line, write_obj)
        else:
            self.add_to_border_desc(line)
+
    def combine_borders(self):
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 'dummy'
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__first_five = line[0:5]
-            if self.__state == 'border':
-                self.__border_func(line, write_obj)
-            else:
-                to_print = self.__default_func(line)
-                write_obj.write(to_print)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    self.__first_five = line[0:5]
+                    if self.__state == 'border':
+                        self.__border_func(line, write_obj)
+                    else:
+                        write_obj.write(self.__default_func(line))
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "combine_borders.data")
@@ -1,6 +1,9 @@
-import os, tempfile
-from calibre.ebooks.rtf2xml import copy
+import os, tempfile, sys
+
+from calibre.ebooks.rtf2xml import copy, check_encoding
+
 public_dtd = 'rtf2xml1.0.dtd'
+
 class ConvertToTags:
    """
    Convert file to XML
@@ -10,6 +13,7 @@ class ConvertToTags:
            bug_handler,
            dtd_path,
            no_dtd,
+            encoding,
            indent = None,
            copy = None,
            run_level = 1,
@@ -29,9 +33,14 @@ class ConvertToTags:
        self.__copy = copy
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
+        if encoding != 'mac_roman':
+            self.__encoding = 'cp' + encoding
+        else:
+            self.__encoding = 'mac_roman'
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def __initiate_values(self):
        """
        Set values, including those for the dictionary.
@@ -61,6 +70,7 @@ class ConvertToTags:
        'tx<ut<__________'  :   self.__text_func,
        'mi<tg<empty_____'  :   self.__empty_func,
        }
+
    def __open_func(self, line):
        """
        Print the opening tag and newlines when needed.
@@ -73,6 +83,7 @@ class ConvertToTags:
        if info in self.__two_new_line:
            self.__write_extra_new_line()
        self.__write_obj.write('<%s>' % info)
+
    def __empty_func(self, line):
        """
        Print out empty tag and newlines when needed.
@@ -85,10 +96,11 @@ class ConvertToTags:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __open_att_func(self, line):
        """
        Process lines for open tags that have attributes.
-        The important infor is between [17:-1]. Take this info and split it
+        The important info is between [17:-1]. Take this info and split it
        with the delimeter '<'. The first token in this group is the element
        name. The rest are attributes, separated fromt their values by '>'. So
        read each token one at a time, and split them by '>'.
@@ -119,6 +131,7 @@ class ConvertToTags:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __empty_att_func(self, line):
        """
        Same as the __open_att_func, except a '/' is placed at the end of the tag.
@@ -143,6 +156,7 @@ class ConvertToTags:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __close_func(self, line):
        """
        Print out the closed tag and new lines, if appropriate.
@@ -156,6 +170,7 @@ class ConvertToTags:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
+
    def __text_func(self, line):
        """
        Simply print out the information between [17:-1]
@@ -163,6 +178,7 @@ class ConvertToTags:
        #tx<nu<__________<Normal;
        # change this!
        self.__write_obj.write(line[17:-1])
+
    def __write_extra_new_line(self):
        """
        Print out extra new lines if the new lines have not exceeded two. If
@@ -172,8 +188,10 @@ class ConvertToTags:
            return
        if self.__new_line < 2:
            self.__write_obj.write('\n')
+
    def __default_func(self, line):
        pass
+
    def __write_new_line(self):
        """
        Print out a new line if a new line has not already been printed out.
@@ -183,11 +201,23 @@ class ConvertToTags:
        if not self.__new_line:
            self.__write_obj.write('\n')
            self.__new_line += 1
+
    def __write_dec(self):
        """
        Write the XML declaration at the top of the document.
        """
-        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        #keep maximum compatibility with previous version
+        check_encoding_obj = check_encoding.CheckEncoding(
+                    bug_handler=self.__bug_handler)
+
+        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding):
+            self.__write_obj.write('<?xml version="1.0" encoding="%s" ?>' % self.__encoding)
+        else:
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
+                    ' hope for the best')
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
@@ -207,6 +237,7 @@ class ConvertToTags:
            )
        self.__new_line = 0
        self.__write_new_line()
+
    def convert_to_tags(self):
        """
        Read in the file one line at a time. Get the important info, between
@@ -222,18 +253,14 @@ class ConvertToTags:
            an empty tag function.
            """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
        self.__write_obj = open(self.__write_to, 'w')
        self.__write_dec()
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__state_dict.get(self.__token_info)
-            if action != None:
-                action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__state_dict.get(self.__token_info)
+                if action is not None:
+                    action(line)
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
@@ -23,6 +23,7 @@ class Copy:
    def __init__(self, bug_handler, file = None, deb_dir = None, ):
        self.__file = file
        self.__bug_handler = bug_handler
+
    def set_dir(self, deb_dir):
        """Set the temporary directory to write files to"""
        if deb_dir is None:
@@ -33,19 +34,11 @@ class Copy:
            message = "%(deb_dir)s is not a directory" % vars()
            raise self.__bug_handler , message
        Copy.__dir = deb_dir
+
    def remove_files(self ):
        """Remove files from directory"""
        self.__remove_the_files(Copy.__dir)
-        """
-        list_of_files = os.listdir(Copy.__dir)
-        list_of_files = os.listdir(the_dir)
-        for file in list_of_files:
-            rem_file = os.path.join(Copy.__dir,file)
-            if os.path.isdir(rem_file):
-                self.remove_files(rem_file)
-            else:
-                os.remove(rem_file)
-        """
+
    def __remove_the_files(self, the_dir):
        """Remove files from directory"""
        list_of_files = os.listdir(the_dir)
@@ -58,6 +51,7 @@ class Copy:
                    os.remove(rem_file)
                except OSError:
                    pass
+
    def copy_file(self, file, new_file):
        """
        Copy the file to a new name
@@ -1,61 +1,142 @@
 #########################################################################
 #                                                                       #
-#                                                                       #
 #   copyright 2002 Paul Henry Tremblay                                  #
 #                                                                       #
-#   This program is distributed in the hope that it will be useful,     #
-#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
-#   General Public License for more details.                            #
-#                                                                       #
-#   You should have received a copy of the GNU General Public License   #
-#   along with this program; if not, write to the Free Software         #
-#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
-#   02111-1307 USA                                                      #
-#                                                                       #
-#                                                                       #
 #########################################################################
+
+'''
+Codepages as listed in the RTF 1.9.1 specification:
+    437	United States IBM
+    708	Arabic (ASMO 708)
+    709	Arabic (ASMO 449+, BCON V4)
+    710	Arabic (transparent Arabic)
+    711	Arabic (Nafitha Enhanced)
+    720	Arabic (transparent ASMO)
+    819	Windows 3.1 (United States and Western Europe)
+    850	IBM multilingual
+    852	Eastern European
+    860	Portuguese
+    862	Hebrew
+    863	French Canadian
+    864	Arabic
+    865	Norwegian
+    866	Soviet Union
+    874	Thai
+    932	Japanese
+    936	Simplified Chinese
+    949	Korean
+    950	Traditional Chinese
+    1250	Eastern European
+    1251	Cyrillic
+    1252	Western European
+    1253	Greek
+    1254	Turkish
+    1255	Hebrew
+    1256	Arabic
+    1257	Baltic
+    1258	Vietnamese
+    1361	Johab
+    10000	MAC Roman
+    10001	MAC Japan
+    10004	MAC Arabic
+    10005	MAC Hebrew
+    10006	MAC Greek
+    10007	MAC Cyrillic
+    10029	MAC Latin2
+    10081	MAC Turkish
+    57002	Devanagari
+    57003	Bengali
+    57004	Tamil
+    57005	Telugu
+    57006	Assamese
+    57007	Oriya
+    57008	Kannada
+    57009	Malayalam
+    57010	Gujarati
+    57011	Punjabi
+'''
+import re
+
 class DefaultEncoding:
    """
    Find the default encoding for the doc
    """
-    def __init__(self, in_file, bug_handler, run_level = 1,):
-        """
-        Required:
-            'file'
-        Returns:
-            nothing
-            """
+    def __init__(self, in_file, bug_handler, run_level = 1, check_raw = False):
+        self.__file = in_file  # path of the file that _encoding() scans
+        self.__bug_handler = bug_handler
+        self.__platform = 'Windows'  # kept unless a platform token is found
+        self.__default_num = 'not-defined'  # kept unless a default font token is found
+        self.__code_page = '1252'  # kept unless the file declares a code page
+        self.__datafetched = False  # set to True once _encoding() has run
+        self.__fetchraw = check_raw  # True: regex-scan raw RTF; False: read token lines
+
     def find_default_encoding(self):
-        platform = 'Windows'
-        default_num = 'not-defined'
-        code_page = 'ansicpg1252'
-        read_obj = open(self.__file, 'r')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'mi<mk<rtfhed-end':
-                break
-            if self.__token_info == 'cw<ri<ansi-codpg':
-                #cw<ri<ansi-codpg<nu<10000
-                num = line[20:-1]
-                if not num:
-                    num = '1252'
-                code_page = 'ansicpg' + num
-            if self.__token_info == 'cw<ri<macintosh_':
-                platform = 'Macintosh'
-            if self.__token_info == 'cw<ri<deflt-font':
-                default_num = line[20:-1]
-                #cw<ri<deflt-font<nu<0
-            #action = self.__state_dict.get(self.__state)
-            #if action == None:
-                #print self.__state
-            #action(line)
-        read_obj.close()
-        if platform == 'Macintosh':
-            code_page = 'mac_roman'
-        return platform, code_page, default_num
+        if not self.__datafetched:  # scan the file only on the first call
+            self._encoding()
+            self.__datafetched = True
+        if self.__platform == 'Macintosh':
+            code_page = self.__code_page  # returned as stored, without a prefix
+        else:
+            code_page = 'ansicpg' + self.__code_page  # e.g. 'ansicpg1252'
+        return self.__platform, code_page, self.__default_num
+
+    def get_codepage(self):
+        if not self.__datafetched:  # scan the file only on the first call
+            self._encoding()
+            self.__datafetched = True
+        return self.__code_page  # bare value, no 'ansicpg' prefix (default '1252')
+
+    def get_platform(self):
+        if not self.__datafetched:  # scan the file only on the first call
+            self._encoding()
+            self.__datafetched = True
+        return self.__platform  # 'Windows' unless _encoding() found a platform token
+
+    def _encoding(self):
+        """Read self.__file and store the code page it declares; tokenized
+        input (check_raw false) also yields platform and default font."""
+        with open(self.__file, 'r') as read_obj:
+            if not self.__fetchraw:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'mi<mk<rtfhed-end':
+                        break
+                    if self.__token_info == 'cw<ri<ansi-codpg':  # cw<ri<ansi-codpg<nu<10000
+                        num = line[20:-1]
+                        valid = num.isdigit() and int(num)  # '', 0 or junk -> default
+                        self.__code_page = num if valid else '1252'
+                    if self.__token_info == 'cw<ri<macintosh_':
+                        self.__platform = 'Macintosh'
+                        self.__code_page = 'mac_roman'
+                    elif self.__token_info == 'cw<ri<pc________':
+                        self.__platform = 'IBMPC'
+                        self.__code_page = '437'
+                    elif self.__token_info == 'cw<ri<pca_______':
+                        self.__platform = 'OS/2'
+                        self.__code_page = '850'
+                    if self.__token_info == 'cw<ri<deflt-font':  # cw<ri<deflt-font<nu<0
+                        self.__default_num = line[20:-1]
+            else:
+                # raw RTF: only the code page is detected, platform stays as it was
+                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
+                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+                charset_cp = {'mac': 'mac_roman', 'pc': '437', 'pca': '850'}
+                for line in read_obj:
+                    cp_match = fenccp.search(line)
+                    if cp_match:
+                        # a non-zero \ansicpgN is the declared code page; stop either way
+                        if int(cp_match.group(1)):
+                            self.__code_page = cp_match.group(1)
+                        break
+                    enc_match = fenc.search(line)
+                    if enc_match:
+                        enc = enc_match.group(1)  # 'ansi' keeps the current code page
+                        self.__code_page = charset_cp.get(enc, self.__code_page)
+
+# if __name__ == '__main__':
+    # encode_obj = DefaultEncoding(
+            # in_file = sys.argv[1],
+            # bug_handler = Exception,
+            # check_raw = True,
+            # )
+    # print encode_obj.get_codepage()
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class DeleteInfo:
    """Delelet unecessary destination groups"""
    def __init__(self,
@@ -29,17 +31,18 @@ class DeleteInfo:
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
-        self.__bracket_count=0
+        self.__bracket_count= 0
        self.__ob_count = 0
        self.__cb_count = 0
-        self.__after_asterisk = 0
-        self.__delete = 0
+        # self.__after_asterisk = False
+        # self.__delete = 0
        self.__initiate_allow()
        self.__ob = 0
-        self.__write_cb = 0
+        self.__write_cb = False
        self.__run_level = run_level
-        self.__found_delete = 0
-        self.__list = 0
+        self.__found_delete = False
+        # self.__list = False
+
    def __initiate_allow(self):
        """
        Initiate a list of destination groups which should be printed out.
@@ -66,9 +69,10 @@ class DeleteInfo:
        self.__state_dict = {
            'default'           : self.__default_func,
            'after_asterisk'    : self.__asterisk_func,
-            'delete'           : self.__delete_func,
+            'delete'            : self.__delete_func,
            'list'              : self.__list_func,
        }
+
    def __default_func(self,line):
        """Handle lines when in no special state. Look for an asterisk to
        begin a special state. Otherwise, print out line."""
@@ -81,27 +85,29 @@ class DeleteInfo:
            if self.__ob:
                self.__write_obj.write(self.__ob)
            self.__ob = line
-            return 0
+            return False
        else:
            # write previous bracket, since didn't fine asterisk
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
-            return 1
+            return True
+
    def __delete_func(self,line):
        """Handle lines when in delete state. Don't print out lines
        unless the state has ended."""
        if self.__delete_count == self.__cb_count:
            self.__state = 'default'
            if self.__write_cb:
-                self.__write_cb = 0
-                return 1
-            return 0
+                self.__write_cb = True
+                return True
+            return False
+
    def __asterisk_func(self,line):
        """
        Determine whether to delete info in group
        Note on self.__cb flag.
-        If you find that you are in a delete group, and the preivous
+        If you find that you are in a delete group, and the previous
        token in not an open bracket (self.__ob = 0), that means
        that the delete group is nested inside another acceptable
        detination group. In this case, you have alrady written
@@ -110,21 +116,21 @@ class DeleteInfo:
        """
        # Test for {\*}, in which case don't enter
        # delete state
-        self.__after_asterisk = 0 # only enter this function once
-        self.__found_delete = 1
+        # self.__after_asterisk = False # only enter this function once
+        self.__found_delete = True
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
                self.__state = 'default'
                self.__ob = 0
                # changed this because haven't printed out start
-                return 0
+                return False
            else:
                # not sure what happens here!
                # believe I have a '{\*}
                if self.__run_level > 3:
                    msg = 'flag problem\n'
                    raise self.__bug_handler, msg
-                return 1
+                return True
        elif self.__token_info in self.__allowable :
            if self.__ob:
                self.__write_obj.write(self.__ob)
@@ -132,85 +138,81 @@ class DeleteInfo:
                self.__state = 'default'
            else:
                pass
-            return 1
+            return True
        elif self.__token_info == 'cw<ls<list______':
            self.__ob = 0
            self.__found_list_func(line)
        elif self.__token_info in self.__not_allowable:
            if not self.__ob:
-                self.__write_cb = 1
+                self.__write_cb = True
            self.__ob = 0
            self.__state = 'delete'
            self.__cb_count = 0
-            return 0
+            return False
        else:
            if self.__run_level > 5:
-                msg = 'After an asterisk, and found neither an allowable or non-allowble token\n'
-                msg += 'token is "%s"\n' % self.__token_info
-                raise self.__bug_handler
+                msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
+                            token is "%s"\n') % self.__token_info
+                raise self.__bug_handler, msg
            if not self.__ob:
-                self.__write_cb = 1
+                self.__write_cb = True
            self.__ob = 0
            self.__state = 'delete'
            self.__cb_count = 0
-            return 0
+            return False
+
    def __found_list_func(self, line):
        """
        print out control words in this group
        """
        self.__state = 'list'
+
    def __list_func(self, line):
        """
        Check to see if the group has ended.
-        Return 1 for all control words.
-        Return 0 otherwise.
+        Return True for all control words.
+        Return False otherwise.
        """
        if self.__delete_count == self.__cb_count and self.__token_info ==\
            'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
-                self.__write_cb = 0
-                return 1
-            return 0
+                self.__write_cb = False
+                return True
+            return False
        elif line[0:2] == 'cw':
-            return 1
+            return True
        else:
-            return 0
+            return False
+
    def delete_info(self):
        """Main method for handling other methods. Read one line in at
-        a time, and determine wheter to print the line based on the state."""
-        line_to_read = 'dummy'
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        while line_to_read:
-            #ob<nu<open-brack<0001
-            to_print =1
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            action = self.__state_dict.get(self.__state)
-            if not action:
-                sys.stderr.write('No action in dictionary state is "%s" \n'
-                        % self.__state)
-            to_print = action(line)
-            """
-            if self.__after_asterisk:
-                to_print = self.__asterisk_func(line)
-            elif self.__list:
-                self.__in_list_func(line)
-            elif self.__delete:
-                to_print = self.__delete_func(line)
-            else:
-                to_print = self.__default_func(line)
-            """
-            if to_print:
-                self.__write_obj.write(line)
-        self.__write_obj.close()
-        read_obj.close()
+        a time, and determine whether to print the line based on the state."""
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    #ob<nu<open-brack<0001
+                    to_print = True
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if not action:
+                        sys.stderr.write(_('No action in dictionary state is "%s" \n')
+                                % self.__state)
+                    to_print = action(line)
+                    # if self.__after_asterisk:
+                        # to_print = self.__asterisk_func(line)
+                    # elif self.__list:
+                        # self.__in_list_func(line)
+                    # elif self.__delete:
+                        # to_print = self.__delete_func(line)
+                    # else:
+                        # to_print = self.__default_func(line)
+                    if to_print:
+                        self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "delete_info.data")
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class Footnote:
    """
    Two public methods are available. The first separates all of the
@@ -35,6 +37,7 @@ class Footnote:
        self.__copy = copy
        self.__write_to = tempfile.mktemp()
        self.__found_a_footnote = 0
+
    def __first_line_func(self, line):
        """
        Print the tag info for footnotes.  Check whether footnote is an
@@ -47,6 +50,7 @@ class Footnote:
            self.__write_to_foot_obj.write(
            'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
        self.__first_line = 0
+
    def __in_footnote_func(self, line):
        """Handle all tokens that are part of footnote"""
        if self.__first_line:
@@ -68,6 +72,7 @@ class Footnote:
            'mi<mk<footnt-clo\n')
        else:
            self.__write_to_foot_obj.write(line)
+
    def __found_footnote(self, line):
        """ Found a footnote"""
        self.__found_a_footnote = 1
@@ -81,6 +86,7 @@ class Footnote:
        'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
        self.__write_to_foot_obj.write(
        'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
+
    def __default_sep(self, line):
        """Handle all tokens that are not footnote tokens"""
        if self.__token_info == 'cw<nt<footnote__':
@@ -91,6 +97,7 @@ class Footnote:
            self.__write_obj.write(
                'tx<nu<__________<%s\n' % num
            )
+
    def __initiate_sep_values(self):
        """
        initiate counters for separate_footnotes method.
@@ -102,6 +109,7 @@ class Footnote:
        self.__in_footnote = 0
        self.__first_line = 0 #have not processed the first line of footnote
        self.__footnote_count = 0
+
    def separate_footnotes(self):
        """
        Separate all the footnotes in an RTF file and put them at the bottom,
@@ -111,58 +119,50 @@ class Footnote:
        bottom of the main file.
        """
        self.__initiate_sep_values()
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
        self.__footnote_holder = tempfile.mktemp()
-        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            # keep track of opening and closing brackets
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            # In the middle of footnote text
-            if self.__in_footnote:
-                self.__in_footnote_func(line)
-            # not in the middle of footnote text
-            else:
-                self.__default_sep(line)
-        self.__write_obj.close()
-        read_obj.close()
-        self.__write_to_foot_obj.close()
-        read_obj = open(self.__footnote_holder, 'r')
-        write_obj = open(self.__write_to, 'a')
-        write_obj.write(
-        'mi<mk<sect-close\n'
-        'mi<mk<body-close\n'
-        'mi<tg<close_____<section\n'
-        'mi<tg<close_____<body\n'
-        'mi<tg<close_____<doc\n'
-        'mi<mk<footnt-beg\n')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            write_obj.write(line)
-        write_obj.write(
-        'mi<mk<footnt-end\n')
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of footnote text
+                        if self.__in_footnote:
+                            self.__in_footnote_func(line)
+                        # not in the middle of footnote text
+                        else:
+                            self.__default_sep(line)
+        with open(self.__footnote_holder, 'r') as read_obj:
+            with open(self.__write_to, 'a') as write_obj:
+                write_obj.write(
+                    'mi<mk<sect-close\n'
+                    'mi<mk<body-close\n'
+                    'mi<tg<close_____<section\n'
+                    'mi<tg<close_____<body\n'
+                    'mi<tg<close_____<doc\n'
+                    'mi<mk<footnt-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<footnt-end\n')
        os.remove(self.__footnote_holder)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "footnote_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy
+
    def __get_foot_body_func(self, line):
        """
        Process lines in main body and look for beginning of footnotes.
@@ -172,6 +172,7 @@ class Footnote:
            self.__state = 'foot'
        else:
            self.__write_obj.write(line)
+
    def __get_foot_foot_func(self, line):
        """
        Copy footnotes from bottom of file to a separate, temporary file.
@@ -180,6 +181,7 @@ class Footnote:
            self.__state = 'body'
        else:
            self.__write_to_foot_obj.write(line)
+
    def __get_footnotes(self):
        """
        Private method to remove footnotes from main file.  Read one line from
@@ -188,21 +190,16 @@ class Footnote:
        These two functions do the work of separating the footnotes form the
        body.
        """
-        read_obj = open(self.__file)
-        self.__write_obj = open(self.__write_to, 'w')
-            # self.__write_to = "footnote_info.data"
-        self.__write_to_foot_obj = open(self.__footnote_holder, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            self.__token_info = line[:16]
-            if self.__state == 'body':
-                self.__get_foot_body_func(line)
-            elif self.__state == 'foot':
-                self.__get_foot_foot_func(line)
-        read_obj.close()
-        self.__write_obj.close()
-        self.__write_to_foot_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                with open(self.__footnote_holder, 'w') as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_foot_body_func(line)
+                        elif self.__state == 'foot':
+                            self.__get_foot_foot_func(line)
+
    def __get_foot_from_temp(self, num):
        """
        Private method for joining footnotes to body. This method reads from
@@ -213,9 +210,7 @@ class Footnote:
        look_for = 'mi<mk<footnt-ope<' + num + '\n'
        found_foot = 0
        string_to_return = ''
-        line = 1
-        while line:
-            line = self.__read_from_foot_obj.readline()
+        for line in self.__read_from_foot_obj:
            if found_foot:
                if line == 'mi<mk<footnt-clo\n':
                    return string_to_return
@@ -223,6 +218,7 @@ class Footnote:
            else:
                if line == look_for:
                    found_foot = 1
+
    def __join_from_temp(self):
        """
        Private method for rejoining footnotes to body.  Read from the
@@ -232,16 +228,14 @@ class Footnote:
        print out to the third file.
        If no footnote marker is found, simply print out the token (line).
        """
-        self.__read_from_foot_obj = open(self.__footnote_holder, 'r')
-        read_obj = open(self.__write_to, 'r')
-        self.__write_obj = open(self.__write_to2, 'w')
-        line = 1
-        while line:
-            line = read_obj.readline()
-            if line[:16] == 'mi<mk<footnt-ind':
-                line = self.__get_foot_from_temp(line[17:-1])
-            self.__write_obj.write(line)
-        read_obj.close()
+        with open(self.__footnote_holder, 'r') as self.__read_from_foot_obj:
+            with open(self.__write_to, 'r') as read_obj:
+                with open(self.__write_to2, 'w') as self.__write_obj:
+                    for line in read_obj:
+                        if line[:16] == 'mi<mk<footnt-ind':
+                            line = self.__get_foot_from_temp(line[17:-1])
+                        self.__write_obj.write(line)
+
    def join_footnotes(self):
        """
        Join the footnotes from the bottom of the file and put them in their
@@ -258,8 +252,8 @@ class Footnote:
        self.__state = 'body'
        self.__get_footnotes()
        self.__join_from_temp()
-        self.__write_obj.close()
-        self.__read_from_foot_obj.close()
+        # self.__write_obj.close()
+        # self.__read_from_foot_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
@@ -43,27 +43,28 @@ class GetCharMap:
    def get_char_map(self, map):
        if map == 'ansicpg0':
            map = 'ansicpg1250'
-        found_map = 0
+        if map in ('ansicpg10000', '10000'):
+            map = 'mac_roman'
+        found_map = False
        map_dict = {}
        self.__char_file.seek(0)
-        for line in self.__char_file.readlines():
+        for line in self.__char_file:
            if not line.strip(): continue
            begin_element = '<%s>' % map;
            end_element = '</%s>' % map
            if not found_map:
                if begin_element in line:
-                    found_map = 1
+                    found_map = True
            else:
                if end_element in line:
                    break
                fields = line.split(':')
                fields[1].replace('\\colon', ':')
                map_dict[fields[1]] = fields[3]
-            
-        
+
+
        if not found_map:
-            msg = 'no map found\n'
-            msg += 'map is "%s"\n'%(map,)
+            msg = 'no map found\nmap is "%s"\n'%(map,)
            raise self.__bug_handler, msg
        return map_dict

@@ -54,10 +54,10 @@ class Hex2Utf8:
            'convert_to_caps'--wether to convert caps to utf-8
        Returns:
            nothing
-            """
+        """
        self.__file = in_file
        self.__copy = copy
-        if area_to_convert != 'preamble' and area_to_convert != 'body':
+        if area_to_convert not in ('preamble', 'body'):
            msg = (
            'Developer error! Wrong flag.\n'
            'in module "hex_2_utf8.py\n'
@@ -79,7 +79,8 @@ class Hex2Utf8:
        self.__write_to = tempfile.mktemp()
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
-    def update_values(  self,
+
+    def update_values(self,
                        file,
                        area_to_convert,
                        char_file,
@@ -132,6 +133,7 @@ class Hex2Utf8:
        # self.__convert_symbol = 0
        # self.__convert_wingdings = 0
        # self.__convert_zapf = 0
+
    def __initiate_values(self):
        """
        Required:
@@ -191,6 +193,7 @@ class Hex2Utf8:
            'body'          :       self.__body_func,
            'mi<mk<body-open_'  :   self.__found_body_func,
            'tx<hx<__________'  :   self.__hex_text_func,
+            # 'tx<nu<__________'  :   self.__text_func,
            }
        self.__body_state_dict = {
            'preamble'      :       self.__preamble_for_body_func,
@@ -209,6 +212,7 @@ class Hex2Utf8:
        }
        self.__caps_list = ['false']
        self.__font_list = ['not-defined']
+
    def __hex_text_func(self, line):
        """
        Required:
@@ -218,12 +222,12 @@ class Hex2Utf8:
            token is in the dictionary, then check if the value starts with a
            "&". If it does, then tag the result as utf text. Otherwise, tag it
            as normal text.
-            If the nex_num is not in the dictionary, then a mistake has been
+            If the hex_num is not in the dictionary, then a mistake has been
            made.
            """
        hex_num = line[17:-1]
        converted = self.__current_dict.get(hex_num)
-        if converted != None:
+        if converted is not None:
            # tag as utf-8
            if converted[0:1] == "&":
                font = self.__current_dict_name
@@ -263,42 +267,43 @@ class Hex2Utf8:
                    # msg += 'dictionary is %s\n' % self.__current_dict_name
                    msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                    raise self.__bug_handler, msg
+
    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)
+
    def __body_func(self, line):
        """
        When parsing preamble
        """
        self.__write_obj.write(line)
+
    def __preamble_func(self, line):
        action = self.__preamble_state_dict.get(self.__token_info)
-        if action != None:
+        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __convert_preamble(self):
        self.__state = 'preamble'
-        read_obj = open(self.__file, 'r')
        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__preamble_state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('error no state found in hex_2_utf8',
-                self.__state
-                )
-            action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+           for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__preamble_state_dict.get(self.__state)
+                if action is None:
+                    sys.stderr.write(_('error no state found in hex_2_utf8'),
+                    self.__state
+                    )
+                action(line)
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def __preamble_for_body_func(self, line):
        """
        Required:
@@ -311,6 +316,7 @@ class Hex2Utf8:
        if self.__token_info == 'mi<mk<body-open_':
            self.__found_body_func(line)
        self.__write_obj.write(line)
+
    def __body_for_body_func(self, line):
        """
        Required:
@@ -321,10 +327,11 @@ class Hex2Utf8:
            Used when parsing the body.
        """
        action = self.__in_body_dict.get(self.__token_info)
-        if action != None:
+        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)
+
    def __start_font_func(self, line):
        """
        Required:
@@ -348,6 +355,7 @@ class Hex2Utf8:
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict
+
    def __end_font_func(self, line):
        """
        Required:
@@ -376,6 +384,7 @@ class Hex2Utf8:
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict
+
    def __start_special_font_func_old(self, line):
        """
        Required:
@@ -398,6 +407,7 @@ class Hex2Utf8:
            self.__current_dict.append(self.__dingbats_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Zapf Dingbats'
+
    def __end_special_font_func(self, line):
        """
        Required:
@@ -416,6 +426,7 @@ class Hex2Utf8:
            self.__current_dict.pop()
            self.__special_fonts_found -= 1
            self.__dict_name = 'default'
+
    def __start_caps_func_old(self, line):
        """
        Required:
@@ -427,6 +438,7 @@ class Hex2Utf8:
            self.__in_caps to 1
        """
        self.__in_caps = 1
+
    def __start_caps_func(self, line):
        """
        Required:
@@ -440,6 +452,7 @@ class Hex2Utf8:
        self.__in_caps = 1
        value = line[17:-1]
        self.__caps_list.append(value)
+
    def __end_caps_func(self, line):
        """
        Required:
@@ -455,7 +468,8 @@ class Hex2Utf8:
        else:
            sys.stderr.write('Module is hex_2_utf8\n')
            sys.stderr.write('method is __end_caps_func\n')
-            sys.stderr.write('caps list should be more than one?\n')
+            sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set
+
    def __text_func(self, line):
        """
        Required:
@@ -466,9 +480,8 @@ class Hex2Utf8:
            if in caps, convert. Otherwise, print out.
        """
        text = line[17:-1]
-        if self.__current_dict_name == 'Symbol'\
-          or self.__current_dict_name == 'Wingdings'\
-          or self.__current_dict_name == 'Zapf Dingbats':
+        # print line
+        if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
            the_string = ''
            for letter in text:
                hex_num = hex(ord(letter))
@@ -477,21 +490,21 @@ class Hex2Utf8:
                hex_num = hex_num[2:]
                hex_num = '\'%s' % hex_num
                converted = self.__current_dict.get(hex_num)
-                if converted == None:
+                if converted is None:
                    sys.stderr.write('module is hex_2_ut8\n')
                    sys.stderr.write('method is __text_func\n')
                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
                else:
                    the_string += converted
            self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
+            # print the_string
        else:
            if self.__caps_list[-1] == 'true' \
                and self.__convert_caps\
-                and self.__current_dict_name != 'Symbol'\
-                and self.__current_dict_name != 'Wingdings'\
-                and self.__current_dict_name != 'Zapf Dingbats':
+                and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
                text = text.upper()
            self.__write_obj.write('tx<nu<__________<%s\n' % text)
+
    def __utf_to_caps_func(self, line):
        """
        Required:
@@ -506,6 +519,7 @@ class Hex2Utf8:
            # utf_text = utf_text.upper()
            utf_text = self.__utf_token_to_caps_func(utf_text)
        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
+
    def __utf_token_to_caps_func(self, char_entity):
        """
        Required:
@@ -530,28 +544,26 @@ class Hex2Utf8:
            return char_entity
        else:
            return converted
+
    def __convert_body(self):
        self.__state = 'body'
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            action = self.__body_state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('error no state found in hex_2_utf8',
-                self.__state
-                )
-            action(line)
-        read_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            self.__write_obj = open(self.__write_to, 'w')
+            for line in read_obj:
+                self.__token_info = line[:16]
+                action = self.__body_state_dict.get(self.__state)
+                if action is None:
+                    sys.stderr.write('error no state found in hex_2_utf8',
+                    self.__state
+                    )
+                action(line)
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
    def convert_hex_2_utf8(self):
        self.__initiate_values()
        if self.__area_to_convert == 'preamble':
@@ -1,5 +1,7 @@
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 """
 States.
 1. default
@@ -36,6 +38,7 @@ class Inline:
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def __initiate_values(self):
        """
        Initiate all values.
@@ -51,7 +54,6 @@ class Inline:
            'tx<ut<__________'  :       self.__found_text_func,
            'mi<mk<inline-fld'  :       self.__found_text_func,
            'text'              :       self.__found_text_func,
-            'cw<nu<hard-lineb'  :       self.__found_text_func, #calibre
            'cb<nu<clos-brack'  :       self.__close_bracket_func,
            'mi<mk<par-end___'  :       self.__end_para_func,
            'mi<mk<footnt-ope'  :       self.__end_para_func,
@@ -63,7 +65,6 @@ class Inline:
            'tx<hx<__________'  :       self.__found_text_func,
            'tx<ut<__________'  :       self.__found_text_func,
            'text'              :       self.__found_text_func,
-            'cw<nu<hard-lineb'  :       self.__found_text_func, #calibre
            'mi<mk<inline-fld'  :       self.__found_text_func,
            'ob<nu<open-brack':         self.__found_open_bracket_func,
            'mi<mk<par-end___'  :       self.__end_para_func,
@@ -83,12 +84,12 @@ class Inline:
        self.__in_para = 0 #  not in paragraph
        self.__char_dict = {
            # character info => ci
-            'annotation'    :       'annotation',
+            'annotation'    :   'annotation',
            'blue______'    :   'blue',
            'bold______'    :   'bold',
-            'caps______'    :       'caps',
-            'char-style'    :       'character-style',
-            'dbl-strike'    :    'double-strike-through',
+            'caps______'    :   'caps',
+            'char-style'    :   'character-style',
+            'dbl-strike'    :   'double-strike-through',
            'emboss____'    :   'emboss',
            'engrave___'    :   'engrave',
            'font-color'    :   'font-color',
@@ -96,7 +97,7 @@ class Inline:
            'font-size_'    :   'font-size',
            'font-style'    :   'font-style',
            'font-up___'    :   'superscript',
-            'footnot-mk'    :       'footnote-marker',
+            'footnot-mk'    :   'footnote-marker',
            'green_____'    :   'green',
            'hidden____'    :   'hidden',
            'italics___'    :   'italics',
@@ -107,9 +108,10 @@ class Inline:
            'strike-thr'    :   'strike-through',
            'subscript_'    :   'subscript',
            'superscrip'    :   'superscript',
-            'underlined'    :       'underlined',
+            'underlined'    :   'underlined',
        }
        self.__caps_list = ['false']
+
    def __set_list_func(self, line):
        """
        Requires:
@@ -128,6 +130,7 @@ class Inline:
                self.__place = 'in_list'
                self.__inline_list = self.__list_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_list
+
    def __default_func(self, line):
        """
        Requires:
@@ -140,8 +143,8 @@ class Inline:
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
-        if self.__token_info != 'cw<nu<hard-lineb': #calibre
-            self.__write_obj.write(line)
+        self.__write_obj.write(line)
+
    def __found_open_bracket_func(self, line):
        """
        Requires:
@@ -156,6 +159,7 @@ class Inline:
        self.__groups_in_waiting[0] += 1
        self.__inline_list.append({})
        self.__inline_list[-1]['contains_inline'] = 0
+
    def __after_open_bracket_func(self, line):
        """
        Requires:
@@ -176,6 +180,7 @@ class Inline:
                self.__state = 'default' #  a non control word?
                action(line)
        self.__write_obj.write(line)
+
    def __handle_control_word(self, line):
        """
        Required:
@@ -206,6 +211,7 @@ class Inline:
                elif char_value == 'Zapf Dingbats':
                    self.__write_obj.write('mi<mk<font-dingb\n')
            """
+
    def __close_bracket_func(self, line):
        """
        Requires:
@@ -244,6 +250,7 @@ class Inline:
        self.__inline_list.pop()
        if self.__groups_in_waiting[0] != 0:
            self.__groups_in_waiting[0] -= 1
+
    def __found_text_func(self, line):
        """
        Required:
@@ -257,7 +264,6 @@ class Inline:
                Text can mark the start of a paragraph.
                If already in a paragraph, check to see if any groups are waiting
                to be added. If so, use another method to write these groups.
-            3. If not check if hardline break, then write
        """
        if self.__place == 'in_list':
            self.__write_inline()
@@ -265,12 +271,9 @@ class Inline:
            if not self.__in_para:
                self.__in_para = 1
                self.__start_para_func(line)
-            else:
-                if self.__token_info == 'cw<nu<hard-lineb': #calibre
-                    self.__write_obj.write('mi<tg<empty_____<hardline-break\n')
-                if self.__groups_in_waiting[0] != 0:
+            elif self.__groups_in_waiting[0] != 0:
                    self.__write_inline()
-                
+
    def __write_inline(self):
        """
        Required:
@@ -314,6 +317,7 @@ class Inline:
                            self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                    self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0
+
    def __end_para_func(self, line):
        """
        Requires:
@@ -342,6 +346,7 @@ class Inline:
                    self.__write_obj.write('mi<mk<caps-end__\n')
                self.__write_obj.write('mi<tg<close_____<inline\n')
        self.__in_para = 0
+
    def __start_para_func(self, line):
        """
        Requires:
@@ -369,12 +374,14 @@ class Inline:
                        self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0
+
    def __found_field_func(self, line):
        """
        Just a default function to make sure I don't prematurely exit
        default state
        """
        pass
+
    def form_tags(self):
        """
        Requires:
@@ -386,32 +393,27 @@ class Inline:
            the state.
        """
        self.__initiate_values()
-        read_obj = open(self.__file, 'r')
-        self.__write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            token = line[0:-1]
-            self.__token_info = ''
-            if token == 'tx<mc<__________<rdblquote'\
-                or token == 'tx<mc<__________<ldblquote'\
-                or token == 'tx<mc<__________<lquote'\
-                or token == 'tx<mc<__________<rquote'\
-                or token == 'tx<mc<__________<emdash'\
-                or token == 'tx<mc<__________<endash'\
-                or token == 'tx<mc<__________<bullet':
-                self.__token_info = 'text'
-            else:
-                self.__token_info = line[:16]
-            self.__set_list_func(line)
-            action = self.__state_dict.get(self.__state)
-            if action == None:
-                sys.stderr.write('No matching state in module inline_for_lists.py\n')
-                sys.stderr.write(self.__state + '\n')
-            action(line)
-        read_obj.close()
-        self.__write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as self.__write_obj:
+                for line in read_obj:
+                    token = line[0:-1]
+                    self.__token_info = ''
+                    if token == 'tx<mc<__________<rdblquote'\
+                        or token == 'tx<mc<__________<ldblquote'\
+                        or token == 'tx<mc<__________<lquote'\
+                        or token == 'tx<mc<__________<rquote'\
+                        or token == 'tx<mc<__________<emdash'\
+                        or token == 'tx<mc<__________<endash'\
+                        or token == 'tx<mc<__________<bullet':
+                        self.__token_info = 'text'
+                    else:
+                        self.__token_info = line[:16]
+                    self.__set_list_func(line)
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module inline_for_lists.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "inline.data")
@@ -15,8 +15,11 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import os, tempfile, re
+import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.cleantext import clean_ascii_chars
+
 class FixLineEndings:
    """Fix line endings"""
    def __init__(self,
@@ -32,36 +35,23 @@ class FixLineEndings:
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
        self.__replace_illegals = replace_illegals
+
    def fix_endings(self):
-        ##tempFileName = tempfile.mktemp()
-        illegal_regx = re.compile( '\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x0B|\x0E|\x0F|\x10|\x11|\x12|\x13')
-        #nums = [0, 1, 2, 3, 4, 5, 6, 7, 8,  11,  14, 15, 16, 17, 18, 19]
-        """
-read_obj = open(self.__file, 'r')
-line = read_obj.read(1000)
-regexp = re.compile(r"\r")
-macintosh = regexp.search(line)
-read_obj.close()
-        """
-        # always check since I have to get rid of illegal characters
-        macintosh = 1
-        if macintosh:
-            line = 1
-            read_obj = open(self.__file, 'r')
-            write_obj = open(self.__write_to, 'w')
-            while line:
-                line = read_obj.read(1000)
-                # line = re.sub(regexp,"\n",line)
-                line = line.replace ('\r', '\n')
-                if self.__replace_illegals:
-                    line = re.sub(illegal_regx, '', line)
-                    # for num in nums:
-                        # line = line.replace(chr(num), '')
-                write_obj.write(line )
-            read_obj.close()
-            write_obj.close()
-            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
-            if self.__copy:
-                copy_obj.copy_file(self.__write_to, "line_endings.data")
-            copy_obj.rename(self.__write_to, self.__file)
-            os.remove(self.__write_to)
+        #read
+        with open(self.__file, 'r') as read_obj:
+            input_file = read_obj.read()
+        #calibre: convert Windows (\r\n) and old Mac (\r) line endings to Unix (\n)
+        input_file = input_file.replace ('\r\n', '\n')
+        input_file = input_file.replace ('\r', '\n')
+        #remove invalid ASCII chars (0 to 8 and 11-14 to 24-26-27); done by clean_ascii_chars
+        if self.__replace_illegals:
+            input_file = clean_ascii_chars(input_file)
+        #write
+        with open(self.__write_to, 'wb') as write_obj:
+            write_obj.write(input_file)
+        #copy
+        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "line_endings.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
@@ -16,7 +16,9 @@
 #                                                                       #
 #########################################################################
 import sys, os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+
 class Pict:
    """Process graphic information"""
    def __init__(self,
@@ -36,13 +38,11 @@ class Pict:
        self.__ob_count = 0
        self.__cb_count = 0
        self.__pict_count = 0
-        self.__in_pict = 0
-        self.__already_found_pict = 0
+        self.__in_pict = False
+        self.__already_found_pict = False
        self.__orig_file = orig_file
        self.__initiate_pict_dict()
        self.__out_file = out_file
-        # this is left over
-        self.__no_ask = 1

    def __initiate_pict_dict(self):
        self.__pict_dict = {
@@ -71,57 +71,43 @@ class Pict:
                self.__out_file))
        else:
            dir_name = os.path.dirname(self.__orig_file)
-        # self.__output_to_file_func()
        self.__dir_name = base_name + "_rtf_pict_dir/"
        self.__dir_name = os.path.join(dir_name, self.__dir_name)
        if not os.path.isdir(self.__dir_name):
            try:
                os.mkdir(self.__dir_name)
            except OSError, msg:
-                msg = str(msg)
-                msg += "Couldn't make directory '%s':\n" % (self.__dir_name)
+                msg = "%sCouldn't make directory '%s':\n" % (str(msg), self.__dir_name)
                raise self.__bug_handler
        else:
-            if self.__no_ask:
-                user_response = 'r'
-            else:
-                msg = 'Do you want to remove all files in %s?\n' % self.__dir_name
-                msg += 'Type "r" to remove.\n'
-                msg +=  'Type any other key to keep files in place.\n'
-                sys.stderr.write(msg)
-                user_response = raw_input()
-            if user_response == 'r':
-                if self.__run_level > 1:
-                    sys.stderr.write('Removing files from old pict directory...\n')
-                all_files = os.listdir(self.__dir_name)
-                for the_file in all_files:
-                    the_file = os.path.join(self.__dir_name, the_file)
-                    try:
-                        os.remove(the_file)
-                    except OSError:
-                        pass
-                if self.__run_level > 1:
-                    sys.stderr.write('Files removed.\n')
+            if self.__run_level > 1:
+                sys.stderr.write('Removing files from old pict directory...\n')
+            all_files = os.listdir(self.__dir_name)
+            for the_file in all_files:
+                the_file = os.path.join(self.__dir_name, the_file)
+                try:
+                    os.remove(the_file)
+                except OSError:
+                    pass
+            if self.__run_level > 1:
+                sys.stderr.write('Files removed.\n')

    def __create_pict_file(self):
        """Create a file for all the pict data to be written to.
        """
        self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
-        write_pic_obj = open(self.__pict_file, 'w')
-        write_pic_obj.close()
        self.__write_pic_obj = open(self.__pict_file, 'a')

    def __in_pict_func(self, line):
        if self.__cb_count == self.__pict_br_count:
-            self.__in_pict = 0
+            self.__in_pict = False
            self.__write_pic_obj.write("}\n")
-            return 1
+            return True
        else:
            action = self.__pict_dict.get(self.__token_info)
            if action:
-                line = action(line)
-                self.__write_pic_obj.write(line)
-            return 0
+                self.__write_pic_obj.write(action(line))
+            return False

    def __default(self, line, write_obj):
        """Determine if each token marks the beginning of pict data.
@@ -142,53 +128,50 @@ class Pict:
            write_obj.write('mi<mk<pict-end__\n')
            if not self.__already_found_pict:
                self.__create_pict_file()
-                self.__already_found_pict=1;
+                self.__already_found_pict=True;
                self.__print_rtf_header()
            self.__in_pict = 1
            self.__pict_br_count = self.__ob_count
            self.__cb_count = 0
            self.__write_pic_obj.write("{\\pict\n")
-            return 0
-        return 1
+            return False
+        return True

    def __print_rtf_header(self):
        """Print to pict file the necessary RTF data for the file to be
        recognized as an RTF file.
        """
-        self.__write_pic_obj.write("{\\rtf1 \n")
-        self.__write_pic_obj.write("{\\fonttbl\\f0\\null;} \n")
-        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n")
-        self.__write_pic_obj.write("\\pard \n")
+        self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
+        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")

    def process_pict(self):
        self.__make_dir()
-        read_obj = open(self.__file)
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 'dummy'
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            self.__token_info = line[:16]
-            if self.__token_info == 'ob<nu<open-brack':
-                self.__ob_count = line[-5:-1]
-            if self.__token_info == 'cb<nu<clos-brack':
-                self.__cb_count = line[-5:-1]
-            if not self.__in_pict:
-                to_print = self.__default(line, write_obj)
-                if to_print :
-                    write_obj.write(line)
-            else:
-                to_print = self.__in_pict_func(line)
-                if to_print :
-                    write_obj.write(line)
-        if self.__already_found_pict:
-            self.__write_pic_obj.write("}\n")
-            self.__write_pic_obj.close()
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file) as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    if not self.__in_pict:
+                        to_print = self.__default(line, write_obj)
+                        if to_print :
+                            write_obj.write(line)
+                    else:
+                        to_print = self.__in_pict_func(line)
+                        if to_print :
+                            write_obj.write(line)
+                if self.__already_found_pict:
+                    self.__write_pic_obj.write("}\n")
+                    self.__write_pic_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "pict.data")
+            try:
+                copy_obj.copy_file(self.__pict_file, "pict.rtf")
+            except:
+                pass
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        if self.__pict_count == 0:
@@ -15,8 +15,10 @@
 #                                                                       #
 #                                                                       #
 #########################################################################
-import os, re,  tempfile
+import os, re, tempfile
+
 from calibre.ebooks.rtf2xml import copy, check_brackets
+
 class ProcessTokens:
    """
    Process each token on a line and add information that will be useful for
@@ -41,14 +43,16 @@ class ProcessTokens:
        self.__bracket_count=0
        self.__exception_handler = exception_handler
        self.__bug_handler = bug_handler
+
    def compile_expressions(self):
        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
        self.__utf_exp = re.compile(r'(&.*?;)')
+
    def initiate_token_dict(self):
        self.__return_code = 0
        self.dict_token={
        # unicode
-        'mshex'             :   ('nu', '__________', self.__ms_hex_func),
+        'mshex'              :  ('nu', '__________', self.__ms_hex_func),
        # brackets
        '{'                  :	('nu', '{', self.ob_func),
        '}'                  :	('nu', '}', self.cb_func),
@@ -66,6 +70,7 @@ class ProcessTokens:
        ';'                  :	('mc', ';', self.ms_sub_func),
        # this must be wrong
        '-'                  :	('mc', '-', self.ms_sub_func),
+        'line'               :  ('mi', 'hardline-break', self.hardline_func), #calibre
        # misc => ml
        '*'                  :	('ml', 'asterisk__', self.default_func),
        ':'                  :	('ml', 'colon_____', self.default_func),
@@ -73,7 +78,6 @@ class ProcessTokens:
        'backslash'          :	('nu', '\\', self.text_func),
        'ob'                 :	('nu', '{', self.text_func),
        'cb'                 :	('nu', '}', self.text_func),
-        'line'               :  ('nu', 'hard-lineb', self.default_func), #calibre
        #'line'               :  ('nu', ' ', self.text_func), calibre
        # paragraph formatting => pf
        'page'               :  ('pf', 'page-break', self.default_func),
@@ -159,15 +163,17 @@ class ProcessTokens:
        'rtf'                :	('ri', 'rtf_______', self.default_func),
        'deff'               :	('ri', 'deflt-font', self.default_func),
        'mac'                :	('ri', 'macintosh_', self.default_func),
+        'pc'                 :	('ri', 'pc________', self.default_func),
+        'pca'                :	('ri', 'pca_______', self.default_func),
        'ansi'               :	('ri', 'ansi______', self.default_func),
        'ansicpg'            :	('ri', 'ansi-codpg', self.default_func),
        # notes => nt
        'footnote'           :	('nt', 'footnote__', self.default_func),
        'ftnalt'             :	('nt', 'type______<endnote', self.two_part_func),
        # anchor => an
-        'tc'                :	('an', 'toc_______', self.default_func),
+        'tc'                 :	('an', 'toc_______', self.default_func),
        'bkmkstt'            :	('an', 'book-mk-st', self.default_func),
-        'bkmkstart'         :	('an', 'book-mk-st', self.default_func),
+        'bkmkstart'          :	('an', 'book-mk-st', self.default_func),
        'bkmkend'            :	('an', 'book-mk-en', self.default_func),
        'xe'                 :	('an', 'index-mark', self.default_func),
        'rxe'                :	('an', 'place_____', self.default_func),
@@ -347,7 +353,7 @@ class ProcessTokens:
            10:     'Kanji numbering without the digit character',
            11:     'Kanji numbering with the digit character',
            1246:   'phonetic Katakana characters in aiueo order',
-            1346:    'phonetic katakana characters in iroha order',
+            1346:   'phonetic katakana characters in iroha order',
            14:     'double byte character',
            15:     'single byte character',
            16:     'Kanji numbering 3',
@@ -392,7 +398,7 @@ class ProcessTokens:
            5121 	:  'Arabic Algeria',
            15361 	:  'Arabic Bahrain',
            3073 	:  'Arabic Egypt',
-            1 	        :   'Arabic General',
+            1 	    :   'Arabic General',
            2049 	:  'Arabic Iraq',
            11265 	:  'Arabic Jordan',
            13313 	:  'Arabic Kuwait',
@@ -417,7 +423,7 @@ class ProcessTokens:
            1059 	:  'Byelorussian',
            1027 	:  'Catalan',
            2052 	:  'Chinese China',
-            4 	        :  'Chinese General',
+            4 	    :  'Chinese General',
            3076 	:  'Chinese Hong Kong',
            4100 	:  'Chinese Singapore',
            1028 	:  'Chinese Taiwan',
@@ -431,7 +437,7 @@ class ProcessTokens:
            2057 	:  'English British',
            4105 	:  'English Canada',
            9225 	:  'English Caribbean',
-            9 	        :  'English General',
+            9 	    :  'English General',
            6153 	:  'English Ireland',
            8201 	:  'English Jamaica',
            5129 	:  'English New Zealand',
@@ -595,30 +601,37 @@ class ProcessTokens:
        num = num[1:] # chop off leading 0, which I added
        num = num.upper() # the mappings store hex in caps
        return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
+
    def ms_sub_func(self, pre, token, num):
        return 'tx<mc<__________<%s\n' % token
+
+    def hardline_func(self, pre, token, num):
+        """Build the output line for a hard line break (``\\line``).
+
+        Only ``token`` is used; ``pre`` and ``num`` are accepted so the
+        method has the same signature as the other token actions.
+        """
+        template = 'mi<tg<empty_____<%s\n'
+        return template % token
+
    def default_func(self, pre, token, num):
-        if num == None:
+        if num is None:
            num = 'true'
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
+
    def __list_type_func(self, pre, token, num):
        type = 'arabic'
-        if num == None:
+        if num is None:
            type = 'Arabic'
        else:
            try:
                num = int(num)
            except ValueError:
                if self.__run_level > 3:
-                    msg = 'number "%s" cannot be converted to integer\n' % num
+                    msg = 'Number "%s" cannot be converted to integer\n' % num
                    raise self.__bug_handler, msg
            type = self.__number_type_dict.get(num)
-            if type == None:
+            if type is None:
                if self.__run_level > 3:
                    msg = 'No type for "%s" in self.__number_type_dict\n'
                    raise self.__bug_handler
                type = 'Arabic'
        return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
+
    def __language_func(self, pre, token, num):
        lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
        if not lang_name:
@@ -627,31 +640,36 @@ class ProcessTokens:
                msg = 'No entry for number "%s"' % num
                raise self.__bug_handler, msg
        return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
+
    def two_part_func(self, pre, token, num):
        list = token.split("<")
        token = list[0]
        num = list[1]
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
        ##return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
+
    def divide_by_2(self, pre, token, num):
        num = self.divide_num(num, 2)
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
        ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
+
    def divide_by_20(self, pre, token, num):
        num = self.divide_num(num, 20)
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
        ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
+
    def text_func(self, pre, token, num=None):
        return 'tx<nu<__________<%s\n' % token
+
    def ob_func(self, pre, token, num=None):
        self.__bracket_count += 1
-        ##return 'ob<%04d\n' % self.__bracket_count
        return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
+
    def cb_func(self, pre, token, num=None):
-        ##line = 'cb<%04d\n' % self.__bracket_count
        line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
        self.__bracket_count -= 1
        return line
+
    def color_func(self, pre, token, num):
        third_field = 'nu'
        if num[-1] == ';':
@@ -662,6 +680,7 @@ class ProcessTokens:
            num = "0" + num
        return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
        ##return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
+
    def bool_st_func(self, pre, token, num):
        if num is None or num == '' or num == '1':
            return 'cw<%s<%s<nu<true\n' % (pre, token)
@@ -670,24 +689,23 @@ class ProcessTokens:
            return 'cw<%s<%s<nu<false\n' % (pre, token)
                ##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
        else:
-            msg = 'boolean should have some value module process tokens\n'
-            msg += 'token is ' + token + "\n"
-            msg += "'" + num + "'" + "\n"
+            msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
            raise self.__bug_handler, msg
+
    def __no_sup_sub_func(self, pre, token, num):
        the_string = 'cw<ci<subscript_<nu<false\n'
        the_string += 'cw<ci<superscrip<nu<false\n'
        return the_string
+
    def divide_num(self, numerator, denominator):
        try:
-            numerator = float(re.search('[0-9.]+', numerator).group())            
+            #calibre why ignore negative number? Wrong in case of \fi
+            numerator = float(re.search('[0-9.\-]+', numerator).group())
        except TypeError, msg:
            if self.__run_level > 3:
-                msg = 'no number to process?\n'
-                msg += 'this indicates that the token '
-                msg += ' \(\\li\) should have a number and does not\n'
-                msg += 'numerator is "%s"\n' % numerator
-                msg += 'denominator is "%s"\n' % denominator
+                msg = ('No number to process?\nthis indicates that the token \(\\li\) \
+                should have a number and does not\nnumerator is \
+                "%s"\ndenominator is "%s"\n') % (numerator, denominator)
                raise self.__bug_handler, msg
            if 5 > self.__return_code:
                self.__return_code = 5
@@ -698,9 +716,10 @@ class ProcessTokens:
        if string_num[-2:] == ".0":
            string_num = string_num[:-2]
        return string_num
+
    def split_let_num(self, token):
        match_obj = re.search(self.__num_exp,token)
-        if match_obj != None:
+        if match_obj is not None:
            first = match_obj.group(1)
            second = match_obj.group(2)
            if not second:
@@ -714,6 +733,7 @@ class ProcessTokens:
                raise self.__bug_handler
            return token, 0
        return first, second
+
    def convert_to_hex(self,number):
        """Convert a string to uppercase hexidecimal"""
        num = int(number)
@@ -722,6 +742,7 @@ class ProcessTokens:
            return hex_num
        except:
            raise self.__bug_handler
+
    def process_cw(self, token):
        """Change the value of the control word by determining what dictionary
        it belongs to"""
@@ -737,89 +758,62 @@ class ProcessTokens:
        pre, token, action = self.dict_token.get(token, (None, None, None))
        if action:
            return action(pre, token, num)
-    # unused function
-    def initiate_token_actions(self):
-        self.action_for_token={
-        '{'     :   self.ob_func,
-        '}'     :   self.cb_func,
-        '\\'    :   self.process_cw,
-        }
-    # unused function
-    def evaluate_token(self,token):
-        """Evaluate tokens. Return a value if the token is not a
-        control word. Otherwise, pass token onto another method
-        for further evaluation."""
-        token, action = self.dict_token.get(token[0:1])
-        if action:
-            line = action(token)
-            return line
-        else :
-            return  'tx<nu<nu<nu<nu<%s\n' % token
+
    def __check_brackets(self, in_file):
        self.__check_brack_obj = check_brackets.CheckBrackets\
            (file = in_file)
        good_br =  self.__check_brack_obj.check_brackets()[0]
        if not good_br:
            return 1
+
    def process_tokens(self):
        """Main method for handling other methods. """
-        first_token = 0
-        second_token = 0
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = "dummy"
        line_count = 0
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            token = line_to_read
-            token = token.replace("\n","")
-            if not token:
-                continue
-            line_count += 1
-            try:
-                token.decode('us-ascii')
-            except UnicodeError, msg:
-                msg = str(msg)
-                msg += 'Invalid RTF: File not ascii encoded.\n'
-                raise self.__exception_handler, msg
-            if not first_token:
-                if token != '\\{':
-                    msg = 'Invalid RTF: document doesn\'t start with {\n'
-                    raise self.__exception_handler, msg
-                first_token = 1
-            elif first_token and not second_token:
-                if token[0:4] != '\\rtf':
-                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
-                    raise self.__exception_handler, msg
-                second_token = 1
-            ##token = self.evaluate_token(token)
-            the_index = token.find('\\ ')
-            if token != None and  the_index > -1:
-                msg ='Invalid RTF: token "\\ " not valid. \n'
-                raise self.__exception_handler, msg
-            elif token[0:1] == "\\":
-                line = self.process_cw(token)
-                if line != None:
-                    write_obj.write(line)
-            else:
-                fields = re.split(self.__utf_exp, token)
-                for field in fields:
-                    if not field:
-                        continue
-                    if field[0:1] == '&':
-                        write_obj.write('tx<ut<__________<%s\n' % field)
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'wb') as write_obj:
+                for line in read_obj:
+                    token = line.replace("\n","")
+                    line_count += 1
+                    if line_count == 1 and token != '\\{':
+                            msg = 'Invalid RTF: document doesn\'t start with {\n'
+                            raise self.__exception_handler, msg
+                    elif line_count == 2 and token[0:4] != '\\rtf':
+                            msg = 'Invalid RTF: document doesn\'t start with \\rtf \n'
+                            raise self.__exception_handler, msg
+
+                    the_index = token.find('\\ ')
+                    if token is not None and  the_index > -1:
+                        msg = 'Invalid RTF: token "\\ " not valid.\n'
+                        raise self.__exception_handler, msg
+                    elif token[:1] == "\\":
+                        try:
+                            token.decode('us-ascii')
+                        except UnicodeError, msg:
+                            msg = 'Invalid RTF: Tokens not ascii encoded.\n%s' % str(msg)
+                            raise self.__exception_handler, msg
+                        line = self.process_cw(token)
+                        if line is not None:
+                            write_obj.write(line)
                    else:
-                        write_obj.write('tx<nu<__________<%s\n' % field)
-        read_obj.close()
-        write_obj.close()
+                        fields = re.split(self.__utf_exp, token)
+                        for field in fields:
+                            if not field:
+                                continue
+                            if field[0:1] == '&':
+                                write_obj.write('tx<ut<__________<%s\n' % field)
+                            else:
+                                write_obj.write('tx<nu<__________<%s\n' % field)
+
        if not line_count:
-            msg ='Invalid RTF: file appears to be empty. \n'
+            msg = 'Invalid RTF: file appears to be empty.\n'
            raise self.__exception_handler, msg
+
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+
        bad_brackets = self.__check_brackets(self.__file)
        if bad_brackets:
            msg = 'Invalid RTF: document does not have matching brackets.\n'
@@ -16,7 +16,10 @@
 #                                                                       #
 #########################################################################
 import os, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.cleantext import clean_ascii_chars
+
 class ReplaceIllegals:
    """
    reaplace illegal lower ascii characters
@@ -30,21 +33,14 @@ class ReplaceIllegals:
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = tempfile.mktemp()
+
    def replace_illegals(self):
        """
        """
-        nums = [0, 1, 2, 3, 4, 5, 6, 7, 8,  11,  13, 14, 15, 16, 17, 18, 19]
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = 1
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            for num in nums:
-                line = line.replace(chr(num), '')
-            write_obj.write(line)
-        read_obj.close()
-        write_obj.close()
+        with open(self.__file, 'r') as read_obj:
+            with open(self.__write_to, 'w') as write_obj:
+                for line in read_obj:
+                    write_obj.write(clean_ascii_chars(line))
        copy_obj = copy.Copy()
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "replace_illegals.data")
@@ -16,7 +16,10 @@
 #                                                                       #
 #########################################################################
 import os, re, tempfile
+
 from calibre.ebooks.rtf2xml import copy
+from calibre.utils.mreplace import MReplace
+
 class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
    def __init__(self,
@@ -28,89 +31,175 @@ class Tokenize:
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
-        self.__special_tokens = [ '_', '~', "'", '{', '}' ]
        self.__write_to = tempfile.mktemp()
-    def __from_ms_to_utf8(self,match_obj):
-        uni_char = int(match_obj.group(1))
-        if uni_char < 0:
-            uni_char +=  65536
-        return   '&#x' + str('%X' % uni_char) + ';'
-    def __neg_unicode_func(self, match_obj):
-        neg_uni_char = int(match_obj.group(1)) * -1
-        # sys.stderr.write(str( neg_uni_char))
-        uni_char = neg_uni_char + 65536
-        return   '&#x' + str('%X' % uni_char) + ';'
-    def __sub_line_reg(self,line):
-        line = line.replace("\\\\", "\\backslash ")
-        line = line.replace("\\~", "\\~ ")
-        line = line.replace("\\;", "\\; ")
-        line = line.replace("&", "&amp;")
-        line = line.replace("<", "&lt;")
-        line = line.replace(">", "&gt;")
-        line = line.replace("\\~", "\\~ ")
-        line = line.replace("\\_", "\\_ ")
-        line = line.replace("\\:", "\\: ")
-        line = line.replace("\\-", "\\- ")
-        # turn into a generic token to eliminate special
-        # cases and make processing easier
-        line = line.replace("\\{", "\\ob ")
-        # turn into a generic token to eliminate special
-        # cases and make processing easier
-        line = line.replace("\\}", "\\cb ")
-        # put a backslash in front of to eliminate special cases and
-        # make processing easier
-        line = line.replace("{", "\\{")
-        # put a backslash in front of to eliminate special cases and
-        # make processing easier
-        line = line.replace("}", "\\}")
-        line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
-        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
-        line = re.sub(self.__ms_hex_exp, "\\mshex0\g<1> ", line)
-        ##line = line.replace("\\backslash", "\\\\")
-        # this is for older RTF
-        line = re.sub(self.__par_exp, '\\par ', line)
-        return line
-    def __compile_expressions(self):
-        self.__ms_hex_exp = re.compile(r"\\\'(..)")
-        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) {0,1}")
-        self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\\[^\s\\{}&]+(?:\s)?)")
-        self.__par_exp = re.compile(r'\\$')
-        self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
-        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
-    def __create_tokens(self):
        self.__compile_expressions()
-        read_obj = open(self.__file, 'r')
-        write_obj = open(self.__write_to, 'w')
-        line_to_read = "dummy"
-        while line_to_read:
-            line_to_read = read_obj.readline()
-            line = line_to_read
-            line = line.replace("\n", "")
-            line =  self.__sub_line_reg(line)
-            tokens = re.split(self.__splitexp, line)
-            ##print tokens
-            for token in tokens:
-                if token != "":
-                    write_obj.write(token + "\n")
-                    """
-                    match_obj = re.search(self.__mixed_exp, token)
-                    if match_obj != None:
-                        first = match_obj.group(1)
-                        second = match_obj.group(2)
-                        write_obj.write(first + "\n")
-                        write_obj.write(second + "\n")
-                    else:
-                        write_obj.write(token + "\n")
-                    """
-        read_obj.close()
-        write_obj.close()
+        #variables
+        self.__uc_char = 0
+        self.__uc_bin = False
+        self.__uc_value = [1]
+
+    def __reini_utf8_counters(self):
+        """Reset the skip state: no characters and no binary data are
+        pending removal."""
+        self.__uc_char, self.__uc_bin = 0, False
+
+    def __remove_uc_chars(self, startchar, token):
+        """Drop the characters still owed to the skip counter from ``token``.
+
+        Scanning starts at index ``startchar``. Spaces are stepped over
+        without being counted; every other character decrements
+        ``self.__uc_char`` until it reaches zero. Returns the rest of the
+        token from the first character that is kept, or ``''`` when the
+        token is exhausted first.
+        """
+        pos = startchar
+        end = len(token)
+        while pos < end:
+            if token[pos] != " ":
+                if not self.__uc_char:
+                    # nothing left to skip: keep everything from here on
+                    return token[pos:]
+                self.__uc_char -= 1
+            pos += 1
+        #if only " " and char to skip
+        return ''
+
+    def __unicode_process(self, token):
+        r"""Resolve the unicode escape carried by one token, if any.
+
+        A ``\uN`` token is replaced by its character (an XML numeric
+        character reference when the character is not ASCII). The number
+        of replacement characters given by the innermost ``\ucN`` value
+        is then skipped, in this token and in the following ones.
+
+        Returns the text to keep for this token; ``''`` means the whole
+        token was consumed and should be dropped by the caller.
+        """
+        #change scope in
+        if token == '\\{':
+            # a new group starts with the \uc value of the enclosing group
+            self.__uc_value.append(self.__uc_value[-1])
+            #basic error handling
+            self.__reini_utf8_counters()
+            return token
+        #change scope out
+        elif token == '\\}':
+            self.__uc_value.pop()
+            self.__reini_utf8_counters()
+            return token
+        #add a uc control
+        elif token[:3] == '\\uc':
+            self.__uc_value[-1] = int(token[3:])
+            self.__reini_utf8_counters()
+            return token
+        #bin data to skip
+        elif self.__uc_bin:
+            self.__uc_bin = False
+            return ''
+        #uc char to remove
+        elif self.__uc_char:
+            #handle \bin tag in case of uc char to skip
+            # The literal must be written '\\bin': '\bin' is a backspace
+            # character followed by "in" and can never equal token[:4].
+            if token[:4] == '\\bin':
+                self.__uc_char -=1
+                self.__uc_bin = True
+                return ''
+            elif token[:1] == "\\" :
+                # any other control word counts as one skipped character
+                self.__uc_char -=1
+                return ''
+            else:
+                return self.__remove_uc_chars(0, token)
+        #go for real \u token
+        match_obj = self.__utf_exp.match(token)
+        if match_obj is not None:
+            self.__reini_utf8_counters()
+            #get value and handle negative case
+            uni_char = int(match_obj.group(1))
+            # length of "\u" plus the digits (and sign) that follow it
+            uni_len = len(match_obj.group(1)) + 2
+            if uni_char < 0:
+                uni_char += 65536
+            uni_char = unichr(uni_char).encode('ascii', 'xmlcharrefreplace')
+            self.__uc_char = self.__uc_value[-1]
+            #there is only an unicode char
+            if len(token)<= uni_len:
+                return uni_char
+            #an unicode char and something else
+            #must be after as it is splited on \
+            #necessary? maybe for \bin?
+            elif not self.__uc_char:
+                return uni_char + token[uni_len:]
+            #if not uc0 and chars
+            else:
+                return uni_char + self.__remove_uc_chars(uni_len, token)
+        #default
+        return token
+
+    def __sub_reg_split(self,input_file):
+        """Apply the basic substitutions to the whole file content and
+        split it into tokens.
+
+        ``input_file`` is the text of the file, not a file object.
+        Returns the list of tokens without empty strings and without
+        bare newline tokens.
+        """
+        def join_bin_lines(match_obj):
+            #remove \n in bin data
+            return match_obj.group().replace('\n', '') + '\n'
+        text = self.__replace_spchar.mreplace(input_file)
+        text = self.__ms_hex_exp.sub("\\mshex0\g<1> ", text)
+        text = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", text)
+        text = self.__bin_exp.sub(join_bin_lines, text)
+        #split
+        pieces = self.__splitexp.split(text)
+        #remove empty tokens and \n
+        return [piece for piece in pieces if len(piece) > 0 and piece != '\n']
+        #input_file = re.sub(self.__utf_exp, self.__from_ms_to_utf8, input_file)
+        # line = re.sub( self.__neg_utf_exp, self.__neg_unicode_func, line)
+        # this is for older RTF
+        #line = re.sub(self.__par_exp, '\\par ', line)
+        #return filter(lambda x: len(x) > 0, \
+            #(self.__remove_line.sub('', x) for x in tokens))
+
+    def __compile_expressions(self):
+        """Build the replacement table and compile the regular expressions
+        used to tokenize the file.
+
+        Sets ``__replace_spchar``, ``__ms_hex_exp``, ``__utf_exp``,
+        ``__bin_exp``, ``__utf_ud`` and ``__splitexp`` on the instance.
+        """
+        SIMPLE_RPL = {
+            "\\\\": "\\backslash ",
+            "\\~": "\\~ ",
+            "\\;": "\\; ",
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            "\\_": "\\_ ",
+            "\\:": "\\: ",
+            "\\-": "\\- ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\{": "\\ob ",
+            # turn into a generic token to eliminate special
+            # cases and make processing easier
+            "\\}": "\\cb ",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "{": "\\{",
+            # put a backslash in front of to eliminate special cases and
+            # make processing easier
+            "}": "\\}",
+            # this is for older RTF
+            # NOTE(review): presumably MReplace treats its keys as literal
+            # text rather than as regular expressions; if so this entry
+            # matches the three characters \\$ and not a backslash at the
+            # end of a line - verify against MReplace.
+            r'\\$': '\\par ',
+            }
+        self.__replace_spchar = MReplace(SIMPLE_RPL)
+        #add ;? in case of char following \u
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})") #r"\\\'(..)"
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+        #manage upr/ud situations
+        # The star of the \* control symbol has to be escaped: written as
+        # \\* it is a quantifier on the backslash and the pattern cannot
+        # match a {\*\ud ...} destination.
+        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" + \
+                       r"\\{[\n ]?\\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
+        #add \n in split for whole file reading
+        #why keep backslash whereas \is replaced before?
+        #remove \n from endline char
+        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        #self.__bin_exp = re.compile(r"\\bin(-?\d{1,8}) {0,1}")
+        #self.__utf_exp = re.compile(r"^\\u(-?\d{3,6})")
+        #self.__splitexp = re.compile(r"(\\[\\{}]|{|}|\n|\\[^\s\\{}&]+(?:\s)?)")
+        #self.__par_exp = re.compile(r'\\$')
+        #self.__remove_line = re.compile(r'\n+')
+        #self.__mixed_exp = re.compile(r"(\\[a-zA-Z]+\d+)(\D+)")
+        ##self.num_exp = re.compile(r"(\*|:|[a-zA-Z]+)(.*)")
+
    def tokenize(self):
-        """Main class for handling other methods. Reads in one line \
-        at a time, usues method self.sub_line to make basic substitutions,\
-        uses ? to process tokens"""
-        self.__create_tokens()
+        """Main class for handling other methods. Reads the file \
+        , uses method self.sub_reg to make basic substitutions,\
+        and process tokens by itself"""
+        #read
+        with open(self.__file, 'r') as read_obj:
+            input_file = read_obj.read()
+        
+        #process simple replacements and split giving us a correct list
+        #remove '' and \n in the process
+        tokens = self.__sub_reg_split(input_file)
+        #correct unicode
+        tokens = map(self.__unicode_process, tokens)
+        #remove empty items created by removing \uc
+        tokens = filter(lambda x: len(x) > 0, tokens)
+        
+        #write
+        with open(self.__write_to, 'wb') as write_obj:
+            write_obj.write('\n'.join(tokens))
+        #Move and copy
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
+        
+        #self.__special_tokens = [ '_', '~', "'", '{', '}' ]
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+

 '''
 Read content from txt file.
@@ -10,10 +14,7 @@ from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
-
-__license__   = 'GPL v3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
+from calibre.utils.cleantext import clean_ascii_chars

 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

@@ -33,9 +34,7 @@ def clean_txt(txt):
    # Remove excessive line breaks.
    txt = re.sub('\n{3,}', '\n\n', txt)
    #remove ASCII invalid chars : 0 to 8 and 11-14 to 24
-    chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
-    illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
-    txt = illegal_chars.sub('', txt)
+    txt = clean_ascii_chars(txt)

    return txt

@@ -27,14 +27,17 @@ class PluginWidget(QWidget, Ui_Form):
    def __init__(self, parent=None):
        QWidget.__init__(self, parent)
        self.setupUi(self)
-        from calibre.library.catalog import FIELDS
-        self.all_fields = []
-        for x in FIELDS :
-            if x != 'all':
-                self.all_fields.append(x)
-                QListWidgetItem(x, self.db_fields)

    def initialize(self, name, db): #not working properly to update
+        from calibre.library.catalog import FIELDS
+
+        self.all_fields = [x for x in FIELDS if x != 'all']
+        #add custom columns
+        self.all_fields.extend([x for x in sorted(db.custom_field_keys())])
+        #populate
+        for x in self.all_fields:
+            QListWidgetItem(x, self.db_fields)
+
        self.name = name
        fields = gprefs.get(name+'_db_fields', self.all_fields)
        # Restore the activated db_fields from last use
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+
+from PyQt4.Qt import QDialog
+from calibre.gui2.dialogs.drm_error_ui import Ui_Dialog
+
+class DRMErrorMessage(QDialog, Ui_Dialog):
+    '''Dialog telling the user that a book is DRMed, optionally naming it.'''
+    def __init__(self, parent=None, title=None):
+        QDialog.__init__(self, parent)
+        self.setupUi(self)
+        if title is not None:
+            from calibre import prepare_string_for_xml # title is plain text
+            t = unicode(self.msg.text())
+            self.msg.setText('<h2>%s</h2>%s'%(prepare_string_for_xml(title), t))
+        self.resize(self.sizeHint())
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Dialog</class>
+ <widget class="QDialog" name="Dialog">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>417</width>
+    <height>235</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>This book is DRMed</string>
+  </property>
+  <layout class="QGridLayout" name="gridLayout">
+   <item row="0" column="0">
+    <widget class="QLabel" name="label">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
+     </property>
+     <property name="maximumSize">
+      <size>
+       <width>132</width>
+       <height>16777215</height>
+      </size>
+     </property>
+     <property name="text">
+      <string/>
+     </property>
+     <property name="pixmap">
+      <pixmap resource="../../../../resources/images.qrc">:/images/document-encrypt.png</pixmap>
+     </property>
+    </widget>
+   </item>
+   <item row="0" column="1">
+    <widget class="QLabel" name="msg">
+     <property name="text">
+      <string>&lt;p&gt;This book is locked by &lt;b&gt;DRM&lt;/b&gt;. To learn more about DRM and why you cannot read or convert this book in calibre, 
+&lt;a href=&quot;http://bugs.calibre-ebook.com/wiki/DRM&quot;&gt;click here&lt;/a&gt;.</string>
+     </property>
+     <property name="wordWrap">
+      <bool>true</bool>
+     </property>
+     <property name="openExternalLinks">
+      <bool>true</bool>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="0" colspan="2">
+    <widget class="QDialogButtonBox" name="buttonBox">
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="standardButtons">
+      <set>QDialogButtonBox::Close</set>
+     </property>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <resources>
+  <include location="../../../../resources/images.qrc"/>
+ </resources>
+ <connections>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>accepted()</signal>
+   <receiver>Dialog</receiver>
+   <slot>accept()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>248</x>
+     <y>254</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>157</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>buttonBox</sender>
+   <signal>rejected()</signal>
+   <receiver>Dialog</receiver>
+   <slot>reject()</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>316</x>
+     <y>260</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>286</x>
+     <y>274</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>
@@ -15,7 +15,7 @@ from calibre.ebooks.metadata import string_to_authors, authors_to_string
 from calibre.ebooks.metadata.book.base import composite_formatter
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.gui2.custom_column_widgets import populate_metadata_page
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, ResizableDialog
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.utils.config import dynamic
 from calibre.utils.titlecase import titlecase
@@ -49,7 +49,7 @@ def get_cover_data(path):



-class MyBlockingBusy(QDialog):
+class MyBlockingBusy(QDialog): # {{{

    do_one_signal = pyqtSignal()

@@ -241,8 +241,9 @@ class MyBlockingBusy(QDialog):
        self.current_index += 1
        self.do_one_signal.emit()

+    # }}}

-class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
+class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):

    s_r_functions = {       ''              : lambda x: x,
                            _('Lower Case') : lambda x: icu_lower(x),
@@ -261,9 +262,8 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
                        ]

    def __init__(self, window, rows, model, tab):
-        QDialog.__init__(self, window)
+        ResizableDialog.__init__(self, window)
        Ui_MetadataBulkDialog.__init__(self)
-        self.setupUi(self)
        self.model = model
        self.db = model.db
        self.ids = [self.db.id(r) for r in rows]
@@ -823,7 +823,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                                if book.series_index is not None:
                                    self.series_index.setValue(book.series_index)
                        if book.has_cover:
-                            if d.opt_auto_download_cover.isChecked() and book.has_cover:
+                            if d.opt_auto_download_cover.isChecked():
                                self.fetch_cover()
                            else:
                                self.fetch_cover_button.setFocus(Qt.OtherFocusReason)
@@ -384,8 +384,9 @@ class BooksModel(QAbstractTableModel): # {{{
            name, val = mi.format_field(key)
            if mi.metadata_for_field(key)['datatype'] == 'comments':
                name += ':html'
-            if val:
+            if val and name not in data:
                data[name] = val
+
        return data


@@ -468,12 +468,8 @@ class Main(MainWindow, MainWindowMixin, DeviceMixin, EmailMixin, # {{{
        try:
            if 'calibre.ebooks.DRMError' in job.details:
                if not minz:
-                    d = error_dialog(self, _('Conversion Error'),
-                        _('<p>Could not convert: %s<p>It is a '
-                        '<a href="%s">DRM</a>ed book. You must first remove the '
-                        'DRM using third party tools.')%\
-                            (job.description.split(':')[-1],
-                                'http://bugs.calibre-ebook.com/wiki/DRM'))
+                    from calibre.gui2.dialogs.drm_error import DRMErrorMessage
+                    d = DRMErrorMessage(self, job.description.split(':')[-1])
                    d.setModal(False)
                    d.show()
                    self._modeless_dialogs.append(d)
@@ -26,6 +26,7 @@ from calibre.gui2.search_box import SearchBox2
 from calibre.ebooks.metadata import MetaInformation
 from calibre.customize.ui import available_input_formats
 from calibre.gui2.viewer.dictionary import Lookup
+from calibre import as_unicode

 class TOCItem(QStandardItem):

@@ -626,13 +627,12 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
            QApplication.processEvents()
        if worker.exception is not None:
            if isinstance(worker.exception, DRMError):
-                error_dialog(self, _('DRM Error'),
-                        _('<p>This book is protected by <a href="%s">DRM</a>')
-                        %'http://wiki.mobileread.com/wiki/DRM').exec_()
+                from calibre.gui2.dialogs.drm_error import DRMErrorMessage
+                DRMErrorMessage(self).exec_()
            else:
                r = getattr(worker.exception, 'reason', worker.exception)
                error_dialog(self, _('Could not open ebook'),
-                        unicode(r), det_msg=worker.traceback, show=True)
+                        as_unicode(r), det_msg=worker.traceback, show=True)
            self.close_progress_indicator()
        else:
            self.metadata.show_opf(self.iterator.opf, os.path.splitext(pathtoebook)[1][1:])
@@ -1531,10 +1531,23 @@ class EPUB_MOBI(CatalogPlugin):
                                        self.opts.header_note_source_field,
                                        index_is_id=True)
                    if notes:
-                        if field_md['datatype'] == 'text' and isinstance(notes,list):
-                            notes = ' &middot; '.join(notes)
+                        if field_md['datatype'] == 'text':
+                            if isinstance(notes,list):
+                                notes = ' &middot; '.join(notes)
                        elif field_md['datatype'] == 'datetime':
                            notes = format_date(notes,'dd MMM yyyy')
+                        elif field_md['datatype'] == 'composite':
+                            m = re.match(r'\[(.+)\]$', notes)
+                            if m is not None:
+                                # Sniff for special pseudo-list string "[<item, item>]"
+                                bracketed_content = m.group(1)
+                                if ',' in bracketed_content:
+                                    # Recast the comma-separated items as a list
+                                    items = bracketed_content.split(',')
+                                    items = [i.strip() for i in items]
+                                    notes = ' &middot; '.join(items)
+                                else:
+                                    notes = bracketed_content
                        this_title['notes'] = {'source':field_md['name'],
                                                   'content':notes}

@@ -709,6 +709,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        formats = row[fm['formats']]
        if not formats:
            formats = None
+        else:
+            formats = formats.split(',')
        mi.formats = formats
        tags = row[fm['tags']]
        if tags:
@@ -110,6 +110,7 @@ class cmd_commit(_cmd_commit):
            suffix = 'The fix will be in the next release.'
        action = action+'ed'
        msg = '%s in branch %s. %s'%(action, nick, suffix)
+        msg = msg.replace('Fixesed', 'Fixed')
        server = xmlrpclib.ServerProxy(url)
        server.ticket.update(int(bug), msg,
                             {'status':'closed', 'resolution':'fixed'},
@@ -3,7 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 __docformat__ = 'restructuredtext en'

-import re
+import re, htmlentitydefs

 _ascii_pat = None

@@ -21,3 +21,32 @@ def clean_ascii_chars(txt, charlist=None):
        pat = re.compile(u'|'.join(map(unichr, charlist)))
    return pat.sub('', txt)

+##
+# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
+# Removes HTML or XML character references and entities from a text string.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+
+def unescape(text, rm=False, rchar=u''):
+    '''Replace HTML/XML character references and named entities in text.
+
+    References that cannot be resolved are left as is, or replaced by rchar
+    when rm is True. Resolved references are never replaced by rchar.'''
+    def fixup(m, rm=rm, rchar=rchar):
+        text = m.group(0)
+        try:
+            if text[:3] == "&#x":
+                # hexadecimal character reference
+                return unichr(int(text[3:-1], 16))
+            if text[:2] == "&#":
+                # decimal character reference
+                return unichr(int(text[2:-1]))
+            # named entity
+            return unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+        except (ValueError, KeyError, OverflowError):
+            pass
+        if rm:
+            return rchar # replace by char
+        return text # leave as is
+    return re.sub(r"&#?\w+;", fixup, text)
@@ -92,7 +92,10 @@ def identify_data(data):
    or raises an Exception if data is not an image.
    '''
    img = Image()
-    img.load(data)
+    if hasattr(img, 'identify'):
+        img.identify(data)
+    else:
+        img.load(data)
    width, height = img.size
    fmt = img.format
    return (width, height, fmt)
@@ -456,6 +456,26 @@ magick_Image_load(magick_Image *self, PyObject *args, PyObject *kwargs) {

 // }}}

+// Image.identify {{{
+static PyObject *
+magick_Image_identify(magick_Image *self, PyObject *args, PyObject *kwargs) { // returns None, or NULL with an exception set
+    const char *data;
+	Py_ssize_t dlen; // NOTE(review): "s#" fills a Py_ssize_t only if PY_SSIZE_T_CLEAN is defined - confirm
+    MagickBooleanType res;
+    
+    NULL_CHECK(NULL) // macro defined elsewhere; presumably returns NULL when the wand is gone - confirm
+    if (!PyArg_ParseTuple(args, "s#", &data, &dlen)) return NULL; // one argument: a byte buffer and its length
+
+    res = MagickPingImageBlob(self->wand, data, dlen); // NOTE(review): per the MagickWand docs a ping does not decode pixels - confirm
+
+    if (!res)
+        return magick_set_exception(self->wand);
+
+    Py_RETURN_NONE;
+}
+
+// }}}
+
 // Image.open {{{
 static PyObject *
 magick_Image_read(magick_Image *self, PyObject *args, PyObject *kwargs) {
@@ -993,6 +1013,10 @@ static PyMethodDef magick_Image_methods[] = {
    {"destroy", (PyCFunction)magick_Image_destroy, METH_VARARGS,
    "Destroy the underlying ImageMagick Wand. WARNING: After using this method, all methods on this object will raise an exception."},

+    {"identify", (PyCFunction)magick_Image_identify, METH_VARARGS,
+     "Identify an image from a byte buffer (string)"
+    },
+
    {"load", (PyCFunction)magick_Image_load, METH_VARARGS,
     "Load an image from a byte buffer (string)"
    },
@@ -0,0 +1,269 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import sys, struct
+
+
+
+class WMFHeader(object):
+
+    '''
+    Validate the header of a WMF file and record, as records_start_at, the
+    byte offset of the first record. For header documentation, see
+    http://www.skynet.ie/~caolan/publink/libwmf/libwmf/doc/ora-wmf.html
+    '''
+
+    def __init__(self, data, log, verbose):
+        self.log = log
+        self.verbose = verbose
+        # file type, header size and windows version, 16 bits each
+        header_size = struct.unpack_from('<HHH', data)[1]
+        if header_size != 9:
+            raise ValueError('Not a WMF file')
+
+        # file size (32 bits) and number of objects (16 bits)
+        file_size = struct.unpack_from('<IH', data, 6)[0]
+        if file_size * 2 != len(data):
+            # file size is in 2-byte units
+            raise ValueError('WMF file header specifies incorrect file size')
+
+        # header_size is assumed to be in 2-byte units as well
+        self.records_start_at = header_size * 2
+
+class DIBHeader(object):
+
+    '''
+    Parse a 40 byte BITMAPINFOHEADER or a 12 byte BITMAPCOREHEADER, raising
+    ValueError for other sizes. See http://en.wikipedia.org/wiki/BMP_file_format
+    '''
+
+    # header size -> (struct format, names of the attributes that are set)
+    _FIELDS = {
+        40: ('<IiiHHIIIIII', ('header_size', 'width', 'height',
+            'color_planes', 'bits_per_pixel', 'compression', 'image_size',
+            'hres', 'vres', 'ncols', 'nimpcols')),
+        12: ('<IHHHH', ('header_size', 'width', 'height', 'color_planes',
+            'bits_per_pixel')),
+    }
+
+    def __init__(self, raw):
+        hsize = struct.unpack('<I', raw[:4])[0]
+        if hsize not in self._FIELDS:
+            raise ValueError('Unsupported DIB header type of size: %d'%hsize)
+        fmt, names = self._FIELDS[hsize]
+        for attr, val in zip(names, struct.unpack(fmt, raw[:hsize])):
+            setattr(self, attr, val)
+
+        self.bitmasks_size = 12 if getattr(self, 'compression', 0) == 3 else 0
+        ncols = 0 if self.bits_per_pixel == 24 else getattr(self, 'ncols', 0)
+        if ncols == 0 and 0 < self.bits_per_pixel <= 8:
+            # Zero colours means a full palette of 2**bits_per_pixel entries
+            ncols = 1 << self.bits_per_pixel
+        # 3 byte entries in core headers, see http://support.microsoft.com/kb/q81498/
+        self.color_table_size = ncols * (3 if hsize == 12 else 4)
+
+
+class WMF(object):
+
+    def __init__(self, log=None, verbose=0):
+        if log is None: # fall back to calibre's default log
+            from calibre.utils.logging import default_log as log
+        self.log = log
+        self.verbose = verbose
+
+        self.map_mode = None # set by the SetMapMode handler
+        self.window_origin = None # set by the SetWindowOrg handler
+        self.window_extent = None # set by the SetWindowExt handler
+        self.bitmaps = [] # BMP file data appended by the DibStretchBlt handler
+
+        self.function_map = { # {{{
+                30: 'SaveDC',
+                53: 'RealizePalette',
+                55: 'SetPalEntries',
+                79: 'StartPage',
+                80: 'EndPage',
+                82: 'AbortDoc',
+                94: 'EndDoc',
+                258: 'SetBkMode',
+                259: 'SetMapMode',
+                260: 'SetROP2',
+                261: 'SetRelabs',
+                262: 'SetPolyFillMode',
+                263: 'SetStretchBltMode',
+                264: 'SetTextCharExtra',
+                295: 'RestoreDC',
+                298: 'InvertRegion',
+                299: 'PaintRegion',
+                300: 'SelectClipRegion',
+                301: 'SelectObject',
+                302: 'SetTextAlign',
+                313: 'ResizePalette',
+                332: 'ResetDc',
+                333: 'StartDoc',
+                496: 'DeleteObject',
+                513: 'SetBkColor',
+                521: 'SetTextColor',
+                522: 'SetTextJustification',
+                523: 'SetWindowOrg',
+                524: 'SetWindowExt',
+                525: 'SetViewportOrg',
+                526: 'SetViewportExt',
+                527: 'OffsetWindowOrg',
+                529: 'OffsetViewportOrg',
+                531: 'LineTo',
+                532: 'MoveTo',
+                544: 'OffsetClipRgn',
+                552: 'FillRegion',
+                561: 'SetMapperFlags',
+                564: 'SelectPalette',
+                1040: 'ScaleWindowExt',
+                1042: 'ScaleViewportExt',
+                1045: 'ExcludeClipRect',
+                1046: 'IntersectClipRect',
+                1048: 'Ellipse',
+                1049: 'FloodFill',
+                1051: 'Rectangle',
+                1055: 'SetPixel',
+                1065: 'FrameRegion',
+                1352: 'ExtFloodFill',
+                1564: 'RoundRect',
+                1565: 'PatBlt',
+                2071: 'Arc',
+                2074: 'Pie',
+                2096: 'Chord',
+                3379: 'SetDibToDev',
+                247: 'CreatePalette',
+                248: 'CreateBrush',
+                322: 'DibCreatePatternBrush',
+                496: 'DeleteObject', # repeats the 496 entry above
+                505: 'CreatePatternBrush',
+                762: 'CreatePenIndirect',
+                763: 'CreateFontIndirect',
+                764: 'CreateBrushIndirect',
+                765: 'CreateBitmapIndirect',
+                804: 'Polygon',
+                805: 'Polyline',
+                1078: 'AnimatePalette',
+                1313: 'TextOut',
+                1336: 'PolyPolygon',
+                1574: 'Escape',
+                1583: 'DrawText',
+                1790: 'CreateBitmap',
+                1791: 'CreateRegion',
+                2338: 'BitBlt',
+                2368: 'DibBitblt',
+                2610: 'ExtTextOut',
+                2851: 'StretchBlt',
+                2881: 'DibStretchBlt',
+                3907: 'StretchDIBits'
+        } # }}}
+
+    def __call__(self, stream_or_data):
+        '''Parse a WMF (byte string or object with read()), run its handlers.'''
+        data = stream_or_data
+        if hasattr(data, 'read'):
+            data = data.read()
+        self.log.filter_level = self.log.DEBUG # NOTE(review): debug leftover?
+        self.header = WMFHeader(data, self.log, self.verbose)
+
+        offset = self.header.records_start_at
+        hsize = struct.calcsize('<IH')
+        self.records = []
+        while offset < len(data)-6:
+            size, func = struct.unpack_from('<IH', data, offset)
+            size *= 2 # Convert to bytes
+            offset += hsize
+            params = ''
+            delta = size - hsize
+            if delta > 0:
+                params = data[offset:offset+delta]
+                offset += delta
+            func = self.function_map.get(func, func)
+            if self.verbose > 3:
+                self.log.debug('WMF Record:', size, func)
+            self.records.append((func, params))
+
+        # Call the method named after each record, if this class has one
+        for func, params in self.records:
+            # Unknown record types stay as integers, which getattr() rejects
+            f = getattr(self, func, None) if isinstance(func, str) else None
+            if callable(f):
+                f(params)
+            elif self.verbose > 2:
+                self.log.debug('Ignoring record:', func)
+        self.has_raster_image = len(self.bitmaps) > 0
+
+
+    def SetMapMode(self, params):
+        if len(params) == 2: # a single little-endian unsigned 16 bit value
+            self.map_mode = struct.unpack('<H', params)[0]
+        else:
+            self.log.warn('Invalid SetMapMode param')
+
+    def SetWindowOrg(self, params):
+        # Store the window origin as a pair of little-endian unsigned values
+        # of 2, 4 or 8 bytes each, depending on the length of params
+        fmt = {4:'<HH', 8:'<II', 16:'<QQ'}.get(len(params), None)
+        if fmt is None:
+            self.log.warn('Invalid SetWindowOrg param', repr(params))
+        else:
+            # 16 bytes need '<QQ': '<LL' is only 8 bytes long and always failed
+            self.window_origin = struct.unpack(fmt, params)
+
+    def SetWindowExt(self, params):
+        # Store the window extent as a pair of little-endian unsigned values
+        # of 2, 4 or 8 bytes each, depending on the length of params
+        fmt = {4:'<HH', 8:'<II', 16:'<QQ'}.get(len(params), None)
+        if fmt is None:
+            self.log.warn('Invalid SetWindowExt param', repr(params))
+        else:
+            # 16 bytes need '<QQ': '<LL' is only 8 bytes long and always failed
+            self.window_extent = struct.unpack(fmt, params)
+
+    def DibStretchBlt(self, raw):
+        # Turn the DIB that follows the fixed size parameters into BMP file
+        # data and store it in self.bitmaps. The parameters are, in order:
+        # raster_op, src_height, src_width, y_src, x_src, dest_height,
+        # dest_width, y_dest, x_dest
+        fmt = '<IHHHHHHHH'
+        struct.unpack_from(fmt, raw, 0) # still fails on truncated parameters
+        dib = raw[struct.calcsize(fmt):]
+        self.bitmaps.append(self.create_bmp_from_dib(dib))
+
+    def create_bmp_from_dib(self, raw):
+        # Prefix the DIB with the 14 byte BMP file header: magic, file size,
+        # four reserved bytes and the offset of the pixel array. This offset
+        # is counted from the start of the file, so it includes the 14 bytes
+        dh = DIBHeader(raw)
+        offset = 14 + dh.header_size + dh.bitmasks_size + dh.color_table_size
+        header = ['BM', struct.pack('<I', len(raw) + 14), '\0'*4]
+        return ''.join(header) + struct.pack('<I', offset) + raw
+
+    def to_png(self):
+        # Convert the largest stored bitmap (by byte length) to PNG data
+        largest = sorted(self.bitmaps, key=len)[-1]
+        from calibre.utils.magick import Image
+        img = Image()
+        img.load(largest)
+        return img.export('png')
+
+def wmf_unwrap(wmf_data):
+    '''
+    Return, as PNG data, the largest raster image embedded in wmf_data,
+    raising ValueError when the WMF contains no raster image.
+    '''
+    parser = WMF()
+    parser(wmf_data)
+    if parser.has_raster_image:
+        return parser.to_png()
+    raise ValueError('No raster image found in the WMF')
+
+if __name__ == '__main__': # developer test: dump the first bitmap of a WMF
+    wmf = WMF(verbose=4)
+    wmf(open(sys.argv[-1], 'rb')) # the last command line argument is the WMF path
+    open('/t/test.bmp', 'wb').write(wmf.bitmaps[0]) # NOTE(review): hard-coded output path
+
@@ -982,9 +982,12 @@ class ZipFile:
            zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])

        if fname != zinfo.orig_filename:
-            raise BadZipfile, \
-                      'File name in directory "%s" and header "%s" differ.' % (
-                          zinfo.orig_filename, fname)
+            print ('WARNING: Header (%r) and directory (%r) filenames do not'
+                    ' match inside ZipFile')%(fname, zinfo.orig_filename)
+            print 'Using directory filename %r'%zinfo.orig_filename
+            #raise BadZipfile, \
+            #          'File name in directory "%r" and header "%r" differ.' % (
+            #              zinfo.orig_filename, fname)

        # check for encrypted flag & handle password
        is_encrypted = zinfo.flag_bits & 0x1
@@ -700,10 +700,17 @@ class BasicNewsRecipe(Recipe):
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr:True}):
                del x[attr]
-        for base in list(soup.findAll(['base', 'iframe'])):
+        for base in list(soup.findAll(['base', 'iframe', 'canvas', 'embed',
+            'command', 'datalist', 'video', 'audio'])):
            base.extract()

        ans = self.postprocess_html(soup, first_fetch)
+
+        # Nuke HTML5 tags
+        for x in ans.findAll(['article', 'aside', 'header', 'footer', 'nav',
+            'figcaption', 'figure', 'section']):
+            x.name = 'div'
+
        if job_info:
            url, f, a, feed_len = job_info
            try: