replaced messed up rtf file

ldolse 2010-09-11 22:15:09 +10:00
parent 105d490c01
commit f6de0bef13


@@ -1,390 +1,344 @@
[old file contents removed: ~390 lines of calibre's HTML/CSS conversion preprocessor (CSSPreProcessor, HTMLPreProcessor, line_length and related helpers) that had overwritten this file by mistake; the replacement RTF tokenizer follows]

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'

"""
RTF tokenizer and token parser. v.1.0 (1/17/2010)
Author: Gerendi Sandor Attila

At this point this will tokenize an RTF file and then rebuild it from the tokens.
In the process the Unicode (\uN) tokens are altered so that they are supported by rtf2xml while remaining compliant with the RTF specification.
"""
class tokenDelimitatorStart():
    def __init__(self):
        pass
    def toRTF(self):
        return b'{'
    def __repr__(self):
        return '{'

class tokenDelimitatorEnd():
    def __init__(self):
        pass
    def toRTF(self):
        return b'}'
    def __repr__(self):
        return '}'

class tokenControlWord():
    def __init__(self, name, separator = ''):
        self.name = name
        self.separator = separator
    def toRTF(self):
        return self.name + self.separator
    def __repr__(self):
        return self.name + self.separator
class tokenControlWordWithNumericArgument():
    def __init__(self, name, argument, separator = ''):
        self.name = name
        self.argument = argument
        self.separator = separator
    def toRTF(self):
        return self.name + repr(self.argument) + self.separator
    def __repr__(self):
        return self.name + repr(self.argument) + self.separator

class tokenControlSymbol():
    def __init__(self, name):
        self.name = name
    def toRTF(self):
        return self.name
    def __repr__(self):
        return self.name

class tokenData():
    def __init__(self, data):
        self.data = data
    def toRTF(self):
        return self.data
    def __repr__(self):
        return self.data

class tokenBinN():
    def __init__(self, data, separator = ''):
        self.data = data
        self.separator = separator
    def toRTF(self):
        return "\\bin" + repr(len(self.data)) + self.separator + self.data
    def __repr__(self):
        return "\\bin" + repr(len(self.data)) + self.separator + self.data

class token8bitChar():
    def __init__(self, data):
        self.data = data
    def toRTF(self):
        return "\\'" + self.data
    def __repr__(self):
        return "\\'" + self.data
class tokenUnicode():
    def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        self.eqList = eqList
    def toRTF(self):
        result = '\\u' + repr(self.data) + ' '
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            ucn = len(self.eqList)
            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
        i = 0
        for eq in self.eqList:
            if i >= ucn:
                break
            result = result + eq.toRTF()
            i = i + 1  # count emitted equivalents (was missing, leaving i stuck at 0)
        return result
    def __repr__(self):
        return '\\u' + repr(self.data)
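# Background for the rewrite above: RTF encodes a non-ANSI character as \uN
# followed by fallback text for old readers, and \ucM declares how many
# fallback characters follow each \uN. calibre's rtf2xml does not understand
# the fallbacks, so the parser below strips them; with an empty eqList,
# toRTF() then prepends '\uc0' to announce that no fallback follows.
# Worked example (input invented for illustration):
#   input RTF:  {\uc1\u21487 ?}     the '?' is the one-character fallback
#   output RTF: {\uc1\uc0\u21487 }  fallback dropped, \uc0 prepended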
def isAsciiLetter(value):
    return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))

def isDigit(value):
    return (value >= '0') and (value <= '9')

def isChar(value, char):
    return value == char

def isString(buffer, string):
    return buffer == string
class RtfTokenParser():
    def __init__(self, tokens):
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        i = 0
        newTokens = []
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenControlSymbol):
                if isString(self.tokens[i].name, "\\'"):
                    i = i + 1
                    if not isinstance(self.tokens[i], tokenData):
                        raise Exception('Error: token8bitChar without data.')
                    if len(self.tokens[i].data) < 2:
                        raise Exception('Error: token8bitChar without data.')
                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                    if len(self.tokens[i].data) > 2:
                        newTokens.append(tokenData(self.tokens[i].data[2:]))
                    i = i + 1
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)
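    # Sketch of the normalization process() performs (input invented):
    #   tokens in:  tokenControlSymbol("\\'"), tokenData('e9tude')
    #   tokens out: token8bitChar('e9'), tokenData('tude')
    # i.e. the hex escape \'e9 becomes a single token8bitChar and the rest
    # of the run stays ordinary data.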
    def processUnicode(self):
        i = 0
        newTokens = []
        ucNbStack = [1]
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenDelimitatorStart):
                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenDelimitatorEnd):
                ucNbStack.pop()
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
                if isString(self.tokens[i].name, '\\uc'):
                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
                    newTokens.append(self.tokens[i])
                    i = i + 1
                    continue
                if isString(self.tokens[i].name, '\\u'):
                    x = i
                    j = 0
                    i = i + 1
                    replace = []
                    partialData = None
                    ucn = ucNbStack[len(ucNbStack) - 1]
                    while (i < len(self.tokens)) and (j < ucn):
                        if isinstance(self.tokens[i], tokenDelimitatorStart):
                            break
                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
                            break
                        if isinstance(self.tokens[i], tokenData):
                            if len(self.tokens[i].data) >= ucn - j:
                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
                                if len(self.tokens[i].data) > ucn - j:
                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
                                i = i + 1
                                break
                            else:
                                replace.append(self.tokens[i])
                                j = j + len(self.tokens[i].data)
                                i = i + 1
                                continue
                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
                            replace.append(self.tokens[i])
                            i = i + 1
                            j = j + 1
                            continue
                        raise Exception('Error: incorrect utf replacement.')
                    #calibre rtf2xml does not support utfreplace
                    replace = []

                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
                    if partialData is not None:
                        newTokens.append(partialData)
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def toRTF(self):
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
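# Round-trip sketch (a design property, not a test shipped with this file):
# for a document containing no \uN fallbacks the rebuild is lossless, e.g.
#   data = '{\\rtf1\\ansi Hello}'
#   assert RtfTokenParser(RtfTokenizer(data).tokens).toRTF() == data
# (RtfTokenizer is defined just below.)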
class RtfTokenizer():
    def __init__(self, rtfData):
        self.rtfData = []
        self.tokens = []
        self.rtfData = rtfData
        self.tokenize()

    def tokenize(self):
        i = 0
        lastDataStart = -1
        while i < len(self.rtfData):

            if isChar(self.rtfData[i], '{'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise Exception('Error: Control character found at the end of the document.')

                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

                tokenStart = i
                i = i + 1
                #Control Words
                if isAsciiLetter(self.rtfData[i]):
                    #consume <ASCII Letter Sequence>
                    consumed = False
                    while i < len(self.rtfData):
                        if not isAsciiLetter(self.rtfData[i]):
                            tokenEnd = i
                            consumed = True
                            break
                        i = i + 1

                    if not consumed:
                        raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))

                    #we have numeric argument before delimiter
                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
                        #consume the numeric argument
                        consumed = False
                        l = 0
                        while i < len(self.rtfData):
                            if not isDigit(self.rtfData[i]):
                                consumed = True
                                break
                            l = l + 1
                            i = i + 1
                        if l > 10:
                            raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))

                        if not consumed:
                            raise Exception('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))

                    separator = ''
                    if isChar(self.rtfData[i], ' '):
                        separator = ' '

                    controlWord = self.rtfData[tokenStart: tokenEnd]
                    if tokenEnd < i:
                        value = int(self.rtfData[tokenEnd: i])
                        if isString(controlWord, "\\bin"):
                            i = i + value
                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
                        else:
                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
                    else:
                        self.tokens.append(tokenControlWord(controlWord, separator))
                    #space delimiter, we should discard it
                    if self.rtfData[i] == ' ':
                        i = i + 1

                #Control Symbol
                else:
                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
                    i = i + 1
                continue

            if lastDataStart < 0:
                lastDataStart = i
            i = i + 1
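    # Sketch of the token stream for a tiny document (input invented):
    #   '{\rtf1\ansi Hello}' -> tokenDelimitatorStart,
    #                           tokenControlWordWithNumericArgument('\\rtf', 1),
    #                           tokenControlWord('\\ansi', ' '),
    #                           tokenData('Hello'), tokenDelimitatorEnd
    # The space after \ansi is the delimiter; it is recorded as the token's
    # separator and discarded from the data stream.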
    def toRTF(self):
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print ("Usage %s rtfFileToConvert" % sys.argv[0])
        sys.exit()
    f = open(sys.argv[1], 'rb')
    data = f.read()
    f.close()

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    f = open(sys.argv[1], 'wb')
    f.write(data)
    f.close()