diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py
index ee45da697f..a3076651fd 100644
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@@ -1,390 +1,344 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
 
 __license__   = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__copyright__ = '2010, Gerendi Sandor Attila'
 __docformat__ = 'restructuredtext en'
 
-import functools, re
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
+
-from calibre import entity_to_unicode
+At this point this will tokenize an RTF file and then rebuild it from the tokens.
+In the process the UTF8 tokens are altered to be supported by rtf2xml while remaining RTF specification compliant.
+"""
 
-XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
-SVG_NS       = 'http://www.w3.org/2000/svg'
-XLINK_NS     = 'http://www.w3.org/1999/xlink'
+class tokenDelimitatorStart():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'{'
+    def __repr__(self):
+        return '{'
 
-convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions = {
-            u'<' : '&lt;',
-            u'>' : '&gt;',
-            u"'" : '&apos;',
-            u'"' : '&quot;',
-            u'&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+class tokenDelimitatorEnd():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'}'
+    def __repr__(self):
+        return '}'
 
-LIGATURES = {
-#    u'\u00c6': u'AE',
-#    u'\u00e6': u'ae',
-#    u'\u0152': u'OE',
-#    u'\u0153': u'oe',
-#    u'\u0132': u'IJ',
-#    u'\u0133': u'ij',
-#    u'\u1D6B': u'ue',
-    u'\uFB00': u'ff',
-    u'\uFB01': u'fi',
-    u'\uFB02': u'fl',
-    u'\uFB03': u'ffi',
-    u'\uFB04': u'ffl',
-    u'\uFB05': u'ft',
-    u'\uFB06': u'st',
-    }
+class tokenControlWord():
+    def __init__(self, name, separator = ''):
+        self.name = name
+        self.separator = separator
+    def toRTF(self):
+        return self.name + self.separator
+    def __repr__(self):
+        return self.name + self.separator
 
-_ligpat = re.compile(u'|'.join(LIGATURES))
+class tokenControlWordWithNumericArgument():
+    def __init__(self, name, argument, separator = ''):
+        self.name = name
+        self.argument = argument
+        self.separator = separator
+    def toRTF(self):
+        return self.name + repr(self.argument) + self.separator
+    def __repr__(self):
+        return self.name + repr(self.argument) + self.separator
 
-def sanitize_head(match):
-    x = match.group(1)
-    x = _span_pat.sub('', x)
-    return '<head>\n%s\n</head>' % x
+class tokenControlSymbol():
+    def __init__(self, name):
+        self.name = name
+    def toRTF(self):
+        return self.name
+    def __repr__(self):
+        return self.name
 
-def chap_head(match):
-    chap = match.group('chap')
-    title = match.group('title')
-    if not title:
-        return '<h1>'+chap+'</h1><br/>\n'
-    else:
-        return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
+class tokenData():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return self.data
+    def __repr__(self):
+        return self.data
 
-def wrap_lines(match):
-    ital = match.group('ital')
-    if not ital:
-        return ' '
-    else:
-        return ital+' '
+class tokenBinN():
+    def __init__(self, data, separator = ''):
+        self.data = data
+        self.separator = separator
+    def toRTF(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+    def __repr__(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return "\\'" + self.data
+    def __repr__(self):
+        return "\\'" + self.data
+
+class tokenUnicode():
+    def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+        self.data = data
+        self.separator = separator
+        self.current_ucn = current_ucn
+        self.eqList = eqList
+    def toRTF(self):
+        result = '\\u' + repr(self.data) + ' '
+        ucn = self.current_ucn
+        if len(self.eqList) < ucn:
+            ucn = len(self.eqList)
+            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+        i = 0
+        for eq in self.eqList:
+            if i >= ucn:
+                break
+            i = i + 1
+            result = result + eq.toRTF()
+        return result
+    def __repr__(self):
+        return '\\u' + repr(self.data)
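A minimal sketch (not part of the patch) of how the token classes above serialize, assuming this file is importable as calibre.ebooks.rtf.preprocess, which is where the diff places it:

    # One \u keyword with a single ANSI fallback token and \uc == 1:
    # toRTF() emits the keyword followed by the fallback, no \uc adjustment needed.
    from calibre.ebooks.rtf.preprocess import tokenUnicode, tokenData

    tok = tokenUnicode(21563, separator=' ', current_ucn=1, eqList=[tokenData('?')])
    print(tok.toRTF())    # -> \u21563 ?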
 
-def line_length(format, raw, percent):
-    '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to larged and does not include duplicates. 0.5 is the
-    median value.
-    '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p>).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    lines = linere.findall(raw)
-    print "percent is " + str(percent)
+def isAsciiLetter(value):
+    return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
 
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
+def isDigit(value):
+    return (value >= '0') and (value <= '9')
 
-    if not lengths:
-        return 0
+def isChar(value, char):
+    return value == char
 
-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
-
-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
-
-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
-
-    index = int(len(lengths) * percent) - 1
-
-    return lengths[index]
+def isString(buffer, string):
+    return buffer == string
 
-class CSSPreProcessor(object):
+class RtfTokenParser():
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.process()
+        self.processUnicode()
 
-    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
+    def process(self):
+        i = 0
+        newTokens = []
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenControlSymbol):
+                if isString(self.tokens[i].name, "\\'"):
+                    i = i + 1
+                    if not isinstance(self.tokens[i], tokenData):
+                        raise Exception('Error: token8bitChar without data.')
+                    if len(self.tokens[i].data) < 2:
+                        raise Exception('Error: token8bitChar without data.')
+                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+                    if len(self.tokens[i].data) > 2:
+                        newTokens.append(tokenData(self.tokens[i].data[2:]))
+                    i = i + 1
+                    continue
 
-    def __call__(self, data, add_namespace=False):
-        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
-        data = self.PAGE_PAT.sub('', data)
-        if not add_namespace:
-            return data
-        ans, namespaced = [], False
-        for line in data.splitlines():
-            ll = line.lstrip()
-            if not (namespaced or ll.startswith('@import') or
-                        ll.startswith('@charset')):
-                ans.append(XHTML_CSS_NAMESPACE.strip())
-                namespaced = True
-            ans.append(line)
+            newTokens.append(self.tokens[i])
+            i = i + 1
 
-        return u'\n'.join(ans)
+        self.tokens = list(newTokens)
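An illustrative round trip (not part of the patch) for the hex-escape merging that process() performs; RtfTokenizer is defined further down in this same file, and the input string is made up:

    # The tokenizer emits tokenControlSymbol("\'") followed by tokenData('e9!');
    # process() folds them into token8bitChar('e9') plus tokenData('!'),
    # and toRTF() reassembles the identical source text.
    from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser

    parsed = RtfTokenParser(RtfTokenizer(r"{\rtf1 caf\'e9!}").tokens)
    print(parsed.toRTF())    # -> {\rtf1 caf\'e9!}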
 
-class HTMLPreProcessor(object):
+    def processUnicode(self):
+        i = 0
+        newTokens = []
+        ucNbStack = [1]
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenDelimitatorStart):
+                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                ucNbStack.pop()
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+                if isString(self.tokens[i].name, '\\uc'):
+                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+                    newTokens.append(self.tokens[i])
+                    i = i + 1
+                    continue
+                if isString(self.tokens[i].name, '\\u'):
+                    x = i
+                    j = 0
+                    i = i + 1
+                    replace = []
+                    partialData = None
+                    ucn = ucNbStack[len(ucNbStack) - 1]
+                    while (i < len(self.tokens)) and (j < ucn):
+                        if isinstance(self.tokens[i], tokenDelimitatorStart):
+                            break
+                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                            break
+                        if isinstance(self.tokens[i], tokenData):
+                            if len(self.tokens[i].data) >= ucn - j:
+                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+                                if len(self.tokens[i].data) > ucn - j:
+                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
+                                i = i + 1
+                                break
+                            else:
+                                replace.append(self.tokens[i])
+                                j = j + len(self.tokens[i].data)
+                                i = i + 1
+                                continue
+                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+                            replace.append(self.tokens[i])
+                            i = i + 1
+                            j = j + 1
+                            continue
+                        raise Exception('Error: incorrect utf replacement.')
 
-    PREPROCESS = [
-                  # Some idiotic HTML generators (Frontpage I'm looking at you)
-                  # Put all sorts of crap into <head>. This messes up lxml
-                  (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-                   sanitize_head),
-                  # Convert all entities, since lxml doesn't handle them well
-                  (re.compile(r'&(\S+?);'), convert_entities),
-                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
-                   lambda match: ''),
-                  ]
+                    #calibre rtf2xml does not support utfreplace
+                    replace = []
 
-    # Fix pdftohtml markup
-    PDFTOHTML  = [
-            # Fix umlauts
-            # ¨
-            (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
-            (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
+                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
+                    if partialData != None:
+                        newTokens.append(partialData)
+                    continue
 
-            # Fix accents
-            # `
-            (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
-            (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
-            (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
-            (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
-            (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
-            (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
-            (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
-            (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
-            (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
-            (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+            newTokens.append(self.tokens[i])
+            i = i + 1
 
-            # ´
-            (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
-            (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
-            (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
-            (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
-            (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
-            (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
-            (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
-            (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
-            (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
-            (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
-            (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
-            (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
-            (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
-            (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
-            (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
-            (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
-            (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
-            (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
+        self.tokens = list(newTokens)
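An illustrative sketch (not part of the patch) of the \uc bookkeeping above. \ucN declares how many fallback tokens follow each \u keyword; processUnicode() gathers them, then deliberately drops them (the replace = [] above) because calibre's rtf2xml does not support \u fallbacks, and tokenUnicode.toRTF() prepends \uc0 so downstream readers skip no fallback bytes. The sample input is made up:

    from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser

    src = r"{\rtf1{\uc1\u21563 ?}}"    # '?' is the declared one-token fallback
    parsed = RtfTokenParser(RtfTokenizer(src).tokens)
    print(parsed.toRTF())              # -> {\rtf1{\uc1\uc0\u21563 }}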
 
-            # ˆ
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
-            (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
 
-            # ¸
-            (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
-            (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
 
-            # ˛
-            (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
-            (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
-            (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
-            (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
-
-            # ˙
-            (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
-            (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
-            # If pdf printed from a browser then the header/footer has a reliable pattern
-            (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+class RtfTokenizer():
+    def __init__(self, rtfData):
+        self.rtfData = []
+        self.tokens = []
+        self.rtfData = rtfData
+        self.tokenize()
 
+    def tokenize(self):
+        i = 0
+        lastDataStart = -1
+        while i < len(self.rtfData):
 
-            # Center separator lines
-            (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+            if isChar(self.rtfData[i], '{'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorStart())
+                i = i + 1
+                continue
 
-            # Remove page links
-            (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
-            # Remove <hr> tags
-            (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
-            # Replace <br><br> with <p><p>
-            # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p><p>'),
+            if isChar(self.rtfData[i], '}'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorEnd())
+                i = i + 1
+                continue
 
-            # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-            (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+            if isChar(self.rtfData[i], '\\'):
+                if i + 1 >= len(self.rtfData):
+                    raise Exception('Error: Control character found at the end of the document.')
 
-            # Remove gray background
-            (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
 
-            # Detect Chapters to match default XPATH in GUI
-            (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(<br[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+                tokenStart = i
+                i = i + 1
 
-            # Have paragraphs show better
-            (re.compile(r'<br.*?>'), lambda match : '<p>'),
-            # Clean up spaces
-            (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-            # Add space before and after italics
-            (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
-            (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-
-    ]
+
-    # Fix Book Designer markup
-    BOOK_DESIGNER = [
-            # HR
-            (re.compile('<hr>', re.IGNORECASE),
-             lambda match : '<span style="page-break-after:always"> </span>'),
-            # Create header tags
-            (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
-    ]
-
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
-        self.extra_opts = extra_opts
+                #Control Words
+                if isAsciiLetter(self.rtfData[i]):
+                    #consume <ASCII Letter Sequence>
+                    consumed = False
+                    while i < len(self.rtfData):
+                        if not isAsciiLetter(self.rtfData[i]):
+                            tokenEnd = i
+                            consumed = True
+                            break
+                        i = i + 1
 
-    def is_baen(self, src):
-        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
-                          re.IGNORECASE).search(src) is not None
+                    if not consumed:
+                        raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))
 
-    def is_book_designer(self, raw):
-        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+                    #we have numeric argument before delimiter
+                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
+                        #consume the numeric argument
+                        consumed = False
+                        l = 0
+                        while i < len(self.rtfData):
+                            if not isDigit(self.rtfData[i]):
+                                consumed = True
+                                break
+                            l = l + 1
+                            i = i + 1
+                        if l > 10 :
+                            raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))
 
-    def is_pdftohtml(self, src):
-        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+                        if not consumed:
+                            raise Exception('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))
 
-    def __call__(self, html, remove_special_chars=None,
-            get_preprocess_html=False):
-        if remove_special_chars is not None:
-            html = remove_special_chars.sub('', html)
-        html = html.replace('\0', '')
-        is_pdftohtml = self.is_pdftohtml(html)
-        if self.is_baen(html):
-            rules = []
-        elif self.is_book_designer(html):
-            rules = self.BOOK_DESIGNER
-        elif is_pdftohtml:
-            rules = self.PDFTOHTML
-        else:
-            rules = []
+                    separator = ''
+                    if isChar(self.rtfData[i], ' '):
+                        separator = ' '
 
-        start_rules = []
-        if is_pdftohtml:
-            # Remove non breaking spaces
-            start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
+                    controlWord = self.rtfData[tokenStart: tokenEnd]
+                    if tokenEnd < i:
+                        value = int(self.rtfData[tokenEnd: i])
+                        if isString(controlWord, "\\bin"):
+                            i = i + value
+                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
+                        else:
+                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+                    else:
+                        self.tokens.append(tokenControlWord(controlWord, separator))
+                    #space delimiter, we should discard it
+                    if self.rtfData[i] == ' ':
+                        i = i + 1
 
-        if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+                #Control Symbol
+                else:
+                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
+                    i = i + 1
+                continue
 
-        end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
+            if lastDataStart < 0:
+                lastDataStart = i
+            i = i + 1
 
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
 
-        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append(
-                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
-                )
-        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
-            if length:
-                print "The pdf line length returned is " + str(length)
-                end_rules.append(
-                    # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
-                )
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print ("Usage %prog rtfFileToConvert")
+        sys.exit()
+    f = open(sys.argv[1], 'rb')
+    data = f.read()
+    f.close()
 
-        for rule in self.PREPROCESS + start_rules:
-            html = rule[0].sub(rule[1], html)
+    tokenizer = RtfTokenizer(data)
+    parsedTokens = RtfTokenParser(tokenizer.tokens)
 
-        if get_preprocess_html:
-            return html
+    data = parsedTokens.toRTF()
-        def dump(raw, where):
-            import os
-            dp = getattr(self.extra_opts, 'debug_pipeline', None)
-            if dp and os.path.exists(dp):
-                odir = os.path.join(dp, 'input')
-                if os.path.exists(odir):
-                    odir = os.path.join(odir, where)
-                    if not os.path.exists(odir):
-                        os.makedirs(odir)
-                    name, i = None, 0
-                    while not name or os.path.exists(os.path.join(odir, name)):
-                        i += 1
-                        name = '%04d.html'%i
-                    with open(os.path.join(odir, name), 'wb') as f:
-                        f.write(raw.encode('utf-8'))
+    f = open(sys.argv[1], 'w')
+    f.write(data)
+    f.close()
 
-        #dump(html, 'pre-preprocess')
-
-        for rule in rules + end_rules:
-            html = rule[0].sub(rule[1], html)
-
-        #dump(html, 'post-preprocess')
-
-        # Handle broken XHTML w/ SVG (ugh)
-        if 'svg:' in html and SVG_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
-        if 'xlink:' in html and XLINK_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
-
-        html = XMLDECL_RE.sub('', html)
-
-        if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
-
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(html)
-
-        return html
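The __main__ block above rewrites the input file in place. For reference, a sketch (not part of the patch) of the same round trip as a reusable function; 'book.rtf' is a placeholder path:

    from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser

    def normalize_rtf(data):
        # tokenize, normalize the \u/\uc runs, and serialize back to RTF
        return RtfTokenParser(RtfTokenizer(data).tokens).toRTF()

    with open('book.rtf', 'rb') as f:
        cleaned = normalize_rtf(f.read())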