diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 7707325131..874b39223a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -50,7 +50,7 @@ class PML_HTMLizer(object): ] STATES_TAGS = { - 'h1': ('

', '

'), + 'h1': ('

', '

'), 'h2': ('

', '

'), 'h3': ('

', '

'), 'h4': ('

', '

'), @@ -108,6 +108,8 @@ class PML_HTMLizer(object): 'h4', 'h5', 'h6', + 'sb', + 'sp', ] DIV_STATES = [ @@ -135,6 +137,13 @@ class PML_HTMLizer(object): def prepare_pml(self, pml): # Remove comments pml = re.sub(r'(?mus)\\v(?P.*?)\\v', '', pml) + + # Remove extra white spaces. + pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml) + pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml) + pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml) + pml = re.sub(r'(?mus)^[ ]*$', '', pml) + # Footnotes and Sidebars pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) @@ -150,14 +159,6 @@ class PML_HTMLizer(object): return pml - def prepare_line(self, line): - line = re.sub(r'[ ]{2,}', ' ', line) - line = re.sub(r'^[ ]*(?=.)', '', line) - line = re.sub(r'(?<=.)[ ]*$', '', line) - line = re.sub(r'^[ ]*$', '', line) - - return line - def cleanup_html(self, html): old = html html = self.cleanup_html_remove_redundant(html) @@ -217,7 +218,9 @@ class PML_HTMLizer(object): text = u'' ds = [] - code = self.CODE_STATES[code] + code = self.CODE_STATES.get(code, None) + if not code: + return text if code in self.DIV_STATES: ds = self.DIV_STATES[:] @@ -278,7 +281,9 @@ class PML_HTMLizer(object): def process_code_block(self, code, stream, pre=''): text = u'' - code = self.CODE_STATES[code] + code = self.CODE_STATES.get(code, None) + if not code: + return text # Close all spans for c in self.SPAN_STATES: @@ -312,27 +317,12 @@ class PML_HTMLizer(object): return text - - def process_code_simple(self, code): - if code not in self.CODE_STATES.keys(): - return u'' - - text = u'' - - if self.state[self.CODE_STATES[code]][0]: - text = self.STATES_TAGS[self.CODE_STATES[code]][1] - else: - text = self.STATES_TAGS[self.CODE_STATES[code]][0] - - 
self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] - - return text - def code_value(self, stream): value = u'' # state 0 is before = # state 1 is before the first " # state 2 is before the second " + # state 3 is after the second " state = 0 loc = stream.tell() @@ -341,6 +331,13 @@ class PML_HTMLizer(object): if state == 0: if c == '=': state = 1 + elif c != ' ': + # A code that requires an argument should have = after the + # code but sometimes has spaces. If it has anything other + # than a space or = after the code then we can assume the + # markup is invalid. We will stop looking for the value + # and continue to hopefully not lose any data. + break; elif state == 1: if c == '"': state = 2 @@ -353,6 +350,8 @@ class PML_HTMLizer(object): c = stream.read(1) if state != 3: + # Unable to complete the sequence to retrieve the value. Reset + # the stream to the location it started. stream.seek(loc) value = u'' @@ -370,13 +369,12 @@ class PML_HTMLizer(object): self.state[s] = [False, '']; for line in pml.splitlines(): - parsed = [] - empty = True - - line = self.prepare_line(line) if not line: continue + parsed = [] + empty = True + # Must use StringIO, cStringIO does not support unicode line = StringIO.StringIO(line) parsed.append(self.start_line()) @@ -389,15 +387,15 @@ class PML_HTMLizer(object): c = line.read(1) if c == 'x': - text = self.process_code_simple(c) + text = self.process_code_block(c, line) elif c in 'XS': l = line.read(1) - if '%s%s' % (c, l) == 'Sd': - text = self.process_code_block('Sd', line, 'fns') - elif '%s%s' % (c, l) == 'SB': + if '%s%s' % (c, l) == 'SB': text = self.process_code('SB', line) + elif '%s%s' % (c, l) == 'Sd': + text = self.process_code_block('Sd', line, 'fns') else: - text = self.process_code_simple('%s%s' % (c, l)) + text = self.process_code_block('%s%s' % (c, l), line) elif c == 'q': text = self.process_code_block(c, line) elif c in 'crtTiIuobBlk':