diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py index eac2e99e05..ad37494ff7 100644 --- a/src/calibre/ebooks/pml/input.py +++ b/src/calibre/ebooks/pml/input.py @@ -45,7 +45,7 @@ class PMLInput(InputFormatPlugin): ienc = self.options.input_encoding self.log.debug('Converting PML to HTML...') - hizer = PML_HTMLizer(close_all) + hizer = PML_HTMLizer() html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path) html_stream.write('
\s*
', '', html) return html def start_line(self): @@ -173,140 +192,211 @@ class PML_HTMLizer(object): def end_line(self): end = u'' + div = [] + span = [] + other = [] + for key, val in self.state.items(): if val[0]: if key == 'T': self.state['T'][0] = False - end += self.STATES_TAGS[key][1] + if key in self.DIV_STATES: + div.append(key) + elif key in self.SPAN_STATES: + span.append(key) + else: + other.append(key) + for key in span+div+other: + end += self.STATES_TAGS[key][1] return u'%s' % end - def process_code_simple(self, code): - if code not in self.CODE_STATES.keys(): - return u'' - + def process_code(self, code, stream, pre=''): text = u'' - if self.state[self.CODE_STATES[code]][0]: - text = self.STATES_TAGS[self.CODE_STATES[code]][1] - else: - text = self.STATES_TAGS[self.CODE_STATES[code]][0] - - self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] - - return text - - def process_code_link(self, stream, pre=''): - text = u'' - - href = self.code_value(stream) - if pre: - href = '#%s-%s' % (pre, href) - - if self.state['a'][0]: - text = self.STATES_TAGS['a'][1] - else: - text = self.STATES_TAGS['a'][0] % href - self.state['a'][1] = href - - self.state['a'][0] = not self.state['a'][0] - - return text - - def process_code_div_span(self, code, stream): - if self.close_all: - return self.process_code_div_span_call(code, stream) - else: - return self.process_code_div_span_ind(code, stream) - - def process_code_div_span_ind(self, code, stream): - text = u'' - ds = [] - - code = self.CODE_STATES[code] + code = self.CODE_STATES.get(code, None) + if not code: + return text if code in self.DIV_STATES: - ds = self.DIV_STATES[:] - ss = self.SPAN_STATES[:] + # Ignore multilple T's on the same line. They do not have a closing + # code. They get closed at the end of the line. + if code == 'T' and self.state['T'][0]: + self.code_value(stream) + return text + text = self.process_code_div(code, stream) elif code in self.SPAN_STATES: - ds = self.SPAN_STATES[:] - ss = [] + text = self.process_code_span(code, stream) + elif code in self.BLOCK_STATES: + text = self.process_code_block(code, stream, pre) + else: + text = self.process_code_simple(code) + + self.state[code][0] = not self.state[code][0] + + return text + + def process_code_simple(self, code): + text = u'' + if self.state[code][0]: + text = self.STATES_TAGS[code][1] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + return text + + def process_code_div(self, code, stream): + text = u'' + + # Close code. if self.state[code][0]: # Close all. - for c in ss+ds: + for c in self.SPAN_STATES+self.DIV_STATES: if self.state[c][0]: text += self.STATES_TAGS[c][1] # Reopen the based on state. - del ds[ds.index(code)] - for c in ds+ss: + for c in self.DIV_STATES+self.SPAN_STATES: + if code == c: + continue if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] + # Open code. else: + # Close all spans. + for c in self.SPAN_STATES: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Process the code if code in self.STATES_VALUE_REQ: val = self.code_value(stream) - text = self.STATES_TAGS[code][0] % val + text += self.STATES_TAGS[code][0] % val self.state[code][1] = val else: - text = self.STATES_TAGS[code][0] - - self.state[code][0] = not self.state[code][0] + text += self.STATES_TAGS[code][0] + # Re-open all spans based on state + for c in self.SPAN_STATES: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] return text - def process_code_div_span_call(self, code, stream): + def process_code_span(self, code, stream): text = u'' - divs = self.DIV_STATES[:] - spans = self.SPAN_STATES[:] - - code = self.CODE_STATES[code] + # Close code. if self.state[code][0]: - # Close all divs then spans. - for c in spans+divs: + # Close all spans + for c in self.SPAN_STATES: if self.state[c][0]: text += self.STATES_TAGS[c][1] - # Reopen the based on state. Open divs then spans - if code in self.DIV_STATES: - del divs[divs.index(code)] - if code in self.SPAN_STATES: - del spans[spans.index(code)] - for c in divs+spans: + # Re-open the spans based on state except for code which will be + # left closed. + for c in self.SPAN_STATES: + if code == c: + continue if self.state[c][0]: if c in self.STATES_VALUE_REQ: - text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + text += self.STATES_TAGS[code][0] % self.state[c][1] else: text += self.STATES_TAGS[c][0] + # Open code. else: if code in self.STATES_VALUE_REQ: val = self.code_value(stream) - text = self.STATES_TAGS[code][0] % val + text += self.STATES_TAGS[code][0] % val self.state[code][1] = val else: - text = self.STATES_TAGS[code][0] + text += self.STATES_TAGS[code][0] - self.state[code][0] = not self.state[code][0] + return text + + def process_code_block(self, code, stream, pre=''): + text = u'' + + # Close all spans + for c in self.SPAN_STATES: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + + # Process the code + if self.state[code][0]: + # Close tag + text += self.STATES_TAGS[code][1] + else: + # Open tag + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + if pre: + val = '#%s-%s' % (pre, val) + text += self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text += self.STATES_TAGS[code][0] + + # Re-open all spans if code was a div based on state + for c in self.SPAN_STATES: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[code][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] return text def code_value(self, stream): value = u'' - open = False + # state 0 is before = + # state 1 is before the first " + # state 2 is before the second " + # state 3 is after the second " + state = 0 + loc = stream.tell() c = stream.read(1) while c != '': - if open and c != '"': - value += c - if c == '"': - if not open: - open = True - else: + if state == 0: + if c == '=': + state = 1 + elif c != ' ': + # A code that requires an argument should have = after the + # code but sometimes has spaces. If it has anything other + # than a space or = after the code then we can assume the + # markup is invalid. We will stop looking for the value + # and continue to hopefully not lose any data. break + elif state == 1: + if c == '"': + state = 2 + elif c != ' ': + # " should always follow = but we will allow for blank + # space after the =. + break + elif state == 2: + if c == '"': + state = 3 + break + else: + value += c c = stream.read(1) + if state != 3: + # Unable to complete the sequence to reterieve the value. Reset + # the stream to the location it started. + stream.seek(loc) + value = u'' + return value.strip() def parse_pml(self, pml, file_name=''): @@ -323,11 +413,12 @@ class PML_HTMLizer(object): for line in pml.splitlines(): if not line: continue + parsed = [] empty = True # Must use StringIO, cStringIO does not support unicode - line = StringIO.StringIO(self.prepare_line(line)) + line = StringIO.StringIO(line) parsed.append(self.start_line()) c = line.read(1) @@ -337,20 +428,20 @@ class PML_HTMLizer(object): if c == '\\': c = line.read(1) - if c == 'x': - text = self.process_code_simple(c) - elif c in 'XS': + if c in 'xqcrtTiIuobBlk': + text = self.process_code(c, line) + elif c in 'FSX': l = line.read(1) - if '%s%s' % (c, l) == 'Sd': - text = self.process_code_link(line, 'fns') + if '%s%s' % (c, l) == 'Fn': + text = self.process_code('Fn', line, 'fns') + elif '%s%s' % (c, l) == 'FN': + text = self.process_code('FN', line) elif '%s%s' % (c, l) == 'SB': - text = self.process_code_div_span('SB', line) + text = self.process_code('SB', line) + elif '%s%s' % (c, l) == 'Sd': + text = self.process_code('Sd', line, 'fns') else: - text = self.process_code_simple('%s%s' % (c, l)) - elif c == 'q': - text = self.process_code_link(line) - elif c in 'crtTiIuobBl': - text = self.process_code_div_span(c, line) + text = self.process_code('%s%s' % (c, l), line) elif c == 'm': empty = False src = self.code_value(line) @@ -369,12 +460,6 @@ class PML_HTMLizer(object): text = '' % id elif c == 'n': pass - elif c == 'F': - l = line.read(1) - if '%s%s' % (c, l) == 'Fn': - text = self.process_code_link(line, 'fns') - elif '%s%s' % (c, l) == 'FN': - text = self.process_code_div_span('FN', line) elif c == 'w': empty = False text = '