diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index b0d9ce1ec7..05cf488617 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -9,79 +9,390 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import re
+import StringIO
 
 from calibre import my_unichr, prepare_string_for_xml
 from calibre.ebooks.pdb.ereader import image_name
 
-PML_HTML_RULES = [
-    # NOP-process all \x escapes, turning \\ into \ This keeps the regex
-    # parsing simple while making sure that we don't try to honor \\x as \x
-    # (and also makes sure we DO honor \\\x as \ followed by \x).
-    (re.compile(r'\\(.)'), lambda match: '\\' if match.group(1) == '\\' else '\\' + match.group(1)),
+class PML_HTMLizer(object):
-    (re.compile(r'\\p'), lambda match: '<br /><br style="page-break-after: always;" />'),
-    (re.compile(r'\\x(?P<text>.*?)\\x', re.DOTALL), lambda match: '<h1 style="page-break-before: always;">%s</h1>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]', re.DOTALL), lambda match: '<h%i>%s</h%i>' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''),
-    (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry
-    (re.compile(r'\\c(?P<text>.*?)\\c', re.DOTALL), lambda match: '<div style="text-align: center;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\r(?P<text>.*?)\\r', re.DOTALL), lambda match: '<div style="text-align: right;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\i(?P<text>.*?)\\i', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\u(?P<text>.*?)\\u', re.DOTALL), lambda match: '<u>%s</u>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\o(?P<text>.*?)\\o', re.DOTALL), lambda match: '<del>%s</del>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\v(?P<text>.*?)\\v', re.DOTALL), lambda match: '<!-- %s -->' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\t(?P<text>.*?)\\t', re.DOTALL), lambda match: '<div style="margin-left: 5%%;">%s</div>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\T="(?P<val>\d+)%*"(?P<text>.*?)$', re.MULTILINE), lambda match: r'<div style="margin-left: %s%%;">%s</div>' % (match.group('val'), match.group('text')) if match.group('text') else ''),
-    (re.compile(r'\\w="(?P<val>\d+)%"'), lambda match: '<hr width="%s%%" />' % match.group('val')),
-    (re.compile(r'\\n'), lambda match: ''),
-    (re.compile(r'\\s(?P<text>.*?)\\s', re.DOTALL), lambda match: '<span style="font-size: 75%%;">%s</span>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\b(?P<text>.*?)\\b', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead.
-    (re.compile(r'\\l(?P<text>.*?)\\l', re.DOTALL), lambda match: '<span style="font-size: 125%%;">%s</span>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\B(?P<text>.*?)\\B', re.DOTALL), lambda match: '<b>%s</b>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\Sp(?P<text>.*?)\\Sp', re.DOTALL), lambda match: '<sup>%s</sup>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\Sb(?P<text>.*?)\\Sb', re.DOTALL), lambda match: '<sub>%s</sub>' % match.group('text') if match.group('text') else ''),
-    (re.compile(r'\\k(?P<text>.*?)\\k', re.DOTALL), lambda match: '<span style="font-size: 75%%;">%s</span>' % match.group('text').upper() if match.group('text') else ''),
-    (re.compile(r'\\a(?P<num>\d{3})'), lambda match: '&#%s;' % match.group('num')),
-    (re.compile(r'\\U(?P<num>[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))),
-    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
-    (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.*?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
-    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
-    (re.compile(r'\\-'), lambda match: ''),
-    (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.*?)\\Fn'), lambda match: '<a href="#%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
-    (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.*?)\\Sd'), lambda match: '<a href="#%s">%s</a>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
-    # Just italicize index items as that is how the eReader software renders them.
-    (re.compile(r'\\I(?P<text>.*?)\\I', re.DOTALL), lambda match: '<i>%s</i>' % match.group('text') if match.group('text') else ''),
+    STATES = [
+        'i',
+        'u',
+        'd',
+        'b',
+        'sp',
+        'sb',
+        'h1',
+        'h2',
+        'h3',
+        'h4',
+        'h5',
+        'h6',
+        'a',
+        'c',
+        'r',
+        't',
+        's',
+        'l',
+        'T',
+        'Fn',
+        'Sd',
+        'FS'
+    ]
-    # Sidebar and Footnotes
-    (re.compile(r'<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', re.DOTALL), lambda match: '<div id="%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
-    (re.compile(r'<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', re.DOTALL), lambda match: '<div id="%s">%s</div>' % (match.group('target'), match.group('text')) if match.group('text') else ''),
+    STATES_VALUE_REQ = [
+        'a',
+        'T',
+        'FS'
+    ]
-    # eReader files are one paragraph per line.
-    # This forces the lines to wrap properly.
-    (re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
-    # Remove empty <p></p>'s.
-    (re.compile('<p>[ ]*</p>'), lambda match: ''),
-    # Ensure empty lines carry over.
-    (re.compile('(\r\n|\n|\r){3}'), lambda match: '<br />'),
+    STATES_TAGS = {
+        'h1': ('<h1 style="page-break-before: always;">', '</h1>'),
+        'h2': ('<h2>', '</h2>'),
+        'h3': ('<h3>', '</h3>'),
+        'h4': ('<h4>', '</h4>'),
+        'h5': ('<h5>', '</h5>'),
+        'h6': ('<h6>', '</h6>'),
+        'sp': ('<sup>', '</sup>'),
+        'sb': ('<sub>', '</sub>'),
+        'a': ('<a href="%s">', '</a>'),
+        'c': ('<div style="text-align: center;">', '</div>'),
+        'r': ('<div style="text-align: right;">', '</div>'),
+        't': ('<div style="margin-left: 5%;">', '</div>'),
+        'T': ('<div style="margin-left: %s;">', '</div>'),
+        'i': ('<span style="font-style: italic;">', '</span>'),
+        'u': ('<span style="text-decoration: underline;">', '</span>'),
+        'd': ('<span style="text-decoration: line-through;">', '</span>'),
+        'b': ('<span style="font-weight: bold;">', '</span>'),
+        's': ('<span style="font-size: 75%;">', '</span>'),
+        'l': ('<span style="font-size: 125%;">', '</span>'),
+        'FS': ('<div id="%s">', '</div>'),
+    }
-    # Try to fix some of the misordering of character-attribute tags.
-    (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P<close>(</(i|u|b|del|sup|sub)>)+)'), lambda match: match.group('close') + match.group('ch')),
-    (re.compile(r'(?P<ch>(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P<blk>(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')),
+    CODE_STATES = {
+        'q': 'a',
+        'x': 'h1',
+        'X0': 'h2',
+        'X1': 'h3',
+        'X2': 'h4',
+        'X3': 'h5',
+        'X4': 'h6',
+        'Sp': 'sp',
+        'Sb': 'sb',
+        'c': 'c',
+        'r': 'r',
+        't': 't',
+        'T': 'T',
+        'i': 'i',
+        'I': 'i',
+        'u': 'u',
+        'o': 'd',
+        'b': 'b',
+        'B': 'b',
+        's': 's',
+        'l': 'l',
+        'Fn': 'a',
+        'Sd': 'a',
+        'FN': 'FS',
+        'SB': 'FS',
+    }
-    # Remove unmatched plm codes.
-    (re.compile(r'\\X[0-4]'), lambda match: ''),
-    (re.compile(r'\\T="\d+%*"'), lambda match: ''),
-    (re.compile(r'\\Sp'), lambda match: ''),
-    (re.compile(r'\\Sb'), lambda match: ''),
-    # Remove invalid single item pml codes.
-    (re.compile(r'\\.'), lambda match: ''),
-]
+    DIV_STATES = [
+        'c',
+        'r',
+        't',
+        'T',
+        'FS',
+    ]
-def pml_to_html(pml):
-    html = prepare_string_for_xml(pml)
-    for rule in PML_HTML_RULES:
-        html = rule[0].sub(rule[1], html)
+    SPAN_STATES = [
+        's',
+        'l',
+        'i',
+        'u',
+        'd',
+        'b',
+    ]
-    return html
+    def __init__(self, close_all):
+        self.close_all = close_all
+        self.state = {}
+
+    def prepare_pml(self, pml):
+        # Remove comments
+        pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)
+        # Footnotes and Sidebars
+        pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
+        pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
+
+        pml = prepare_string_for_xml(pml)
+
+        pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml)
+        pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)
+
+        return pml
+
+    def prepare_line(self, line):
+        line = re.sub(r'[ ]{2,}', ' ', line)
+        line = re.sub(r'^[ ]*(?=.)', '', line)
+        line = re.sub(r'(?<=.)[ ]*$', '', line)
+        line = re.sub(r'^[ ]*$', '', line)
+
+        return line
+
+    def start_line(self):
+        start = u''
+
+        for key, val in self.state.items():
+            if val[0]:
+                if key not in self.STATES_VALUE_REQ:
+                    start += self.STATES_TAGS[key][0]
+                else:
+                    start += self.STATES_TAGS[key][0] % val[1]
+
+        return u'<p>%s' % start
+
+    def end_line(self):
+        end = u''
+
+        for key, val in self.state.items():
+            if val[0]:
+                if key == 'T':
+                    self.state['T'][0] = False
+                end += self.STATES_TAGS[key][1]
+
+        return u'%s</p>' % end
+
+    def process_code_simple(self, code):
+        if code not in self.CODE_STATES.keys():
+            return u''
+
+        text = u''
+
+        if self.state[self.CODE_STATES[code]][0]:
+            text = self.STATES_TAGS[self.CODE_STATES[code]][1]
+        else:
+            text = self.STATES_TAGS[self.CODE_STATES[code]][0]
+
+        self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0]
+
+        return text
+
+    def process_code_link(self, stream, pre=''):
+        text = u''
+
+        href = self.code_value(stream)
+        if pre:
+            href = '#%s-%s' % (pre, href)
+
+        if self.state['a'][0]:
+            text = self.STATES_TAGS['a'][1]
+        else:
+            text = self.STATES_TAGS['a'][0] % href
+            self.state['a'][1] = href
+
+        self.state['a'][0] = not self.state['a'][0]
+
+        return text
+
+    def process_code_div_span(self, code, stream):
+        if self.close_all:
+            return self.process_code_div_span_call(code, stream)
+        else:
+            return self.process_code_div_span_ind(code, stream)
+
+    def process_code_div_span_ind(self, code, stream):
+        text = u''
+        ds = []
+
+        code = self.CODE_STATES[code]
+
+        if code in self.DIV_STATES:
+            ds = self.DIV_STATES[:]
+        elif code in self.SPAN_STATES:
+            ds = self.SPAN_STATES[:]
+
+        if self.state[code][0]:
+            # Close all.
+            for c in ds:
+                if self.state[c][0]:
+                    text += self.STATES_TAGS[c][1]
+            # Reopen the remaining ones based on state.
+            del ds[ds.index(code)]
+            for c in ds:
+                if self.state[c][0]:
+                    if c in self.STATES_VALUE_REQ:
+                        text += self.STATES_TAGS[c][0] % self.state[c][1]
+                    else:
+                        text += self.STATES_TAGS[c][0]
+        else:
+            if code in self.STATES_VALUE_REQ:
+                val = self.code_value(stream)
+                text = self.STATES_TAGS[code][0] % val
+                self.state[code][1] = val
+            else:
+                text = self.STATES_TAGS[code][0]
+
+        self.state[code][0] = not self.state[code][0]
+
+        return text
+
+    def process_code_div_span_call(self, code, stream):
+        text = u''
+        divs = self.DIV_STATES[:]
+        spans = self.SPAN_STATES[:]
+
+        code = self.CODE_STATES[code]
+
+        if self.state[code][0]:
+            # Close all divs then spans.
+            for c in spans+divs:
+                if self.state[c][0]:
+                    text += self.STATES_TAGS[c][1]
+            # Reopen the remaining ones based on state. Open divs then spans.
+            if code in self.DIV_STATES:
+                del divs[divs.index(code)]
+            if code in self.SPAN_STATES:
+                del spans[spans.index(code)]
+            for c in divs+spans:
+                if self.state[c][0]:
+                    if c in self.STATES_VALUE_REQ:
+                        text += self.STATES_TAGS[c][0] % self.state[c][1]
+                    else:
+                        text += self.STATES_TAGS[c][0]
+        else:
+            if code in self.STATES_VALUE_REQ:
+                val = self.code_value(stream)
+                text = self.STATES_TAGS[code][0] % val
+                self.state[code][1] = val
+            else:
+                text = self.STATES_TAGS[code][0]
+
+        self.state[code][0] = not self.state[code][0]
+
+        return text
+
+    def code_value(self, stream):
+        value = u''
+        open = False
+
+        c = stream.read(1)
+        while c != '':
+            if open and c != '"':
+                value += c
+            if c == '"':
+                if not open:
+                    open = True
+                else:
+                    break
+            c = stream.read(1)
+
+        return value.strip()
+
+    def parse_pml(self, pml):
+        pml = self.prepare_pml(pml)
+        output = []
+
+        self.state = {}
+        for s in self.STATES:
+            self.state[s] = [False, '']
+
+        for line in pml.splitlines():
+            if not line:
+                continue
+            parsed = []
+            empty = True
+
+            # Must use StringIO, cStringIO does not support unicode
+            line = StringIO.StringIO(self.prepare_line(line))
+            parsed.append(self.start_line())
+
+            c = line.read(1)
+            while c != '':
+                text = u''
+
+                if c == '\\':
+                    c = line.read(1)
+
+                    if c == 'x':
+                        text = self.process_code_simple(c)
+                    elif c in 'XS':
+                        l = line.read(1)
+                        if '%s%s' % (c, l) == 'Sd':
+                            text = self.process_code_link(line, 'fns')
+                        elif '%s%s' % (c, l) == 'SB':
+                            text = self.process_code_div_span('SB', line)
+                        else:
+                            text = self.process_code_simple('%s%s' % (c, l))
+                    elif c == 'q':
+                        text = self.process_code_link(line)
+                    elif c in 'crtTiIuobB':
+                        text = self.process_code_div_span(c, line)
+                    elif c in 'sl':
+                        close = u''
+                        if c == 's' and self.state['l'][0]:
+                            close = self.process_code_div_span('l', line)
+                        if c == 'l' and self.state['s'][0]:
+                            close = self.process_code_div_span('s', line)
+                        text = self.process_code_div_span(c, line)
+                        text = close+text
+                    elif c == 'm':
+                        empty = False
+                        src = self.code_value(line)
+                        text = '<img src="images/%s" />' % image_name(src).strip('\x00')
+                    elif c == 'Q':
+                        empty = False
+                        id = self.code_value(line)
+                        text = '<span id="%s"></span>' % id
+                    elif c == 'p':
+                        empty = False
+                        text = '<br /><br style="page-break-after: always;" />'
+                    elif c == 'C':
+                        # This should be made to create a TOC entry
+                        line.read(1)
+                        self.code_value(line)
+                    elif c == 'n':
+                        pass
+                    elif c == 'F':
+                        l = line.read(1)
+                        if '%s%s' % (c, l) == 'Fn':
+                            text = self.process_code_link(line, 'fns')
+                        elif '%s%s' % (c, l) == 'FN':
+                            text = self.process_code_div_span('FN', line)
+                    elif c == 'w':
+                        empty = False
+                        text = '<hr width="%s" />' % self.code_value(line)
+                    elif c == '-':
+                        empty = False
+                        text = '&shy;'
+                    elif c == '\\':
+                        empty = False
+                        text = '\\'
+                else:
+                    if c != ' ':
+                        empty = False
+                    text = c
+
+                parsed.append(text)
+                c = line.read(1)
+
+            if not empty:
+                text = self.end_line()
+                parsed.append(text)
+                output.append(u''.join(parsed))
+            line.close()
+
+        return u'\n'.join(output)
+
+
+def pml_to_html(pml, close_all=False):
+    '''
+    close_all will close all div and span tags when one is closed and then
+    re-open the appropriate ones.
+    '''
+
+    hizer = PML_HTMLizer(close_all)
+    return hizer.parse_pml(pml)
 
 def footnote_sidebar_to_html(id, pml):
     if id.startswith('\x01'):
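For reviewers, a minimal usage sketch of the new entry point introduced by this patch. The sample PML string and the printed comparison are illustrative only and are not part of the patch; the codes used (\x chapter heading, \i italic toggle) are the ones handled by parse_pml above.

    # Hypothetical example: run a small PML fragment through the new converter.
    from calibre.ebooks.pml.pmlconverter import pml_to_html

    pml = '\\xChapter One\\x\n\\iAn italic phrase\\i followed by plain text.'

    # Default behaviour: each div/span toggle is closed and reopened on its own.
    print pml_to_html(pml)

    # close_all=True closes every open div and span whenever one of them is
    # closed, then re-opens the states that are still active.
    print pml_to_html(pml, close_all=True)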