PML Input: Various improvements

This commit is contained in:
Kovid Goyal 2009-11-29 14:36:20 -07:00
parent c1c030a386
commit 5e78ed9dee
2 changed files with 202 additions and 119 deletions

View File

@ -45,7 +45,7 @@ class PMLInput(InputFormatPlugin):
ienc = self.options.input_encoding ienc = self.options.input_encoding
self.log.debug('Converting PML to HTML...') self.log.debug('Converting PML to HTML...')
hizer = PML_HTMLizer(close_all) hizer = PML_HTMLizer()
html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path) html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace')) html_stream.write('<html><head><title /></head><body>%s</body></html>' % html.encode('utf-8', 'replace'))

View File

@ -36,6 +36,7 @@ class PML_HTMLizer(object):
't', 't',
's', 's',
'l', 'l',
'k',
'T', 'T',
'Fn', 'Fn',
'Sd', 'Sd',
@ -49,7 +50,7 @@ class PML_HTMLizer(object):
] ]
STATES_TAGS = { STATES_TAGS = {
'h1': ('<h1 style="page-break-after: always;">', '</h1>'), 'h1': ('<h1 style="page-break-before: always;">', '</h1>'),
'h2': ('<h2>', '</h2>'), 'h2': ('<h2>', '</h2>'),
'h3': ('<h3>', '</h3>'), 'h3': ('<h3>', '</h3>'),
'h4': ('<h4>', '</h4>'), 'h4': ('<h4>', '</h4>'),
@ -62,11 +63,12 @@ class PML_HTMLizer(object):
'r': ('<div style="text-align: right;">', '</div>'), 'r': ('<div style="text-align: right;">', '</div>'),
't': ('<div style="margin-left: 5%;">', '</div>'), 't': ('<div style="margin-left: 5%;">', '</div>'),
'T': ('<div style="margin-left: %s;">', '</div>'), 'T': ('<div style="margin-left: %s;">', '</div>'),
'i': ('<span style="font-style : italic;">', '</span>'), 'i': ('<span style="font-style: italic;">', '</span>'),
'u': ('<span style="text-decoration : underline;">', '</span>'), 'u': ('<span style="text-decoration: underline;">', '</span>'),
'd': ('<span style="text-decoration: line-through;">', '</span>'), 'd': ('<span style="text-decoration: line-through;">', '</span>'),
'b': ('<span style="font-weight: bold;">', '</span>'), 'b': ('<span style="font-weight: bold;">', '</span>'),
'l': ('<span style="font-size: 150%">', '</span>'), 'l': ('<span style="font-size: 150%;">', '</span>'),
'k': ('<span style="font-size: 75%;">', '</span>'),
'FS': ('<div id="%s">', '</div>'), 'FS': ('<div id="%s">', '</div>'),
} }
@ -91,12 +93,25 @@ class PML_HTMLizer(object):
'b': 'b', 'b': 'b',
'B': 'b', 'B': 'b',
'l': 'l', 'l': 'l',
'k': 'k',
'Fn': 'a', 'Fn': 'a',
'Sd': 'a', 'Sd': 'a',
'FN': 'FS', 'FN': 'FS',
'SB': 'FS', 'SB': 'FS',
} }
BLOCK_STATES = [
'a',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'sb',
'sp',
]
DIV_STATES = [ DIV_STATES = [
'c', 'c',
'r', 'r',
@ -107,14 +122,14 @@ class PML_HTMLizer(object):
SPAN_STATES = [ SPAN_STATES = [
'l', 'l',
'k',
'i', 'i',
'u', 'u',
'd', 'd',
'b', 'b',
] ]
def __init__(self, close_all): def __init__(self):
self.close_all = close_all
self.state = {} self.state = {}
self.toc = TOC() self.toc = TOC()
self.file_name = '' self.file_name = ''
@ -122,25 +137,28 @@ class PML_HTMLizer(object):
def prepare_pml(self, pml): def prepare_pml(self, pml):
# Remove comments # Remove comments
pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml) pml = re.sub(r'(?mus)\\v(?P<text>.*?)\\v', '', pml)
# Remove extra white spaces.
pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml)
pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml)
pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml)
pml = re.sub(r'(?mus)^[ ]*$', '', pml)
# Footnotes and Sidebars # Footnotes and Sidebars
pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus)<footnote\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</footnote>', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) pml = re.sub(r'(?mus)<sidebar\s+id="(?P<target>.+?)">\s*(?P<text>.*?)\s*</sidebar>', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
pml = prepare_string_for_xml(pml) # Convert &'s into entities so &amp; in the text doesn't get turned into
# &. It will display as &amp;
pml = pml.replace('&', '&amp;')
pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml) pml = re.sub(r'\\a(?P<num>\d{3})', lambda match: '&#%s;' % match.group('num'), pml)
pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) pml = re.sub(r'\\U(?P<num>[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)
pml = prepare_string_for_xml(pml)
return pml return pml
def prepare_line(self, line):
line = re.sub(r'[ ]{2,}', ' ', line)
line = re.sub(r'^[ ]*(?=.)', '', line)
line = re.sub(r'(?<=.)[ ]*$', '', line)
line = re.sub(r'^[ ]*$', '', line)
return line
def cleanup_html(self, html): def cleanup_html(self, html):
old = html old = html
html = self.cleanup_html_remove_redundant(html) html = self.cleanup_html_remove_redundant(html)
@ -156,6 +174,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else: else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
html = re.sub(r'<p>\s*</p>', '', html)
return html return html
def start_line(self): def start_line(self):
@ -173,140 +192,211 @@ class PML_HTMLizer(object):
def end_line(self): def end_line(self):
end = u'' end = u''
div = []
span = []
other = []
for key, val in self.state.items(): for key, val in self.state.items():
if val[0]: if val[0]:
if key == 'T': if key == 'T':
self.state['T'][0] = False self.state['T'][0] = False
end += self.STATES_TAGS[key][1] if key in self.DIV_STATES:
div.append(key)
elif key in self.SPAN_STATES:
span.append(key)
else:
other.append(key)
for key in span+div+other:
end += self.STATES_TAGS[key][1]
return u'%s</p>' % end return u'%s</p>' % end
def process_code_simple(self, code): def process_code(self, code, stream, pre=''):
if code not in self.CODE_STATES.keys():
return u''
text = u'' text = u''
if self.state[self.CODE_STATES[code]][0]: code = self.CODE_STATES.get(code, None)
text = self.STATES_TAGS[self.CODE_STATES[code]][1] if not code:
else: return text
text = self.STATES_TAGS[self.CODE_STATES[code]][0]
self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0]
return text
def process_code_link(self, stream, pre=''):
text = u''
href = self.code_value(stream)
if pre:
href = '#%s-%s' % (pre, href)
if self.state['a'][0]:
text = self.STATES_TAGS['a'][1]
else:
text = self.STATES_TAGS['a'][0] % href
self.state['a'][1] = href
self.state['a'][0] = not self.state['a'][0]
return text
def process_code_div_span(self, code, stream):
if self.close_all:
return self.process_code_div_span_call(code, stream)
else:
return self.process_code_div_span_ind(code, stream)
def process_code_div_span_ind(self, code, stream):
text = u''
ds = []
code = self.CODE_STATES[code]
if code in self.DIV_STATES: if code in self.DIV_STATES:
ds = self.DIV_STATES[:] # Ignore multilple T's on the same line. They do not have a closing
ss = self.SPAN_STATES[:] # code. They get closed at the end of the line.
if code == 'T' and self.state['T'][0]:
self.code_value(stream)
return text
text = self.process_code_div(code, stream)
elif code in self.SPAN_STATES: elif code in self.SPAN_STATES:
ds = self.SPAN_STATES[:] text = self.process_code_span(code, stream)
ss = [] elif code in self.BLOCK_STATES:
text = self.process_code_block(code, stream, pre)
else:
text = self.process_code_simple(code)
self.state[code][0] = not self.state[code][0]
return text
def process_code_simple(self, code):
text = u''
if self.state[code][0]:
text = self.STATES_TAGS[code][1]
else:
if code in self.STATES_VALUE_REQ:
val = self.code_value(stream)
text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val
else:
text = self.STATES_TAGS[code][0]
return text
def process_code_div(self, code, stream):
text = u''
# Close code.
if self.state[code][0]: if self.state[code][0]:
# Close all. # Close all.
for c in ss+ds: for c in self.SPAN_STATES+self.DIV_STATES:
if self.state[c][0]: if self.state[c][0]:
text += self.STATES_TAGS[c][1] text += self.STATES_TAGS[c][1]
# Reopen the based on state. # Reopen the based on state.
del ds[ds.index(code)] for c in self.DIV_STATES+self.SPAN_STATES:
for c in ds+ss: if code == c:
continue
if self.state[c][0]: if self.state[c][0]:
if c in self.STATES_VALUE_REQ: if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
else: else:
text += self.STATES_TAGS[c][0] text += self.STATES_TAGS[c][0]
# Open code.
else: else:
# Close all spans.
for c in self.SPAN_STATES:
if self.state[c][0]:
text += self.STATES_TAGS[c][1]
# Process the code
if code in self.STATES_VALUE_REQ: if code in self.STATES_VALUE_REQ:
val = self.code_value(stream) val = self.code_value(stream)
text = self.STATES_TAGS[code][0] % val text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val self.state[code][1] = val
else: else:
text = self.STATES_TAGS[code][0] text += self.STATES_TAGS[code][0]
# Re-open all spans based on state
self.state[code][0] = not self.state[code][0] for c in self.SPAN_STATES:
if self.state[c][0]:
if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
else:
text += self.STATES_TAGS[c][0]
return text return text
def process_code_div_span_call(self, code, stream): def process_code_span(self, code, stream):
text = u'' text = u''
divs = self.DIV_STATES[:]
spans = self.SPAN_STATES[:]
code = self.CODE_STATES[code]
# Close code.
if self.state[code][0]: if self.state[code][0]:
# Close all divs then spans. # Close all spans
for c in spans+divs: for c in self.SPAN_STATES:
if self.state[c][0]: if self.state[c][0]:
text += self.STATES_TAGS[c][1] text += self.STATES_TAGS[c][1]
# Reopen the based on state. Open divs then spans # Re-open the spans based on state except for code which will be
if code in self.DIV_STATES: # left closed.
del divs[divs.index(code)] for c in self.SPAN_STATES:
if code in self.SPAN_STATES: if code == c:
del spans[spans.index(code)] continue
for c in divs+spans:
if self.state[c][0]: if self.state[c][0]:
if c in self.STATES_VALUE_REQ: if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] text += self.STATES_TAGS[code][0] % self.state[c][1]
else: else:
text += self.STATES_TAGS[c][0] text += self.STATES_TAGS[c][0]
# Open code.
else: else:
if code in self.STATES_VALUE_REQ: if code in self.STATES_VALUE_REQ:
val = self.code_value(stream) val = self.code_value(stream)
text = self.STATES_TAGS[code][0] % val text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val self.state[code][1] = val
else: else:
text = self.STATES_TAGS[code][0] text += self.STATES_TAGS[code][0]
self.state[code][0] = not self.state[code][0] return text
def process_code_block(self, code, stream, pre=''):
text = u''
# Close all spans
for c in self.SPAN_STATES:
if self.state[c][0]:
text += self.STATES_TAGS[c][1]
# Process the code
if self.state[code][0]:
# Close tag
text += self.STATES_TAGS[code][1]
else:
# Open tag
if code in self.STATES_VALUE_REQ:
val = self.code_value(stream)
if pre:
val = '#%s-%s' % (pre, val)
text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val
else:
text += self.STATES_TAGS[code][0]
# Re-open all spans if code was a div based on state
for c in self.SPAN_STATES:
if self.state[c][0]:
if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[code][0] % self.state[c][1]
else:
text += self.STATES_TAGS[c][0]
return text return text
def code_value(self, stream): def code_value(self, stream):
value = u'' value = u''
open = False # state 0 is before =
# state 1 is before the first "
# state 2 is before the second "
# state 3 is after the second "
state = 0
loc = stream.tell()
c = stream.read(1) c = stream.read(1)
while c != '': while c != '':
if open and c != '"': if state == 0:
value += c if c == '=':
if c == '"': state = 1
if not open: elif c != ' ':
open = True # A code that requires an argument should have = after the
else: # code but sometimes has spaces. If it has anything other
# than a space or = after the code then we can assume the
# markup is invalid. We will stop looking for the value
# and continue to hopefully not lose any data.
break break
elif state == 1:
if c == '"':
state = 2
elif c != ' ':
# " should always follow = but we will allow for blank
# space after the =.
break
elif state == 2:
if c == '"':
state = 3
break
else:
value += c
c = stream.read(1) c = stream.read(1)
if state != 3:
# Unable to complete the sequence to reterieve the value. Reset
# the stream to the location it started.
stream.seek(loc)
value = u''
return value.strip() return value.strip()
def parse_pml(self, pml, file_name=''): def parse_pml(self, pml, file_name=''):
@ -323,11 +413,12 @@ class PML_HTMLizer(object):
for line in pml.splitlines(): for line in pml.splitlines():
if not line: if not line:
continue continue
parsed = [] parsed = []
empty = True empty = True
# Must use StringIO, cStringIO does not support unicode # Must use StringIO, cStringIO does not support unicode
line = StringIO.StringIO(self.prepare_line(line)) line = StringIO.StringIO(line)
parsed.append(self.start_line()) parsed.append(self.start_line())
c = line.read(1) c = line.read(1)
@ -337,20 +428,20 @@ class PML_HTMLizer(object):
if c == '\\': if c == '\\':
c = line.read(1) c = line.read(1)
if c == 'x': if c in 'xqcrtTiIuobBlk':
text = self.process_code_simple(c) text = self.process_code(c, line)
elif c in 'XS': elif c in 'FSX':
l = line.read(1) l = line.read(1)
if '%s%s' % (c, l) == 'Sd': if '%s%s' % (c, l) == 'Fn':
text = self.process_code_link(line, 'fns') text = self.process_code('Fn', line, 'fns')
elif '%s%s' % (c, l) == 'FN':
text = self.process_code('FN', line)
elif '%s%s' % (c, l) == 'SB': elif '%s%s' % (c, l) == 'SB':
text = self.process_code_div_span('SB', line) text = self.process_code('SB', line)
elif '%s%s' % (c, l) == 'Sd':
text = self.process_code('Sd', line, 'fns')
else: else:
text = self.process_code_simple('%s%s' % (c, l)) text = self.process_code('%s%s' % (c, l), line)
elif c == 'q':
text = self.process_code_link(line)
elif c in 'crtTiIuobBl':
text = self.process_code_div_span(c, line)
elif c == 'm': elif c == 'm':
empty = False empty = False
src = self.code_value(line) src = self.code_value(line)
@ -369,12 +460,6 @@ class PML_HTMLizer(object):
text = '<span id="%s"></span>' % id text = '<span id="%s"></span>' % id
elif c == 'n': elif c == 'n':
pass pass
elif c == 'F':
l = line.read(1)
if '%s%s' % (c, l) == 'Fn':
text = self.process_code_link(line, 'fns')
elif '%s%s' % (c, l) == 'FN':
text = self.process_code_div_span('FN', line)
elif c == 'w': elif c == 'w':
empty = False empty = False
text = '<hr width="%s" />' % self.code_value(line) text = '<hr width="%s" />' % self.code_value(line)
@ -387,7 +472,10 @@ class PML_HTMLizer(object):
else: else:
if c != ' ': if c != ' ':
empty = False empty = False
text = c if self.state['k'][0]:
text = c.upper()
else:
text = c
parsed.append(text) parsed.append(text)
c = line.read(1) c = line.read(1)
@ -405,13 +493,8 @@ class PML_HTMLizer(object):
return self.toc return self.toc
def pml_to_html(pml, close_all=False): def pml_to_html(pml):
''' hizer = PML_HTMLizer()
close_all will close div all div and span tags when one is closed and then
re-open the appropriate ones.
'''
hizer = PML_HTMLizer(close_all)
return hizer.parse_pml(pml) return hizer.parse_pml(pml)
def footnote_sidebar_to_html(id, pml): def footnote_sidebar_to_html(id, pml):