From 5e78ed9dee0f7ea83608086e592c9b37bbc6a404 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 29 Nov 2009 14:36:20 -0700
Subject: [PATCH] PML Input: Various improvements
---
src/calibre/ebooks/pml/input.py | 2 +-
src/calibre/ebooks/pml/pmlconverter.py | 319 ++++++++++++++++---------
2 files changed, 202 insertions(+), 119 deletions(-)
diff --git a/src/calibre/ebooks/pml/input.py b/src/calibre/ebooks/pml/input.py
index eac2e99e05..ad37494ff7 100644
--- a/src/calibre/ebooks/pml/input.py
+++ b/src/calibre/ebooks/pml/input.py
@@ -45,7 +45,7 @@ class PMLInput(InputFormatPlugin):
ienc = self.options.input_encoding
self.log.debug('Converting PML to HTML...')
- hizer = PML_HTMLizer(close_all)
+ hizer = PML_HTMLizer()
html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
html_stream.write('%s' % html.encode('utf-8', 'replace'))
diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py
index 5ef218e962..cd5bafa260 100644
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@@ -36,6 +36,7 @@ class PML_HTMLizer(object):
't',
's',
'l',
+ 'k',
'T',
'Fn',
'Sd',
@@ -49,7 +50,7 @@ class PML_HTMLizer(object):
]
STATES_TAGS = {
- 'h1': ('', '
'),
+ 'h1': ('', '
'),
'h2': ('', '
'),
'h3': ('', '
'),
'h4': ('', '
'),
@@ -62,11 +63,12 @@ class PML_HTMLizer(object):
'r': ('', '
'),
't': ('', '
'),
'T': ('', '
'),
- 'i': ('', ''),
- 'u': ('', ''),
+ 'i': ('', ''),
+ 'u': ('', ''),
'd': ('', ''),
'b': ('', ''),
- 'l': ('', ''),
+ 'l': ('', ''),
+ 'k': ('', ''),
'FS': ('', '
'),
}
@@ -91,12 +93,25 @@ class PML_HTMLizer(object):
'b': 'b',
'B': 'b',
'l': 'l',
+ 'k': 'k',
'Fn': 'a',
'Sd': 'a',
'FN': 'FS',
'SB': 'FS',
}
+ BLOCK_STATES = [
+ 'a',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'sb',
+ 'sp',
+ ]
+
DIV_STATES = [
'c',
'r',
@@ -107,14 +122,14 @@ class PML_HTMLizer(object):
SPAN_STATES = [
'l',
+ 'k',
'i',
'u',
'd',
'b',
]
- def __init__(self, close_all):
- self.close_all = close_all
+ def __init__(self):
self.state = {}
self.toc = TOC()
self.file_name = ''
@@ -122,25 +137,28 @@ class PML_HTMLizer(object):
def prepare_pml(self, pml):
# Remove comments
pml = re.sub(r'(?mus)\\v(?P.*?)\\v', '', pml)
+
+ # Remove extra white spaces.
+ pml = re.sub(r'(?mus)[ ]{2,}', ' ', pml)
+ pml = re.sub(r'(?mus)^[ ]*(?=.)', '', pml)
+ pml = re.sub(r'(?mus)(?<=.)[ ]*$', '', pml)
+ pml = re.sub(r'(?mus)^[ ]*$', '', pml)
+
# Footnotes and Sidebars
pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml)
- pml = prepare_string_for_xml(pml)
+ # Convert &'s into entities so & in the text doesn't get turned into
+ # &. It will display as &
+ pml = pml.replace('&', '&')
pml = re.sub(r'\\a(?P\d{3})', lambda match: '%s;' % match.group('num'), pml)
pml = re.sub(r'\\U(?P[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml)
+ pml = prepare_string_for_xml(pml)
+
return pml
- def prepare_line(self, line):
- line = re.sub(r'[ ]{2,}', ' ', line)
- line = re.sub(r'^[ ]*(?=.)', '', line)
- line = re.sub(r'(?<=.)[ ]*$', '', line)
- line = re.sub(r'^[ ]*$', '', line)
-
- return line
-
def cleanup_html(self, html):
old = html
html = self.cleanup_html_remove_redundant(html)
@@ -156,6 +174,7 @@ class PML_HTMLizer(object):
html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html)
else:
html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html)
+ html = re.sub(r'\s*
', '', html)
return html
def start_line(self):
@@ -173,140 +192,211 @@ class PML_HTMLizer(object):
def end_line(self):
end = u''
+ div = []
+ span = []
+ other = []
+
for key, val in self.state.items():
if val[0]:
if key == 'T':
self.state['T'][0] = False
- end += self.STATES_TAGS[key][1]
+ if key in self.DIV_STATES:
+ div.append(key)
+ elif key in self.SPAN_STATES:
+ span.append(key)
+ else:
+ other.append(key)
+ for key in span+div+other:
+ end += self.STATES_TAGS[key][1]
return u'%s
' % end
- def process_code_simple(self, code):
- if code not in self.CODE_STATES.keys():
- return u''
-
+ def process_code(self, code, stream, pre=''):
text = u''
- if self.state[self.CODE_STATES[code]][0]:
- text = self.STATES_TAGS[self.CODE_STATES[code]][1]
- else:
- text = self.STATES_TAGS[self.CODE_STATES[code]][0]
-
- self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0]
-
- return text
-
- def process_code_link(self, stream, pre=''):
- text = u''
-
- href = self.code_value(stream)
- if pre:
- href = '#%s-%s' % (pre, href)
-
- if self.state['a'][0]:
- text = self.STATES_TAGS['a'][1]
- else:
- text = self.STATES_TAGS['a'][0] % href
- self.state['a'][1] = href
-
- self.state['a'][0] = not self.state['a'][0]
-
- return text
-
- def process_code_div_span(self, code, stream):
- if self.close_all:
- return self.process_code_div_span_call(code, stream)
- else:
- return self.process_code_div_span_ind(code, stream)
-
- def process_code_div_span_ind(self, code, stream):
- text = u''
- ds = []
-
- code = self.CODE_STATES[code]
+ code = self.CODE_STATES.get(code, None)
+ if not code:
+ return text
if code in self.DIV_STATES:
- ds = self.DIV_STATES[:]
- ss = self.SPAN_STATES[:]
+ # Ignore multilple T's on the same line. They do not have a closing
+ # code. They get closed at the end of the line.
+ if code == 'T' and self.state['T'][0]:
+ self.code_value(stream)
+ return text
+ text = self.process_code_div(code, stream)
elif code in self.SPAN_STATES:
- ds = self.SPAN_STATES[:]
- ss = []
+ text = self.process_code_span(code, stream)
+ elif code in self.BLOCK_STATES:
+ text = self.process_code_block(code, stream, pre)
+ else:
+ text = self.process_code_simple(code)
+
+ self.state[code][0] = not self.state[code][0]
+
+ return text
+
+ def process_code_simple(self, code):
+ text = u''
+ if self.state[code][0]:
+ text = self.STATES_TAGS[code][1]
+ else:
+ if code in self.STATES_VALUE_REQ:
+ val = self.code_value(stream)
+ text += self.STATES_TAGS[code][0] % val
+ self.state[code][1] = val
+ else:
+ text = self.STATES_TAGS[code][0]
+
+ return text
+
+ def process_code_div(self, code, stream):
+ text = u''
+
+ # Close code.
if self.state[code][0]:
# Close all.
- for c in ss+ds:
+ for c in self.SPAN_STATES+self.DIV_STATES:
if self.state[c][0]:
text += self.STATES_TAGS[c][1]
# Reopen the based on state.
- del ds[ds.index(code)]
- for c in ds+ss:
+ for c in self.DIV_STATES+self.SPAN_STATES:
+ if code == c:
+ continue
if self.state[c][0]:
if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
else:
text += self.STATES_TAGS[c][0]
+ # Open code.
else:
+ # Close all spans.
+ for c in self.SPAN_STATES:
+ if self.state[c][0]:
+ text += self.STATES_TAGS[c][1]
+ # Process the code
if code in self.STATES_VALUE_REQ:
val = self.code_value(stream)
- text = self.STATES_TAGS[code][0] % val
+ text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val
else:
- text = self.STATES_TAGS[code][0]
-
- self.state[code][0] = not self.state[code][0]
+ text += self.STATES_TAGS[code][0]
+ # Re-open all spans based on state
+ for c in self.SPAN_STATES:
+ if self.state[c][0]:
+ if c in self.STATES_VALUE_REQ:
+ text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
+ else:
+ text += self.STATES_TAGS[c][0]
return text
- def process_code_div_span_call(self, code, stream):
+ def process_code_span(self, code, stream):
text = u''
- divs = self.DIV_STATES[:]
- spans = self.SPAN_STATES[:]
-
- code = self.CODE_STATES[code]
+ # Close code.
if self.state[code][0]:
- # Close all divs then spans.
- for c in spans+divs:
+ # Close all spans
+ for c in self.SPAN_STATES:
if self.state[c][0]:
text += self.STATES_TAGS[c][1]
- # Reopen the based on state. Open divs then spans
- if code in self.DIV_STATES:
- del divs[divs.index(code)]
- if code in self.SPAN_STATES:
- del spans[spans.index(code)]
- for c in divs+spans:
+ # Re-open the spans based on state except for code which will be
+ # left closed.
+ for c in self.SPAN_STATES:
+ if code == c:
+ continue
if self.state[c][0]:
if c in self.STATES_VALUE_REQ:
- text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
+ text += self.STATES_TAGS[code][0] % self.state[c][1]
else:
text += self.STATES_TAGS[c][0]
+ # Open code.
else:
if code in self.STATES_VALUE_REQ:
val = self.code_value(stream)
- text = self.STATES_TAGS[code][0] % val
+ text += self.STATES_TAGS[code][0] % val
self.state[code][1] = val
else:
- text = self.STATES_TAGS[code][0]
+ text += self.STATES_TAGS[code][0]
- self.state[code][0] = not self.state[code][0]
+ return text
+
+ def process_code_block(self, code, stream, pre=''):
+ text = u''
+
+ # Close all spans
+ for c in self.SPAN_STATES:
+ if self.state[c][0]:
+ text += self.STATES_TAGS[c][1]
+
+ # Process the code
+ if self.state[code][0]:
+ # Close tag
+ text += self.STATES_TAGS[code][1]
+ else:
+ # Open tag
+ if code in self.STATES_VALUE_REQ:
+ val = self.code_value(stream)
+ if pre:
+ val = '#%s-%s' % (pre, val)
+ text += self.STATES_TAGS[code][0] % val
+ self.state[code][1] = val
+ else:
+ text += self.STATES_TAGS[code][0]
+
+ # Re-open all spans if code was a div based on state
+ for c in self.SPAN_STATES:
+ if self.state[c][0]:
+ if c in self.STATES_VALUE_REQ:
+ text += self.STATES_TAGS[code][0] % self.state[c][1]
+ else:
+ text += self.STATES_TAGS[c][0]
return text
def code_value(self, stream):
value = u''
- open = False
+ # state 0 is before =
+ # state 1 is before the first "
+ # state 2 is before the second "
+ # state 3 is after the second "
+ state = 0
+ loc = stream.tell()
c = stream.read(1)
while c != '':
- if open and c != '"':
- value += c
- if c == '"':
- if not open:
- open = True
- else:
+ if state == 0:
+ if c == '=':
+ state = 1
+ elif c != ' ':
+ # A code that requires an argument should have = after the
+ # code but sometimes has spaces. If it has anything other
+ # than a space or = after the code then we can assume the
+ # markup is invalid. We will stop looking for the value
+ # and continue to hopefully not lose any data.
break
+ elif state == 1:
+ if c == '"':
+ state = 2
+ elif c != ' ':
+ # " should always follow = but we will allow for blank
+ # space after the =.
+ break
+ elif state == 2:
+ if c == '"':
+ state = 3
+ break
+ else:
+ value += c
c = stream.read(1)
+ if state != 3:
+ # Unable to complete the sequence to reterieve the value. Reset
+ # the stream to the location it started.
+ stream.seek(loc)
+ value = u''
+
return value.strip()
def parse_pml(self, pml, file_name=''):
@@ -323,11 +413,12 @@ class PML_HTMLizer(object):
for line in pml.splitlines():
if not line:
continue
+
parsed = []
empty = True
# Must use StringIO, cStringIO does not support unicode
- line = StringIO.StringIO(self.prepare_line(line))
+ line = StringIO.StringIO(line)
parsed.append(self.start_line())
c = line.read(1)
@@ -337,20 +428,20 @@ class PML_HTMLizer(object):
if c == '\\':
c = line.read(1)
- if c == 'x':
- text = self.process_code_simple(c)
- elif c in 'XS':
+ if c in 'xqcrtTiIuobBlk':
+ text = self.process_code(c, line)
+ elif c in 'FSX':
l = line.read(1)
- if '%s%s' % (c, l) == 'Sd':
- text = self.process_code_link(line, 'fns')
+ if '%s%s' % (c, l) == 'Fn':
+ text = self.process_code('Fn', line, 'fns')
+ elif '%s%s' % (c, l) == 'FN':
+ text = self.process_code('FN', line)
elif '%s%s' % (c, l) == 'SB':
- text = self.process_code_div_span('SB', line)
+ text = self.process_code('SB', line)
+ elif '%s%s' % (c, l) == 'Sd':
+ text = self.process_code('Sd', line, 'fns')
else:
- text = self.process_code_simple('%s%s' % (c, l))
- elif c == 'q':
- text = self.process_code_link(line)
- elif c in 'crtTiIuobBl':
- text = self.process_code_div_span(c, line)
+ text = self.process_code('%s%s' % (c, l), line)
elif c == 'm':
empty = False
src = self.code_value(line)
@@ -369,12 +460,6 @@ class PML_HTMLizer(object):
text = '' % id
elif c == 'n':
pass
- elif c == 'F':
- l = line.read(1)
- if '%s%s' % (c, l) == 'Fn':
- text = self.process_code_link(line, 'fns')
- elif '%s%s' % (c, l) == 'FN':
- text = self.process_code_div_span('FN', line)
elif c == 'w':
empty = False
text = '
' % self.code_value(line)
@@ -387,7 +472,10 @@ class PML_HTMLizer(object):
else:
if c != ' ':
empty = False
- text = c
+ if self.state['k'][0]:
+ text = c.upper()
+ else:
+ text = c
parsed.append(text)
c = line.read(1)
@@ -405,13 +493,8 @@ class PML_HTMLizer(object):
return self.toc
-def pml_to_html(pml, close_all=False):
- '''
- close_all will close div all div and span tags when one is closed and then
- re-open the appropriate ones.
- '''
-
- hizer = PML_HTMLizer(close_all)
+def pml_to_html(pml):
+ hizer = PML_HTMLizer()
return hizer.parse_pml(pml)
def footnote_sidebar_to_html(id, pml):