From 379c3b3465b74b8b44aed196ca4ffe050dd6d74c Mon Sep 17 00:00:00 2001 From: jason Date: Sun, 22 Nov 2009 13:00:45 +0000 Subject: [PATCH 1/3] Add item to UI to config para indent --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 5 +++ src/calibre/ebooks/oeb/transforms/flatcss.py | 2 +- src/calibre/gui2/convert/look_and_feel.py | 4 +- src/calibre/gui2/convert/look_and_feel.ui | 41 +++++++++++++++++--- 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 75c545f8b5..178561fcb5 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -125,7 +125,7 @@ def add_pipeline_options(parser, plumber): 'extra_css', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'dont_justify', - 'insert_blank_line', 'remove_paragraph_spacing', + 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', 'asciiize', 'remove_header', 'header_regex', 'remove_footer', 'footer_regex', ] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 2a3dfedd65..262f64a9cc 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -309,6 +309,11 @@ OptionRecommendation(name='remove_paragraph_spacing', 'paragraphs of 1.5em. Spacing removal will not work ' 'if the source file does not use paragraphs (

<p> or <div>

tags).') ), + +OptionRecommendation(name='remove_paragraph_spacing_indent_size', + recommended_value=1.5, level=OptionRecommendation.LOW, + help=_('Width of the indent used with Remove spacing between paragraphs option') + ), OptionRecommendation(name='prefer_metadata_cover', recommended_value=False, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 464acbe0e0..ffb5364750 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -278,7 +278,7 @@ class CSSFlattener(object): if self.context.insert_blank_line: cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em' if self.context.remove_paragraph_spacing: - cssdict['text-indent'] = '1.5em' + cssdict['text-indent'] = "%1.1f em" % self.context.remove_paragraph_spacing_indent_size if cssdict: items = cssdict.items() items.sort() diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index a10a410b67..4d43f64910 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -23,7 +23,7 @@ class LookAndFeelWidget(Widget, Ui_Form): 'font_size_mapping', 'line_height', 'linearize_tables', 'disable_font_rescaling', 'insert_blank_line', - 'remove_paragraph_spacing', 'input_encoding', + 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding', 'asciiize'] ) self.db, self.book_id = db, book_id @@ -32,6 +32,8 @@ class LookAndFeelWidget(Widget, Ui_Form): self.opt_disable_font_rescaling.toggle() self.connect(self.button_font_key, SIGNAL('clicked()'), self.font_key_wizard) + self.opt_remove_paragraph_spacing.toggle() + self.opt_remove_paragraph_spacing.toggle() def font_key_wizard(self): from calibre.gui2.convert.font_key import FontKeyChooser diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index d451cd9af0..0161dfcea6 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -127,20 +127,39 @@ - + Remove &spacing between paragraphs - - - + + - Insert &blank line + Indent size: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + + em + + + 1 + + + + + + + Insert &blank line + + + @@ -216,5 +235,17 @@ + + opt_remove_paragraph_spacing + toggled(bool) + label_4 + setEnabled(bool) + + + opt_remove_paragraph_spacing + toggled(bool) + opt_remove_paragraph_spacing_indent_size + setEnabled(bool) + From bf93536979998441cb936fd065790c16ff1dd4cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Nov 2009 07:50:11 -0700 Subject: [PATCH 2/3] PDB/PML Input: All new state machine parser, should result in better conversions. 
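A minimal usage sketch of the parser this patch introduces: the entry point remains pml_to_html, now backed by the PML_HTMLizer state machine. The file name, the cp1252 decoding and the close_all value below are illustrative assumptions, not part of the patch.

    # Illustrative only: drives the PML_HTMLizer added below.
    from calibre.ebooks.pml.pmlconverter import pml_to_html

    # eReader PML is normally an 8-bit encoding; cp1252 is assumed here.
    with open('sample.pml', 'rb') as f:
        pml = f.read().decode('cp1252', 'replace')

    # close_all=True closes every open div/span whenever one of them closes,
    # then reopens the survivors, giving more strictly nested HTML.
    html = pml_to_html(pml, close_all=True)
    print(html[:200])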
--- src/calibre/ebooks/pdb/ereader/reader.py | 4 +- src/calibre/ebooks/pml/pmlconverter.py | 441 +++++++++++++++++++---- 2 files changed, 375 insertions(+), 70 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 77ca8d6933..ad1df98793 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -31,5 +31,5 @@ class Reader(FormatReader): def dump_pml(self): return self.reader.dump_pml() - def dump_images(self): - return self.reader.dump_images() + def dump_images(self, out_dir): + return self.reader.dump_images(out_dir) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 1505e5fc4b..05cf488617 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -9,85 +9,390 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' import re +import StringIO -from calibre import my_unichr +from calibre import my_unichr, prepare_string_for_xml from calibre.ebooks.pdb.ereader import image_name -PML_HTML_RULES = [ - # Any literal <, &, and > chars be escaped to avoid HTML issues (though - # and tags are handled specially later). - (re.compile(r'&'), lambda match: '&'), - (re.compile(r'<'), lambda match: '<'), - (re.compile(r'>'), lambda match: '>'), +class PML_HTMLizer(object): - # NOP-process all \x escapes, turning \\ into \ This keeps the regex - # parsing simple while making sure that we don't try to honor \\x as \x - # (and also makes sure we DO honor \\\x as \ followed by \x). - (re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)), + STATES = [ + 'i', + 'u', + 'd', + 'b', + 'sp', + 'sb', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'a', + 'c', + 'r', + 't', + 's', + 'l', + 'T', + 'Fn', + 'Sd', + 'FS' + ] - (re.compile(r'\\p'), lambda match: '

'), - (re.compile(r'\\x(?P.*?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\X(?P[0-4])(?P.*?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), - (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P.*?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P.*?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\i(?P.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\u(?P.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\o(?P.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\v(?P.*?)\\v', re.DOTALL), lambda match: '' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\t(?P.*?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\T="(?P\d+)%*"(?P.*?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), - (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s(?P.*?)\\s', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\b(?P.*?)\\b', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. - (re.compile(r'\\l(?P.*?)\\l', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\B(?P.*?)\\B', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''), - (re.compile(r'\\a(?P\d{3})'), lambda match: '&#%s;' % match.group('num')), - (re.compile(r'\\U(?P[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), - (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), - (re.compile(r'\\q="(?P#.+?)"(?P.*?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), - (re.compile(r'\\-'), lambda match: ''), - (re.compile(r'\\Fn="(?P.+?)"(?P.*?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Sd="(?P.+?)"(?P.*?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - # Just italicize index items as that is how the eReader software renders them. - (re.compile(r'\\I(?P.*?)\\I', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + STATES_VALUE_REQ = [ + 'a', + 'T', + 'FS' + ] - # Sidebar and Footnotes - (re.compile(r'<sidebar\s+id="(?P.+?)">\s*(?P.*?)\s*</sidebar>', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'<footnote\s+id="(?P.+?)">\s*(?P.*?)\s*</footnote>', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''), + STATES_TAGS = { + 'h1': ('

', '

'), + 'h2': ('

', '

'), + 'h3': ('

', '

'), + 'h4': ('

', '

'), + 'h5': ('
', '
'), + 'h6': ('
', '
'), + 'sp': ('', ''), + 'sb': ('', ''), + 'a': ('', ''), + 'c': ('
', '
'), + 'r': ('
', '
'), + 't': ('
', '
'), + 'T': ('
', '
'), + 'i': ('', ''), + 'u': ('', ''), + 'd': ('', ''), + 'b': ('', ''), + 's': ('', ''), + 'l': ('', ''), + 'FS': ('
', '
'), + } - # eReader files are one paragraph per line. - # This forces the lines to wrap properly. - (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

%s

' % match.group('text')), - # Remove empty

's. - (re.compile('

[ ]*

'), lambda match: ''), - # Ensure empty lines carry over. - (re.compile('(\r\n|\n|\r){3}'), lambda match: '
'), + CODE_STATES = { + 'q': 'a', + 'x': 'h1', + 'X0': 'h2', + 'X1': 'h3', + 'X2': 'h4', + 'X3': 'h5', + 'X4': 'h6', + 'Sp': 'sp', + 'Sb': 'sb', + 'c': 'c', + 'r': 'r', + 't': 't', + 'T': 'T', + 'i': 'i', + 'I': 'i', + 'u': 'u', + 'o': 'd', + 'b': 'b', + 'B': 'b', + 's': 's', + 'l': 'l', + 'Fn': 'a', + 'Sd': 'a', + 'FN': 'FS', + 'SB': 'FS', + } - # Try to fix some of the misordering of character-attribute tags. - (re.compile(r'(?P(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P()+)'), lambda match: match.group('close') + match.group('ch')), - (re.compile(r'(?P(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')), + DIV_STATES = [ + 'c', + 'r', + 't', + 'T', + 'FS', + ] - # Remove unmatched plm codes. - (re.compile(r'\\X[0-4]'), lambda match: ''), - (re.compile(r'\\T="\d+%*"'), lambda match: ''), - (re.compile(r'\\Sp'), lambda match: ''), - (re.compile(r'\\Sb'), lambda match: ''), - # Remove invalid single item pml codes. - (re.compile(r'\\.'), lambda match: ''), -] + SPAN_STATES = [ + 's', + 'l', + 'i', + 'u', + 'd', + 'b', + ] -def pml_to_html(pml): - html = pml - for rule in PML_HTML_RULES: - html = rule[0].sub(rule[1], html) + def __init__(self, close_all): + self.close_all = close_all + self.state = {} - return html + def prepare_pml(self, pml): + # Remove comments + pml = re.sub(r'(?mus)\\v(?P.*?)\\v', '', pml) + # Footnotes and Sidebars + pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*
', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + + pml = prepare_string_for_xml(pml) + + pml = re.sub(r'\\a(?P\d{3})', lambda match: '&#%s;' % match.group('num'), pml) + pml = re.sub(r'\\U(?P[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) + + return pml + + def prepare_line(self, line): + line = re.sub(r'[ ]{2,}', ' ', line) + line = re.sub(r'^[ ]*(?=.)', '', line) + line = re.sub(r'(?<=.)[ ]*$', '', line) + line = re.sub(r'^[ ]*$', '', line) + + return line + + def start_line(self): + start = u'' + + for key, val in self.state.items(): + if val[0]: + if key not in self.STATES_VALUE_REQ: + start += self.STATES_TAGS[key][0] + else: + start += self.STATES_TAGS[key][0] % val[1] + + return u'

<p>%s' % start + def end_line(self): + end = u'' + + for key, val in self.state.items(): + if val[0]: + if key == 'T': + self.state['T'][0] = False + end += self.STATES_TAGS[key][1] + + return u'%s</p>

' % end + + def process_code_simple(self, code): + if code not in self.CODE_STATES.keys(): + return u'' + + text = u'' + + if self.state[self.CODE_STATES[code]][0]: + text = self.STATES_TAGS[self.CODE_STATES[code]][1] + else: + text = self.STATES_TAGS[self.CODE_STATES[code]][0] + + self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] + + return text + + def process_code_link(self, stream, pre=''): + text = u'' + + href = self.code_value(stream) + if pre: + href = '#%s-%s' % (pre, href) + + if self.state['a'][0]: + text = self.STATES_TAGS['a'][1] + else: + text = self.STATES_TAGS['a'][0] % href + self.state['a'][1] = href + + self.state['a'][0] = not self.state['a'][0] + + return text + + def process_code_div_span(self, code, stream): + if self.close_all: + return self.process_code_div_span_call(code, stream) + else: + return self.process_code_div_span_ind(code, stream) + + def process_code_div_span_ind(self, code, stream): + text = u'' + ds = [] + + code = self.CODE_STATES[code] + + if code in self.DIV_STATES: + ds = self.DIV_STATES[:] + elif code in self.SPAN_STATES: + ds = self.SPAN_STATES[:] + + if self.state[code][0]: + # Close all. + for c in ds: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. + del ds[ds.index(code)] + for c in ds: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def process_code_div_span_call(self, code, stream): + text = u'' + divs = self.DIV_STATES[:] + spans = self.SPAN_STATES[:] + + code = self.CODE_STATES[code] + + if self.state[code][0]: + # Close all divs then spans. + for c in spans+divs: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. 
Open divs then spans + if code in self.DIV_STATES: + del divs[divs.index(code)] + if code in self.SPAN_STATES: + del spans[spans.index(code)] + for c in divs+spans: + if state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def code_value(self, stream): + value = u'' + open = False + + c = stream.read(1) + while c != '': + if open and c != '"': + value += c + if c == '"': + if not open: + open = True + else: + break + c = stream.read(1) + + return value.strip() + + def parse_pml(self, pml): + pml = self.prepare_pml(pml) + output = [] + + self.state = {} + for s in self.STATES: + self.state[s] = [False, '']; + + for line in pml.splitlines(): + if not line: + continue + parsed = [] + empty = True + + # Must use StringIO, cStringIO does not support unicode + line = StringIO.StringIO(self.prepare_line(line)) + parsed.append(self.start_line()) + + c = line.read(1) + while c != '': + text = u'' + + if c == '\\': + c = line.read(1) + + if c == 'x': + text = self.process_code_simple(c) + elif c in 'XS': + l = line.read(1) + if '%s%s' % (c, l) == 'Sd': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'SB': + text = self.process_code_div_span('SB', line) + else: + text = self.process_code_simple('%s%s' % (c, l)) + elif c == 'q': + text = self.process_code_link(line) + elif c in 'crtTiIuobB': + text = self.process_code_div_span(c, line) + elif c in 'sl': + close = u'' + if c == 's' and self.state['l']: + close = self.process_code_div_span('l', line) + if c == 'l' and self.state['s']: + close = self.process_code_div_span('s', line) + text = self.process_code_div_span(c, line) + text = close+text + elif c == 'm': + empty = False + src = self.code_value(line) + text = '' % image_name(src).strip('\x00') + elif c == 'Q': + empty = False + id = self.code_value(line) + text = '' % id + elif c == 'p': + empty = False + text = '

' + elif c == 'C': + # This should be made to create a TOC entry + line.read(1) + self.code_value(line) + elif c == 'n': + pass + elif c == 'F': + l = line.read(1) + if '%s%s' % (c, l) == 'Fn': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'FN': + text = self.process_code_div_span('FN', line) + elif c == 'w': + empty = False + text = '<hr width="%s%%" />
' % self.code_value(line) + elif c == '-': + empty = False + text = '­' + elif c == '\\': + empty = False + text = '\\' + else: + if c != ' ': + empty = False + text = c + parsed.append(text) + c = line.read(1) + + if not empty: + text = self.end_line() + parsed.append(text) + output.append(u''.join(parsed)) + line.close() + + return u'\n'.join(output) + + +def pml_to_html(pml, close_all=False): + ''' + close_all will close div all div and span tags when one is closed and then + re-open the appropriate ones. + ''' + + hizer = PML_HTMLizer(close_all) + return hizer.parse_pml(pml) def footnote_sidebar_to_html(id, pml): if id.startswith('\x01'): From 7ba005f3e0516aa02ff2d4d86c08bd02430eddd6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Nov 2009 08:14:34 -0700 Subject: [PATCH 3/3] IGN:... --- resources/recipes/fokkeensukke.recipe | 174 +++++++++++------------ src/calibre/ebooks/conversion/plumber.py | 8 +- src/calibre/ebooks/pml/pmlconverter.py | 4 +- src/calibre/manual/conversion.rst | 2 +- 4 files changed, 95 insertions(+), 93 deletions(-) diff --git a/resources/recipes/fokkeensukke.recipe b/resources/recipes/fokkeensukke.recipe index 5627631770..3ddbe1cfe5 100644 --- a/resources/recipes/fokkeensukke.recipe +++ b/resources/recipes/fokkeensukke.recipe @@ -1,87 +1,87 @@ -#!/usr/bin/python -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -class FokkeEnSukkeRecipe(BasicNewsRecipe) : - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'nl' - description = u'Popular Dutch daily cartoon Fokke en Sukke' - - title = u'Fokke en Sukke' - no_stylesheets = True - # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the and not in the . My reader (Sony PRS-600) has a serious issue - # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me. - template_css = '' - INDEX = u'http://foksuk.nl' - - # This cover is not as nice as it could be, needs some work - #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif' - - keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})] - - def parse_index(self) : - # A list with daynames as they _can_ appear in the index - dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag'] - soup = self.index_to_soup(self.INDEX) - - # Find the links for the various cartoons for this week and loop through them - index = soup.find('div', attrs={'class' : 'selectcartoon'}) - links = index.findAll('a') - maxIndex = len(links) - 1 - articles = [] - for i in range(len(links)) : - # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice. - if i == 0 : - continue - - # There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname. - # If there are two, there are three links in sequence: dayname 1 2. In that case we're interested in the last two. - if links[i].renderContents() in dayNames : - # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content - if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') : - # Got you! 
Add it to the list - article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''} - articles.append(article) - # If there is a '1', there should be a '2' as well, but better save than sorry - if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') : - # Got you! Add it to the list - article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''} - articles.append(article) - else : - # There is only one cartoon for this day. Add it to the list. - article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''} - articles.append(article) - # Might as well use the weeknumber as title - week = index.find('span', attrs={'class' : 'week'}).renderContents() - - return [[week, articles]] - - def preprocess_html(self, soup) : - # This method is called for every page, be it cartoon or TOC. We need to process each in their own way - cartoon = soup.find('div', attrs={'class' : 'cartoon'}) - if cartoon : - # It is a cartoon. Extract the title. - title = '' - img = soup.find('img', attrs = {'alt' : True}) - if img : - title = img['alt'] - - # Using the 'extra_css' displays it in the and not in the . See comment at the top of this class. Setting the style this way solves that. - tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')]) - tag.insert(0, title) - cartoon.insert(0, tag) - - # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier, - # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook. - select = cartoon.find('div', attrs={'class' : 'selectcartoon'}) - if select : - select.extract() - - return cartoon - else : - # It is a TOC. Just return the whole lot. - return soup - - +#!/usr/bin/python +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag + + +class FokkeEnSukkeRecipe(BasicNewsRecipe) : + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'nl' + description = u'Popular Dutch daily cartoon Fokke en Sukke' + + title = u'Fokke en Sukke' + no_stylesheets = True + # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the and not in the . My reader (Sony PRS-600) has a serious issue + # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me. + template_css = '' + INDEX = u'http://foksuk.nl' + + # This cover is not as nice as it could be, needs some work + #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif' + + keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})] + + def parse_index(self) : + # A list with daynames as they _can_ appear in the index + dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag'] + soup = self.index_to_soup(self.INDEX) + + # Find the links for the various cartoons for this week and loop through them + index = soup.find('div', attrs={'class' : 'selectcartoon'}) + links = index.findAll('a') + maxIndex = len(links) - 1 + articles = [] + for i in range(len(links)) : + # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice. + if i == 0 : + continue + + # There can be more than one cartoon for a given day (currently either one or two). 
If there's only one, there is just a link with the dayname. + # If there are two, there are three links in sequence: dayname 1 2. In that case we're interested in the last two. + if links[i].renderContents() in dayNames : + # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content + if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') : + # Got you! Add it to the list + article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''} + articles.append(article) + # If there is a '1', there should be a '2' as well, but better save than sorry + if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') : + # Got you! Add it to the list + article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''} + articles.append(article) + else : + # There is only one cartoon for this day. Add it to the list. + article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''} + articles.append(article) + # Might as well use the weeknumber as title + week = index.find('span', attrs={'class' : 'week'}).renderContents() + + return [[week, articles]] + + def preprocess_html(self, soup) : + # This method is called for every page, be it cartoon or TOC. We need to process each in their own way + cartoon = soup.find('div', attrs={'class' : 'cartoon'}) + if cartoon : + # It is a cartoon. Extract the title. + title = '' + img = soup.find('img', attrs = {'alt' : True}) + if img : + title = img['alt'] + + # Using the 'extra_css' displays it in the and not in the . See comment at the top of this class. Setting the style this way solves that. + tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')]) + tag.insert(0, title) + cartoon.insert(0, tag) + + # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier, + # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook. + select = cartoon.find('div', attrs={'class' : 'selectcartoon'}) + if select : + select.extract() + + return cartoon + else : + # It is a TOC. Just return the whole lot. + return soup + + diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 262f64a9cc..30cc42480c 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -309,11 +309,13 @@ OptionRecommendation(name='remove_paragraph_spacing', 'paragraphs of 1.5em. Spacing removal will not work ' 'if the source file does not use paragraphs (

<p> or <div>

tags).') ), - + OptionRecommendation(name='remove_paragraph_spacing_indent_size', recommended_value=1.5, level=OptionRecommendation.LOW, - help=_('Width of the indent used with Remove spacing between paragraphs option') - ), + help=_('When calibre removes inter paragraph spacing, it automatically ' + 'sets a paragraph indent, to ensure that paragraphs can be easily ' + 'distinguished. This option controls the width of that indent.') + ), OptionRecommendation(name='prefer_metadata_cover', recommended_value=False, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 05cf488617..cb8ae15298 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -256,7 +256,7 @@ class PML_HTMLizer(object): if code in self.SPAN_STATES: del spans[spans.index(code)] for c in divs+spans: - if state[c][0]: + if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: @@ -265,7 +265,7 @@ class PML_HTMLizer(object): if code in self.STATES_VALUE_REQ: val = self.code_value(stream) text = self.STATES_TAGS[code][0] % val - state[code][1] = val + self.state[code][1] = val else: text = self.STATES_TAGS[code][0] diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 64d8b7b62b..a841b9cf04 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -163,7 +163,7 @@ Paragraph spacing Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs` forcefully ensure that all paragraphs have no inter paragraph spacing. It also sets the text -indent to 1.5em (can be changed) to mark that start of every paragraph. +indent to 1.5em (can be changed) to mark the start of every paragraph. :guilabel:`Insert blank line` does the opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs. Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs
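To make the new indent option concrete, here is a small stand-alone sketch of the CSS it ends up producing, modelled on the flatcss.py hunk in the first patch; the Context stand-in and the default value are assumptions for illustration. Note that the patch formats the value as "%1.1f em", which yields "1.5 em" with a space before the unit; CSS requires the unit to follow the number directly, so the sketch uses '%.1fem' instead.

    # Sketch only: mirrors the text-indent logic added to CSSFlattener.
    class Context(object):
        # Defaults matching the OptionRecommendation added in plumber.py.
        remove_paragraph_spacing = True
        remove_paragraph_spacing_indent_size = 1.5  # width in em

    def paragraph_css(context):
        cssdict = {}
        if context.remove_paragraph_spacing:
            # '%.1fem' keeps the unit attached to the number, e.g. '1.5em'.
            cssdict['text-indent'] = '%.1fem' % context.remove_paragraph_spacing_indent_size
        return cssdict

    print(paragraph_css(Context()))  # -> {'text-indent': '1.5em'}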