From 7c2aa5b07eafc5361538c7407fbd734ac1ecea90 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 17 Aug 2007 17:59:24 +0000
Subject: [PATCH] Support CSS attribute white-space

---
 src/libprs500/ebooks/lrf/html/convert_from.py | 128 ++++++++++--------
 src/libprs500/ebooks/lrf/html/demo/demo.html  |   6 +-
 src/libprs500/ebooks/lrf/pylrs/pylrs.py       |  46 -------
 3 files changed, 74 insertions(+), 106 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 42da6ff1f8..82347933e6 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -219,13 +219,12 @@ class Span(_Span):
             t['wordspace'] = 50
         return t
     
-    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
+    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style,
+                 normal_font_size=100):
         src = ns.string if hasattr(ns, 'string') else ns
-        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
         for pat, repl in Span.rules:
             src = pat.sub(repl, src)
-        if not src:
-            raise ConversionError('No point in adding an empty string to a Span')
+        src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeautifulSoup
         attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
         if 'fontsize' in attrs.keys():
             normal_font_size = int(attrs['fontsize'])
@@ -258,6 +257,9 @@ class Span(_Span):
             attrs['baselineskip'] = int(attrs['fontsize']) + 20
         if attrs['fontfacename'] == fonts['serif']['normal'][1]:
             attrs.pop('fontfacename')
+        for key in attrs.keys(): # Iterate over a snapshot of the keys, as attrs is mutated in the loop
+            if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
+                attrs.pop(key)
         _Span.__init__(self, text=src, **attrs)
         
 class HTMLConverter(object):
@@ -330,7 +332,7 @@ class HTMLConverter(object):
            'cite'   : {'font-style'  : 'italic'},
            'em'     : {"font-style"  : "italic"},
            'small'  : {'font-size'   : 'small'},
-           'pre'    : {'font-family' : 'monospace' },
+           'pre'    : {'font-family' : 'monospace', 'white-space': 'pre' },
            'code'   : {'font-family' : 'monospace' },
            'tt'     : {'font-family' : 'monospace'},
            'center' : {'text-align'  : 'center'},
@@ -366,6 +368,7 @@ class HTMLConverter(object):
         self.link_level  = 0  #: Current link level
         self.memory = []        #: Used to ensure that duplicate CSS unhandled erros are not reported
         self.tops = {}          #: element representing the top of each HTML file in the LRF file
+        self.previous_text = '' #: Used to figure out when to lstrip
         # Styles 
         self.blockquote_style = book.create_block_style(sidemargin=60, 
                                                         topskip=20, footskip=20)
@@ -381,8 +384,7 @@ class HTMLConverter(object):
         self.list_indent = 20
         self.list_counter = 1
         
-        self.book = book            #: The Book object representing a BBeB book
-        self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
+        self.book = book                #: The Book object representing a BBeB book
         self.start_on_file(path, is_root=True)
         
     def start_on_file(self, path, is_root=True, link_level=0):
@@ -415,6 +417,7 @@ class HTMLConverter(object):
         self.css = HTMLConverter.CSS.copy()
         self.target_prefix = path
         self.links[path] = []
+        self.previous_text = '\n'
         self.tops[path] = self.parse_file(soup, is_root)
         self.processed_files.append(path)
         self.process_links(is_root, path, link_level=link_level)
@@ -467,20 +470,21 @@ class HTMLConverter(object):
                 # however we need to as we don't do alignment at a block level.
                 # float is removed by the process_alignment function.
                 if chk.startswith('font') or chk == 'text-align' or \
-                chk == 'float': 
+                chk == 'float' or chk == 'white-space': 
                     temp[key] = pcss[key]
             prop.update(temp)
             
         prop = dict()
+        tagname = tag.name.lower()
         if parent_css:
             merge_parent_css(prop, parent_css)
         if tag.has_key("align"):
             prop["text-align"] = tag["align"]
-        if self.css.has_key(tag.name):
-            prop.update(self.css[tag.name])
+        if self.css.has_key(tagname):
+            prop.update(self.css[tagname])
         if tag.has_key("class"):
             cls = tag["class"].lower()            
-            for classname in ["."+cls, tag.name+"."+cls]:
+            for classname in ["."+cls, tagname+"."+cls]:
                 if self.css.has_key(classname):
                     prop.update(self.css[classname])
         if tag.has_key("style"):
@@ -537,7 +541,12 @@ class HTMLConverter(object):
                 raise ConversionError, 'Could not parse ' + self.file_name
         return top
             
-    def create_link(self, para, tag):
+    def create_link(self, children, tag):
+        para = None
+        for i in range(len(children)-1, -1, -1):
+            if not isinstance(children[i], CR):
+                para = children[i]
+                break
         text = self.get_text(tag, 1000)
         if not text:
             text = 'Link'
@@ -736,30 +745,41 @@ class HTMLConverter(object):
                                 blockStyle=self.current_block.blockStyle,
                                 textStyle=ts)
             self.current_para = Paragraph()
+            return True
+        return False
     
     def add_text(self, tag, css):
         '''
         Add text to the current paragraph taking CSS into account.
         @param tag: Either a BeautifulSoup tag or a string
-        @param css:
-        @type css:
+        @param css: A dict
         '''
         src = tag.string if hasattr(tag, 'string') else tag
-        src = re.sub(r'\s{1,}', ' ', src) 
-        if self.lstrip_toggle:
+        src = src.replace('\r\n', '\n').replace('\r', '\n')
+        collapse_whitespace = not css.has_key('white-space') or css['white-space'] != 'pre'
+        if self.process_alignment(css) and collapse_whitespace:
+            # Don't want leading blanks in a new paragraph
             src = src.lstrip()
-            self.lstrip_toggle = False
-        if not src.strip():
-            self.current_para.append(' ')
+        args = self.sanctify_css(css), self.memory, self.profile.dpi, self.fonts,\
+                self.logger, self.font_delta, self.current_block.textStyle.attrs
+        if collapse_whitespace:
+            src = re.sub(r'\s{1,}', ' ', src)
+            if len(self.previous_text) != len(self.previous_text.rstrip()):
+                src = src.lstrip()
+            if len(src):
+                self.previous_text = src
+                self.current_para.append(Span(src, *args))    
         else:
-            self.process_alignment(css)
-            try:
-                self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
-                                              self.profile.dpi, self.fonts, self.logger, 
-                                              font_delta=self.font_delta))
-                self.current_para.normalize_spaces()
-            except ConversionError:
-                self.logger.exception('Bad text')
+            srcs = src.split('\n')
+            for src in srcs:
+                if src:
+                    self.current_para.append(Span(src, *args))
+                    if len(srcs) > 1:                
+                        self.line_break()
+        
+    def line_break(self):
+        self.current_para.append(CR())
+        self.previous_text = '\n'
         
     def sanctify_css(self, css):
         """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -770,7 +790,7 @@ class HTMLConverter(object):
                'padding' in test or 'border' in test or 'page-break' in test \
                or test.startswith('mso') or test.startswith('background')\
                or test.startswith('line') or test in ['color', 'display', \
-                           'letter-spacing', 'position']:
+                           'letter-spacing', 'position', 'white-space']:
                 css.pop(key)              
         return css
     
@@ -1032,7 +1052,7 @@ class HTMLConverter(object):
                     if not text.strip():
                         text = "Link"
                     self.add_text(text, tag_css)
-                    self.links[self.target_prefix].append(self.create_link(self.current_para.contents[-1], tag))
+                    self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
                     if tag.has_key('id') or tag.has_key('name'):
                         key = 'name' if tag.has_key('name') else 'id'
                         self.targets[self.target_prefix+tag[key]] = self.current_block
@@ -1131,28 +1151,19 @@ class HTMLConverter(object):
             if ncss:
                 update_css(ncss)            
         elif tagname == 'pre':
-            for c in tag.findAll(True):
-                c.replaceWith(self.get_text(c))
             self.end_current_para()
-            self.current_block.append_to(self.current_page)
-            attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, 
-                                    self.logger, self.font_delta, self.memory)
-            attrs['fontfacename'] = self.fonts['mono']['normal'][1]
-            ts = self.book.create_text_style(**self.unindented_style.attrs)
-            ts.attrs.update(attrs)
-            self.current_block = self.book.create_text_block(
-                                    blockStyle=self.current_block.blockStyle,
-                                    textStyle=ts)
-            src = ''.join([str(i) for i in tag.contents])
-            lines = src.split('\n')
-            for line in lines:
-                try:
-                    self.current_para.append(line)
-                    self.current_para.CR()
-                except ConversionError:
-                    pass
             self.end_current_block()
-            self.current_block = self.book.create_text_block()
+            self.current_block.textStyle = self.current_block.textStyle.copy()
+            self.current_block.textStyle.attrs['parindent'] = '0'
+            if tag.contents:
+                c = tag.contents[0]
+                if isinstance(c, NavigableString):
+                    c = str(c).replace('\r\n', '\n').replace('\r', '\n')
+                    if c.startswith('\n'):
+                        c = c[1:]
+                        tag.contents[0] = NavigableString(c)
+            self.process_children(tag, tag_css)
+            self.end_current_block()
         elif tagname in ['ul', 'ol', 'dl']:
             self.list_level += 1
             if tagname == 'ol':
@@ -1189,9 +1200,10 @@ class HTMLConverter(object):
                                         textStyle=self.unindented_style)
 
             if self.current_para.has_text():
-                self.current_para.append(CR())
+                self.line_break()
                 self.current_block.append(self.current_para)
             self.current_para = Paragraph()
+            self.previous_text = '\n'
             if tagname == 'li':
                 in_ol, parent = True, tag.parent            
                 while parent:                
@@ -1228,6 +1240,7 @@ class HTMLConverter(object):
                 self.block_styles.append(bs)
             self.current_block = self.book.create_text_block(
                                     blockStyle=bs, textStyle=ts)
+            self.previous_text = '\n'
             self.process_children(tag, tag_css)
             self.current_para.append_to(self.current_block)
             self.current_block.append_to(self.current_page)
@@ -1262,14 +1275,16 @@ class HTMLConverter(object):
             self.end_current_para()
             if not tag.contents or not src.strip(): # Handle empty <p></p> elements
                 self.current_block.append(CR())
+                self.previous_text = '\n'
                 self.process_children(tag, tag_css)
                 return
-            self.lstrip_toggle = True
+            self.previous_text = '\n'
             self.process_block(tag, tag_css, tkey)
             self.process_children(tag, tag_css)
             self.end_current_para()
             if tagname.startswith('h') or self.blank_after_para:
-                self.current_block.append(CR())            
+                self.current_block.append(CR())
+                self.previous_text = '\n'            
         elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
             self.process_children(tag, tag_css)
         elif tagname == 'font':
@@ -1277,16 +1292,19 @@ class HTMLConverter(object):
                 tag_css['font-family'] = tag['face']
             self.process_children(tag, tag_css)
         elif tagname in ['br']:
-            self.current_para.append(CR())
+            self.line_break()
+            self.previous_text = '\n'
         elif tagname in ['hr', 'tr']: # tr needed for nested tables
-            self.end_current_para()            
-            self.current_block.append(CR())
+            self.end_current_para()
+            self.line_break()
             self.end_current_block()
             if tagname == 'hr':
                 self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
+            self.previous_text = '\n'
             self.process_children(tag, tag_css)
         elif tagname == 'td': # Needed for nested tables
             self.current_para.append(' ')
+            self.previous_text = ' '
             self.process_children(tag, tag_css)
         elif tagname == 'table' and not self.ignore_tables and not self.in_table:
             tag_css = self.tag_css(tag) # Table should not inherit CSS
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index f399cfb24b..62fce386ae 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -72,10 +72,6 @@
  Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
  </p>
  <br />
- <p>
- The table conversion code is very new and likely to be swarming with bugs, so please report them at <br/><font name="monospace>https://libprs500.kovidgoyal.net/newticket</font>
- </p>
- <br/>
  <p style="page-break-after:always">
  On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
  </p>
@@ -122,7 +118,7 @@
  <blockquote>This is blockquoted text. It is rendered in a separate block with margins.</blockquote>The above text should be distinct from the rest of the paragraph.
  </p>
  <hr/>
- <p style='text-indent:30em'>A very indented paragraph</p>
+ <p style='text-indent:10em'>A very indented paragraph</p>
  <p style='text-indent:0em'>An unindented paragraph</p>
  <p>A default indented paragraph</p><br/>
  <hr/>
diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
index f476c32620..8915e91f6f 100644
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -251,52 +251,6 @@ class LrsContainer(object):
         self.validChildren = validChildren
         self.must_append = False
             
-        
-    def normalize_spaces(self, prior_text=False):
-        '''
-        Remove multiple spaces and handle &nbsp;
-        @param prior_text: True if the paragraph this container is part of
-                           has non whitespace text before this container. 
-        '''
-        temp = []
-        for i in range(len(self.contents)):
-            elem = self.contents[i]
-            try:
-                if isinstance(elem, Text):
-                    n = self.contents[i+1]
-                    if isinstance(n, Text):
-                        elem.text += n.text
-                        i += 1                        
-            except:
-                continue
-            finally:
-                temp.append(elem)
-        self.contents = temp
-        
-        def has_prior_text(idx):
-            for i in range(idx):
-                con = self.contents[i]
-                if hasattr(con, 'has_text') and con.has_text():
-                    return True
-            return False
-        
-        for i in range(len(self.contents)):
-            elem = self.contents[i]
-            if not prior_text and i > 0:
-                prior_text = has_prior_text(i)
-                
-            if isinstance(elem, Text):
-                src = elem.text
-                if isinstance(src, basestring):
-                    src = re.sub(r'\s{1,}', ' ', src)
-                    if isinstance(self.contents[i-1], (CR, DropCaps)) \
-                              or not prior_text:
-                        src = src.lstrip()                        
-                    src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup
-                elem.text = src
-            elif hasattr(elem, 'normalize_spaces'):
-                elem.normalize_spaces(prior_text)
-    
     def has_text(self):
         ''' Return True iff this container has non whitespace text '''
         if hasattr(self, 'text'):