More sophisticated handling of blank spaces and added --wordspace

2025-07-09 03:04:10 -04:00 · 2007-07-31 22:25:04 +00:00 · 2007-07-31 22:25:04 +00:00 · c5d4f81ccb
commit c5d4f81ccb
parent 0ace5e730b
4 changed files with 54 additions and 9 deletions
--- a/src/libprs500/init.py
+++ b/src/libprs500/init.py
@ -13,7 +13,7 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__   = "0.3.81"
+__version__   = "0.3.82"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__   = 'libprs500'
--- a/src/libprs500/ebooks/lrf/init.py
+++ b/src/libprs500/ebooks/lrf/init.py
@ -103,6 +103,8 @@ def option_parser(usage):
                      dest='font_delta')
    laf.add_option('--disable-autorotation', action='store_true', default=False, 
                   help='Disable autorotation of images.', dest='disable_autorotation')
+    laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
+                   help='Set the space between words in pts. Default is %default')
    page = parser.add_option_group('PAGE OPTIONS')
    page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
                      choices=profiles, action='callback', callback=profile_from_string,
@ -240,7 +242,8 @@ def Book(options, font_delta=0, header=None,
    tsd = dict(fontsize=fontsize, 
               parindent=int(profile.parindent), 
               linespace=int(10*profile.line_space),
-               baselineskip=baselineskip)
+               baselineskip=baselineskip,
+               wordspace=10*options.wordspace)
    if fonts['serif'] and fonts['serif'].has_key('normal'):
        tsd['fontfacename'] = fonts['serif']['normal'][1]
    
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -47,7 +47,7 @@ from libprs500 import extract, filename_to_utf8
 from libprs500.ptempfile import PersistentTemporaryFile

 class Span(_Span):
-    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
+    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    rules = zip(patterns, targets)
@ -229,9 +229,6 @@ class HTMLConverter(object):
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
    # Fix <a /> elements 
    MARKUP_MASSAGE   = [
-                        # Convert &nbsp; into a normal space as the default 
-                        # conversion converts it into \xa0 which is not a space in LRF
-                        (re.compile('&nbsp;'), lambda match : ' '),
                        # Close <a /> tags
                        (re.compile("(<a\s+.*?)/>|<a/>", re.IGNORECASE), 
                         lambda match: match.group(1)+"></a>"),
@ -401,7 +398,6 @@ class HTMLConverter(object):
        self.soup = BeautifulSoup(raw, 
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)
-        #print self.soup
        print 'done\n\tConverting to BBeB...',
        sys.stdout.flush()
        self.verbose = verbose        
@ -763,7 +759,8 @@ class HTMLConverter(object):
        @param css:
        @type css:
        '''
-        src = tag.string if hasattr(tag, 'string') else tag 
+        src = tag.string if hasattr(tag, 'string') else tag
+        src = re.sub(r'\s{1,}', ' ', src) 
        if self.lstrip_toggle:
            src = src.lstrip()
            self.lstrip_toggle = False
@ -774,6 +771,7 @@ class HTMLConverter(object):
            try:
                self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
                                              self.profile.dpi, self.fonts, font_delta=self.font_delta))
+                self.current_para.normalize_spaces()
            except ConversionError, err:
                if self.verbose:
                    print >>sys.stderr, err
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@ -252,6 +252,51 @@ class LrsContainer(object):
        self.must_append = False
            
        
+    def normalize_spaces(self, prior_text=False):
+        '''
+        Remove multiple spaces and handle &nbsp;
+        Adjacent Text children are merged, runs of whitespace are collapsed
+        to a single space and children that define normalize_spaces are
+        processed recursively. The container is modified in place.
+        @param prior_text: True if the paragraph this container is part of
+                           has non whitespace text before this container.
+        '''
+        # Merge each run of adjacent Text elements into its first member so
+        # that whitespace spanning an element boundary is collapsed too. Only
+        # string payloads are merged; anything else is left untouched.
+        merged = []
+        for elem in self.contents:
+            prev = merged[-1] if merged else None
+            if isinstance(elem, Text) and isinstance(prev, Text) \
+                      and isinstance(elem.text, basestring) \
+                      and isinstance(prev.text, basestring):
+                prev.text += elem.text
+            else:
+                merged.append(elem)
+        self.contents = merged
+
+        for i, elem in enumerate(self.contents):
+            if isinstance(elem, Text):
+                src = elem.text
+                if isinstance(src, basestring):
+                    src = re.sub(r'\s{1,}', ' ', src)
+                    # Guard i > 0: contents[-1] is the last element, not a
+                    # predecessor of the first one.
+                    after_break = i > 0 and \
+                        isinstance(self.contents[i-1], (CR, DropCaps))
+                    if after_break or not prior_text:
+                        src = src.lstrip()
+                    # nbsp is replaced with \xa0 by BeautifulSoup
+                    src = src.replace(u'\xa0', ' ')
+                elem.text = src
+            elif hasattr(elem, 'normalize_spaces'):
+                elem.normalize_spaces(prior_text)
+            # Every later sibling is preceded by elem, so record whether it
+            # contributed any non whitespace text.
+            if not prior_text and hasattr(elem, 'has_text'):
+                prior_text = bool(elem.has_text())
+    
    def has_text(self):
        ''' Return True iff this container has non whitespace text '''
        if hasattr(self, 'text'):
@ -1508,7 +1553,6 @@ class Paragraph(LrsContainer):
        if text is not None:
            self.append(text)
        
-    
    def CR(self):
        # Okay, here's a single autoappender for this common operation
        cr = CR()