More sophisticated handling of blank spaces and added --wordspace

This commit is contained in:
Kovid Goyal 2007-07-31 22:25:04 +00:00
parent 0ace5e730b
commit c5d4f81ccb
4 changed files with 54 additions and 9 deletions

View File

@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software'''
__version__ = "0.3.81"
__version__ = "0.3.82"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'

View File

@ -103,6 +103,8 @@ def option_parser(usage):
dest='font_delta')
laf.add_option('--disable-autorotation', action='store_true', default=False,
help='Disable autorotation of images.', dest='disable_autorotation')
laf.add_option('--wordspace', dest='wordspace', default=2.5, type='float',
help='Set the space between words in pts. Default is %default')
page = parser.add_option_group('PAGE OPTIONS')
page.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
choices=profiles, action='callback', callback=profile_from_string,
@ -240,7 +242,8 @@ def Book(options, font_delta=0, header=None,
tsd = dict(fontsize=fontsize,
parindent=int(profile.parindent),
linespace=int(10*profile.line_space),
baselineskip=baselineskip)
baselineskip=baselineskip,
wordspace=10*options.wordspace)
if fonts['serif'] and fonts['serif'].has_key('normal'):
tsd['fontfacename'] = fonts['serif']['normal'][1]

View File

@ -47,7 +47,7 @@ from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
rules = zip(patterns, targets)
@ -229,9 +229,6 @@ class HTMLConverter(object):
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements
MARKUP_MASSAGE = [
# Convert &nbsp; into a normal space as the default
# conversion converts it into \xa0 which is not a space in LRF
(re.compile('&nbsp;'), lambda match : ' '),
# Close <a /> tags
(re.compile("(<a\s+.*?)/>|<a/>", re.IGNORECASE),
lambda match: match.group(1)+"></a>"),
@ -401,7 +398,6 @@ class HTMLConverter(object):
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
#print self.soup
print 'done\n\tConverting to BBeB...',
sys.stdout.flush()
self.verbose = verbose
@ -763,7 +759,8 @@ class HTMLConverter(object):
@param css:
@type css:
'''
src = tag.string if hasattr(tag, 'string') else tag
src = tag.string if hasattr(tag, 'string') else tag
src = re.sub(r'\s{1,}', ' ', src)
if self.lstrip_toggle:
src = src.lstrip()
self.lstrip_toggle = False
@ -774,6 +771,7 @@ class HTMLConverter(object):
try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
self.profile.dpi, self.fonts, font_delta=self.font_delta))
self.current_para.normalize_spaces()
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err

View File

@ -252,6 +252,51 @@ class LrsContainer(object):
self.must_append = False
def normalize_spaces(self, prior_text=False):
    '''
    Remove multiple spaces and handle &nbsp; in the text of this container.

    Runs of adjacent Text children are first merged into a single Text
    element. Then, for every Text child whose text is a string, each run
    of whitespace is collapsed to one space, leading whitespace is
    stripped when the text starts a line, and non-breaking spaces are
    turned into ordinary spaces. Children that are not Text but provide
    a normalize_spaces method are processed recursively.

    @param prior_text: True if the paragraph this container is part of
    has non whitespace text before this container.
    '''
    # Pass 1: merge each run of adjacent Text elements into the first one.
    # Build a new list instead of bumping a for-loop index: assigning to
    # the loop variable of a for statement does not skip an iteration, so
    # an index-bumping merge leaves the merged-in element in place and its
    # text ends up duplicated.
    merged = []
    for elem in self.contents:
        if merged and isinstance(elem, Text) and isinstance(merged[-1], Text):
            try:
                merged[-1].text += elem.text
                continue
            except (TypeError, UnicodeError):
                # Texts that cannot be concatenated are kept as separate
                # elements (best effort).
                pass
        merged.append(elem)
    self.contents = merged

    # Pass 2: normalize whitespace element by element, tracking whether any
    # earlier element contributed non whitespace text.
    for i, elem in enumerate(self.contents):
        if isinstance(elem, Text):
            src = elem.text
            if isinstance(src, basestring):
                src = re.sub(r'\s+', ' ', src)
                # Only look at the previous sibling when there is one;
                # at i == 0 an index of i - 1 would wrap to the last element.
                after_break = i > 0 and \
                    isinstance(self.contents[i - 1], (CR, DropCaps))
                if after_break or not prior_text:
                    src = src.lstrip()
                # nbsp is replaced with \xa0 by BeautifulSoup. It is
                # converted only after collapsing and stripping.
                # NOTE(review): presumably so that explicit &nbsp; spacing
                # survives those steps - confirm.
                src = src.replace(u'\xa0', ' ')
                elem.text = src
        elif hasattr(elem, 'normalize_spaces'):
            elem.normalize_spaces(prior_text)
        # Update the flag after the element has been normalized, so each
        # element is inspected once instead of rescanning the whole prefix
        # on every iteration. Assumes has_text() is a side-effect-free query.
        if not prior_text and hasattr(elem, 'has_text') and elem.has_text():
            prior_text = True
def has_text(self):
''' Return True iff this container has non whitespace text '''
if hasattr(self, 'text'):
@ -1508,7 +1553,6 @@ class Paragraph(LrsContainer):
if text is not None:
self.append(text)
def CR(self):
# Okay, here's a single autoappender for this common operation
cr = CR()