Replace non-breaking spaces with spaces.

This commit is contained in:
John Schember 2009-10-27 07:44:55 -04:00
parent 68e3acd43a
commit 66f7802f9e

View File

@@ -153,6 +153,10 @@ class PMLMLizer(object):
for unused in anchors.difference(links): for unused in anchors.difference(links):
text = text.replace('\\Q="%s"' % unused, '') text = text.replace('\\Q="%s"' % unused, '')
# Replace bad characters.
text = text.replace(u'\xc2', '')
text = text.replace(u'\xa0', ' ')
# Turn all html entities into unicode. This should not be necessary as # Turn all html entities into unicode. This should not be necessary as
# lxml should have already done this but we want to be sure it happens. # lxml should have already done this but we want to be sure it happens.
for entity in set(re.findall('&.+?;', text)): for entity in set(re.findall('&.+?;', text)):