PDF input: More special character fixes.

This commit is contained in:
John Schember 2009-07-31 07:57:49 -04:00
parent 2bd8cb9059
commit 32192a5a72

View File

@ -114,7 +114,7 @@ class HTMLPreProcessor(object):
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
# Fix French accents
# Fix accents
(re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
@ -126,6 +126,32 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
(re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
(re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
(re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
(re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
(re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
(re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
# Fix more special characters
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
(re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
# Remove <hr> tags