PDF preprocessing rule additions.

This commit is contained in:
John Schember 2009-06-14 20:33:55 -04:00
parent 0c3e217e41
commit ab9c4b39f0

View File

@@ -130,7 +130,11 @@ class HTMLPreProcessor(object):
# Have paragraphs show better # Have paragraphs show better
(re.compile(r'<br.*?>'), lambda match : '<p>'), (re.compile(r'<br.*?>'), lambda match : '<p>'),
# Clean up spaces # Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Connect paragraphs split by -
(re.compile(u'(?<=[^\s][-])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
# Remove - that splits words
(re.compile(u'(?<=[^\s])[-]+(?=[^\s])'), lambda match: ''),
# Add space before and after italics # Add space before and after italics
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),