Adjust the line_length amount (.3 → .5) so that it works with line lengths that have had duplicates removed. Enhance the unwrapping regex to account for more cases.

This commit is contained in:
John Schember 2009-06-22 21:12:17 -04:00
parent e426c9e60d
commit 3a87f0d065

View File

@@ -183,12 +183,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
length = line_length(html, .3)
length = line_length(html, .5)
line_length_rules = []
if length:
line_length_rules = [
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % length, re.UNICODE), wrap_lines),
(re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
]
rules = self.PDFTOHTML + line_length_rules