cleaned up comments

This commit is contained in:
ldolse 2011-01-07 01:57:00 +08:00
parent 760d4d2fd3
commit 93bd1df11a
2 changed files with 3 additions and 6 deletions

View File

@@ -363,11 +363,6 @@ class HTMLPreProcessor(object):
# Remove gray background # Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
# Detect Chapters to match default XPATH in GUI
#(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
#(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
# Convert line breaks to paragraphs # Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),

View File

@@ -18,7 +18,9 @@ properties counted:
* non_asian_words * non_asian_words
* words * words
Python License Sourced from:
http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
""" """
__version__ = 0.1 __version__ = 0.1
__author__ = "Ryan Ginstrom" __author__ = "Ryan Ginstrom"