def strip_style_comments(match):
    """Return the text of *match* with all ``/* ... */`` comments removed.

    Intended for use as the replacement callback of ``re.sub``: ``match`` is
    a regular-expression match object and the whole matched text
    (``match.group()``) is processed.

    A comment runs from ``/*`` up to and including the first ``*/`` that
    follows the opener.  If a comment is never closed, everything from its
    ``/*`` to the end of the matched text is dropped.
    """
    src = match.group()
    while True:
        # Always rescan from the start: splicing a comment out can bring a
        # '/' and a '*' together, and that newly formed opener is stripped too.
        lindex = src.find('/*')
        if lindex < 0:
            break
        # Look for the terminator *after* the two-character opener so the
        # '*' of '/*' is never reused as the '*' of '*/' (as in '/*/').
        rindex = src.find('*/', lindex + 2)
        if rindex < 0:
            # Unterminated comment: discard the rest of the text.
            src = src[:lindex]
            break
        src = src[:lindex] + src[rindex + 2:]
    return src