#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj


class HeuristicProcessor(object):

    def __init__(self, extra_opts=None, log=None):
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0
        self.extra_opts = extra_opts
        self.deleted_nbsps = False
        self.totalwords = 0
        self.min_chapters = 1
        self.chapters_no_title = 0
        self.chapters_with_title = 0
        self.blanks_deleted = False
        self.blanks_between_paragraphs = False
        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
        self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
        self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
        self.common_in_text_endings = u'[\"\'—’”,\.!\?\…\)„\w]'
        self.common_in_text_beginnings = u'[\w\'\"“‘‛]'

    def is_pdftohtml(self, src):
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def is_abbyy(self, src):
        return '<meta name="generator" content="ABBYY FineReader' in src[:1000]

    def chapter_head(self, match):
        from calibre.utils.html2text import html2text
        chap = match.group('chap')
        title = match.group('title')
        if not title:
            self.html_preprocess_sections = self.html_preprocess_sections + 1
            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                    " chapters. - " + unicode(chap))
            return '<h2>'+chap+'</h2>\n'
        else:
            delete_whitespace = re.compile('^\s*(?P<c>.*?)\s*$')
            delete_quotes = re.compile('\'\"')
            txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(chap)))
            txt_title = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(title)))
            self.html_preprocess_sections = self.html_preprocess_sections + 1
            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                    " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
            return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'

    def chapter_break(self, match):
        chap = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                " section markers based on punctuation. - " + unicode(chap))
        return '<'+styles+' style="page-break-before:always">'+chap

    def analyze_title_matches(self, match):
        # chap = match.group('chap')
        title = match.group('title')
        if not title:
            self.chapters_no_title = self.chapters_no_title + 1
        else:
            self.chapters_with_title = self.chapters_with_title + 1

    def insert_indent(self, match):
        pstyle = match.group('formatting')
        span = match.group('span')
        self.found_indents = self.found_indents + 1
        if pstyle:
            # Append to an existing style attribute; str.find() returns -1
            # (truthy) when the substring is absent, so test membership instead.
            if 'style' in pstyle.lower():
                pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
            else:
                pstyle = pstyle+' style="text-indent:3%"'
            if not span:
                return '<p '+pstyle+'>'
            else:
                return '<p '+pstyle+'>'+span
        else:
            if not span:
                return '<p style="text-indent:3%">'
            else:
                return '<p style="text-indent:3%">'+span
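
    # Illustrative example (not in the original source): fix_nbsp_indents
    # feeds its matches to insert_indent, so a paragraph opening like
    # '<p style="margin:0">' followed by two or more non-breaking spaces
    # becomes '<p style="margin:0; text-indent:3%">', while a <p> with no
    # style attribute gains style="text-indent:3%" instead.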

    def no_markup(self, raw, percent):
        '''
        Detects total marked up line endings in the file. raw is the text to
        inspect. percent is the minimum fraction of line endings that must be
        marked up; returns True if fewer than that fraction carry closing
        markup, i.e. markup needs to be added.
        '''
        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
        htm_end = htm_end_ere.findall(raw)
        line_end = line_end_ere.findall(raw)
        tot_htm_ends = len(htm_end)
        tot_ln_fds = len(line_end)
        # self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
        #         unicode(tot_htm_ends) + " marked up endings")

        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0

        min_lns = tot_ln_fds * percent
        # self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
        return min_lns > tot_htm_ends
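
    # Worked example (not in the original source): a 1000-line file with 40
    # </p> or </div> endings and percent=0.1 gives min_lns = 1000 * 0.1 = 100;
    # since 100 > 40, no_markup() returns True and the caller adds paragraph
    # markup via markup_pre().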

    def dump(self, raw, where):
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if dp and os.path.exists(dp):
            odir = os.path.join(dp, 'preprocess')
            if not os.path.exists(odir):
                os.makedirs(odir)
            if os.path.exists(odir):
                odir = os.path.join(odir, where)
                if not os.path.exists(odir):
                    os.makedirs(odir)
                name, i = None, 0
                while not name or os.path.exists(os.path.join(odir, name)):
                    i += 1
                    name = '%04d.html'%i
                with open(os.path.join(odir, name), 'wb') as f:
                    f.write(raw.encode('utf-8'))

    def get_word_count(self, html):
        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
        word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
        wordcount = get_wordcount_obj(word_count_text)
        return wordcount.words
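
    # Illustrative example (not in the original source): the <head> block and
    # then all remaining tags are stripped before counting, so
    # get_word_count('<head><title>x</title></head><p>one two</p>') is
    # expected to return 2.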

    def markup_italicis(self, html):
        ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]

        ITALICIZE_STYLE_PATS = [
            r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
            r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
            r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
            r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
            r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
            r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
            r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
            r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
            r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
            r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
            r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
        ]

        for word in ITALICIZE_WORDS:
            html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)

        for pat in ITALICIZE_STYLE_PATS:
            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)

        return html

    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
        '''
        Searches for common chapter headings throughout the document.
        Attempts multiple patterns in order of likelihood of a match,
        preferring those with minimum false positives. Exits after finding
        a successful pattern.
        '''
        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
        # or pdf page numbers from being treated as TOC markers
        max_chapters = 150
        typical_chapters = 7000.
        if wordcount > 7000:
            if wordcount > 200000:
                typical_chapters = 15000.
            self.min_chapters = int(ceil(wordcount / typical_chapters))
        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

        # Build the Regular Expressions in pieces
        init_lookahead = "(?=<(p|div))"
        chapter_line_open = self.line_open
        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
        chapter_header_open = r"(?P<chap>"
        title_header_open = r"(?P<title>"
        chapter_header_close = ")\s*"
        title_header_close = ")"
        chapter_line_close = self.line_close
        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            title_line_open = "<(?P<outer2>p)[^>]*>\s*"
            title_line_close = "\s*</(?P=outer2)>"

        if blanks_between_paragraphs:
            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
        else:
            blank_lines = ""
        opt_title_open = "("
        opt_title_close = ")?"
        n_lookahead_open = "(?!\s*"
        n_lookahead_close = ")\s*"

        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"

        analysis_result = []

        chapter_types = [
            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'],  # Emphasized lines
            [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'],  # Numeric Titles
            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase'],  # Uppercase Chapters
        ]

        def recurse_patterns(html, analyze):
            # Start with most typical chapter headings, get more aggressive until one works
            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
                n_lookahead = ''
                hits = 0
                self.chapters_no_title = 0
                self.chapters_with_title = 0

                if n_lookahead_req:
                    lp_n_lookahead_open = n_lookahead_open
                    lp_n_lookahead_close = n_lookahead_close
                else:
                    lp_n_lookahead_open = ''
                    lp_n_lookahead_close = ''

                if strict_title:
                    lp_title = default_title
                else:
                    lp_title = simple_title

                if ignorecase:
                    arg_ignorecase = r'(?i)'
                else:
                    arg_ignorecase = ''

                if title_req:
                    lp_opt_title_open = ''
                    lp_opt_title_close = ''
                else:
                    lp_opt_title_open = opt_title_open
                    lp_opt_title_close = opt_title_close

                if self.html_preprocess_sections >= self.min_chapters:
                    break
                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
                if n_lookahead_req:
                    # Mangle the named groups (outer -> lookahead_ter, etc.) so the
                    # negative lookahead can reuse the chapter pattern without
                    # duplicating group names in the combined regex.
                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
                if not analyze:
                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)

                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
                chapdetect = re.compile(r'%s' % chapter_marker)

                if analyze:
                    hits = len(chapdetect.findall(html))
                    if hits:
                        chapdetect.sub(self.analyze_title_matches, html)
                        if float(self.chapters_with_title) / float(hits) > .5:
                            title_req = True
                            strict_title = False
                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
                        if type_name == 'common':
                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                        elif self.min_chapters <= hits < max_chapters or (self.min_chapters < 3 and hits < 3):
                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                            break
                else:
                    html = chapdetect.sub(self.chapter_head, html)
            return html

        recurse_patterns(html, True)
        chapter_types = analysis_result
        html = recurse_patterns(html, False)

        words_per_chptr = wordcount
        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
            words_per_chptr = wordcount / self.html_preprocess_sections
        self.log.debug("Total wordcount is: "+str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
        return html
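
    # Illustrative example (not in the original source): a line such as
    # '<p>CHAPTER 3</p>' matched by the 'chapter' pattern is handed to
    # chapter_head, which rewrites it as '<h2>CHAPTER 3</h2>'; if a title
    # line follows, it becomes an adjacent <h3 class="sigilNotInTOC"> heading.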

    def punctuation_unwrap(self, length, content, format):
        '''
        Unwraps lines based on line length and punctuation.
        Supports a range of html markup and text files.
        '''
        # define the pieces of the regex
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))"  # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening

        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
            shy_unwrap_regex = soft_hyphen+txt_line_wrap

        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)

        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
        content = shy_unwrap.sub('', content)
        return content
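
    # Illustrative example (not in the original source): with length=15 and
    # format='html', '<p>He walked down the</p>\n<p>road slowly.</p>' ends its
    # first line on a lowercase letter with no closing punctuation, so the
    # </p>...<p> pair between 'the' and 'road' is replaced by a single space,
    # rejoining the wrapped sentence.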

    def txt_process(self, match):
        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
            separate_paragraphs_single_line
        content = match.group('text')
        content = separate_paragraphs_single_line(content)
        content = preserve_spaces(content)
        content = convert_basic(content, epub_split_size_kb=0)
        return content

    def markup_pre(self, html):
        pre = re.compile(r'<pre>', re.IGNORECASE)
        if len(pre.findall(html)) >= 1:
            self.log.debug("Running Text Processing")
            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
            html = outerhtml.sub(self.txt_process, html)
        else:
            # Add markup naively
            # TODO - find out if there are cases where there are more than one <pre> tag or
            # other types of unmarked html and handle them in some better fashion
            add_markup = re.compile('(?<!>)(\n)')
            html = add_markup.sub('</p>\n<p>', html)
        return html

    def arrange_htm_line_endings(self, html):
        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
        return html

    def fix_nbsp_indents(self, html):
        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
            self.log.debug("replaced "+unicode(self.found_indents)+" nbsp indents with inline styles")
        return html

    def cleanup_markup(self, html):
        # remove remaining non-breaking spaces
        html = re.sub(ur'\u00a0', ' ', html)
        # Get rid of various common microsoft specific tags which can cause issues later
        # Get rid of empty <o:p> tags to simplify other processing
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Delete microsoft 'smart' tags
        html = re.sub('(?i)</?st1:\w+>', '', html)
        # Re-open self closing paragraph tags
        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
        # Get rid of empty span, bold, font, em, & italics tags; the substitutions
        # run twice so tags emptied by the first pass are also removed
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
        # delete surrounding divs from empty paragraphs
        html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
        # Empty heading tags
        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
        self.deleted_nbsps = True
        return html

    def analyze_line_endings(self, html):
        '''
        Determines the type of html line ending used most commonly in a document.
        Use before calling docanalysis functions.
        '''
        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
        paras = len(paras_reg.findall(html))
        spans = len(spans_reg.findall(html))
        if spans > 1:
            if float(paras) / float(spans) < 0.75:
                return 'spanned_html'
            else:
                return 'html'
        else:
            return 'html'
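
    # Illustrative example (not in the original source): 100 <p> tags against
    # 200 <span> tags gives paras/spans = 0.5 < 0.75, so the document is
    # classified as 'spanned_html' (spans mark hard line breaks); with 180 <p>
    # tags the ratio is 0.9 and plain 'html' is returned.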

    def analyze_blanks(self, html):
        blanklines = self.blankreg.findall(html)
        lines = self.linereg.findall(html)
        if len(lines) > 1:
            self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")

            if float(len(blanklines)) / float(len(lines)) > 0.40:
                return True
            else:
                return False

    def cleanup_required(self):
        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
            if getattr(self.extra_opts, option, False):
                return True
        return False

    def merge_blanks(self, html, blanks_count=None):
        base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
        em_per_line = 1.5  # Add another 1.5 em for each additional blank

        def merge_matches(match):
            to_merge = match.group(0)
            lines = float(len(self.single_blank.findall(to_merge))) - 1.
            em = base_em + (em_per_line * lines)
            # str.find() returns -1 (truthy) when the substring is absent, so
            # test membership; otherwise the softbreak branch is unreachable.
            if 'whitespace' in to_merge:
                newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
            else:
                newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
            return newline

        html = self.any_multi_blank.sub(merge_matches, html)
        return html
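
    # Worked example (not in the original source): a run of three blank
    # paragraphs gives lines = 3 - 1 = 2, so em = .5 + 1.5 * 2 = 3.5 and the
    # run collapses to a single <p class="softbreak35"> placeholder with
    # margin-top:3.5em.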

    def detect_whitespace(self, html):
        blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)

        def merge_header_whitespace(match):
            initblanks = match.group('initparas')
            # The bottom margin must come from the trailing blanks, not a
            # second copy of the leading ones.
            endblanks = match.group('endparas')
            heading = match.group('heading')
            top_margin = ''
            bottom_margin = ''
            if initblanks is not None:
                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
            if endblanks is not None:
                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'

            if initblanks is None and endblanks is None:
                return heading
            else:
                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
                return heading

        html = blanks_around_headings.sub(merge_header_whitespace, html)

        def markup_whitespaces(match):
            blanks = match.group(0)
            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
            return blanks

        html = blanks_n_nopunct.sub(markup_whitespaces, html)
        if self.html_preprocess_sections > self.min_chapters:
            html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)

        return html

    def detect_soft_breaks(self, html):
        if not self.blanks_deleted and self.blanks_between_paragraphs:
            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
        else:
            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
        return html

    def markup_user_break(self, replacement_break):
        '''
        Takes a string the user supplies and wraps it in markup that will be centered with
        appropriate margins. <hr> and <img> tags are allowed. If the user specifies
        a style with width attributes in the <hr> tag then the appropriate margins are
        applied to wrapping divs. This is because many ebook devices don't support margin:auto.
        All other html is converted to text.
        '''
        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
        if re.findall('(<|>)', replacement_break):
            if re.match('^<hr', replacement_break):
                if replacement_break.find('width') != -1:
                    width = int(re.sub('.*?width(:|=)(?P<wnum>\d+).*', '\g<wnum>', replacement_break))
                    replacement_break = re.sub('(?i)(width=\d+\%?|width:\s*\d+(\%|px|pt|em)?;?)', '', replacement_break)
                    divpercent = (100 - width) / 2
                    hr_open = re.sub('45', str(divpercent), hr_open)
                    scene_break = hr_open+replacement_break+'</div>'
                else:
                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
            elif re.match('^<img', replacement_break):
                scene_break = self.scene_break_open+replacement_break+'</p>'
            else:
                from calibre.utils.html2text import html2text
                replacement_break = html2text(replacement_break)
                replacement_break = re.sub('\s', ' ', replacement_break)
                scene_break = self.scene_break_open+replacement_break+'</p>'
        else:
            replacement_break = re.sub('\s', ' ', replacement_break)
            scene_break = self.scene_break_open+replacement_break+'</p>'

        return scene_break
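
    # Worked example (not in the original source): for
    # replacement_break = '<hr width="50%">', width parses to 50, the width
    # attribute is stripped, and divpercent = (100 - 50) / 2 = 25, so the <hr>
    # is wrapped in a div with margin-left/right of 25% and stays centered on
    # readers that ignore margin:auto.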

    def abbyy_processor(self, html):
        abbyy_line = re.compile('((?P<linestart><p\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
        empty_paragraph = '\n<p> </p>\n'
        self.in_blockquote = False
        self.previous_was_paragraph = False
        html = re.sub('</?a[^>]*>', '', html)

        def check_paragraph(content):
            content = re.sub('\s*</?span[^>]*>\s*', '', content)
            if re.match('.*[\"\'.!?:]$', content):
                # print "detected this as a paragraph"
                return True
            else:
                return False

        def convert_styles(match):
            # print "raw styles are: "+match.group('styles')
            content = match.group('content')
            # print "raw content is: "+match.group('content')
            image = match.group('image')

            is_paragraph = False
            text_align = ''
            text_indent = ''
            paragraph_before = ''
            paragraph_after = ''
            blockquote_open = '\n<blockquote>\n'
            blockquote_close = '</blockquote>\n'
            indented_text = 'text-indent:3%;'
            blockquote_open_loop = ''
            blockquote_close_loop = ''
            debugabby = False

            if image:
                debugabby = True
                if self.in_blockquote:
                    self.in_blockquote = False
                    blockquote_close_loop = blockquote_close
                self.previous_was_paragraph = False
                return blockquote_close_loop+'\n'+image+'\n'
            else:
                styles = match.group('styles').split(';')
                is_paragraph = check_paragraph(content)
                # print "styles for this line are: "+str(styles)
                split_styles = []
                for style in styles:
                    # print "style is: "+str(style)
                    newstyle = style.split(':')
                    # print "newstyle is: "+str(newstyle)
                    split_styles.append(newstyle)
                styles = split_styles
                for style, setting in styles:
                    if style == 'text-align' and setting != 'left':
                        text_align = style+':'+setting+';'
                    if style == 'text-indent':
                        setting = int(re.sub('\s*pt\s*', '', setting))
                        if 9 < setting < 14:
                            text_indent = indented_text
                        else:
                            text_indent = style+':'+str(setting)+'pt;'
                    if style == 'padding':
                        setting = re.sub('pt', '', setting).split(' ')
                        if int(setting[1]) < 16 and int(setting[3]) < 16:
                            if self.in_blockquote:
                                debugabby = True
                                if is_paragraph:
                                    self.in_blockquote = False
                                    blockquote_close_loop = blockquote_close
                            if int(setting[3]) > 8 and text_indent == '':
                                text_indent = indented_text
                            if int(setting[0]) > 5:
                                paragraph_before = empty_paragraph
                            if int(setting[2]) > 5:
                                paragraph_after = empty_paragraph
                        elif not self.in_blockquote and self.previous_was_paragraph:
                            debugabby = True
                            self.in_blockquote = True
                            blockquote_open_loop = blockquote_open
                        if debugabby:
                            self.log.debug('\n\n******\n')
                            self.log.debug('padding top is: '+str(setting[0]))
                            self.log.debug('padding right is: '+str(setting[1]))
                            self.log.debug('padding bottom is: '+str(setting[2]))
                            self.log.debug('padding left is: '+str(setting[3]))

                # print "text-align is: "+str(text_align)
                # print "\n***\nline is:\n "+str(match.group(0))+'\n'
                if debugabby:
                    # print "this line is a paragraph = "+str(is_paragraph)+", previous line was "+str(self.previous_was_paragraph)
                    self.log.debug("styles for this line were:", styles)
                    self.log.debug('newline is:')
                    self.log.debug(blockquote_open_loop+blockquote_close_loop+
                            paragraph_before+'<p style="'+text_indent+text_align+
                            '">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
                # print "is_paragraph is "+str(is_paragraph)+", previous_was_paragraph is "+str(self.previous_was_paragraph)
                self.previous_was_paragraph = is_paragraph
                # print "previous_was_paragraph is now set to "+str(self.previous_was_paragraph)+"\n\n\n"
                return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after

        html = abbyy_line.sub(convert_styles, html)
        return html

    def __call__(self, html):
        self.log.debug("********* Heuristic processing HTML *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
            self.totalwords = self.get_word_count(html)
        except:
            self.log.warn("Can't get wordcount")

        if self.totalwords < 50:
            self.log.warn("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
        if is_abbyy:
            html = self.abbyy_processor(html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        # self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            ###### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and mark up line endings if required before proceeding
            # fix indents must run after this step
            if self.no_markup(html, 0.1):
                self.log.debug("not enough paragraph markers, adding now")
                # markup using text processing
                html = self.markup_pre(html)

        # Replace series of non-breaking spaces with text-indent
        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"

        # ADE doesn't render <br />, change to empty paragraphs
        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

        # Determine whether the document uses interleaved blank lines
        self.blanks_between_paragraphs = self.analyze_blanks(html)

        # detect chapters/sections to match xpath or splitting logic

        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
        # self.dump(html, 'after_chapter_markup')

        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)

        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
            html = self.blankreg.sub('', html)

        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
        # that lines can be un-wrapped across page boundaries
        format = self.analyze_line_endings(html)

        # Check line histogram to determine if the document uses hard line breaks. If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")

        ###### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or
            # if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log.debug("Unwrapping required, unwrapping Lines")
                # Dehyphenate with line length limiters
                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html, 'html', length)
                html = self.punctuation_unwrap(length, html, 'html')

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log.debug("Fixing hyphenated content")
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html, 'html_cleanup', length)
            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping, mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading. demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't
        # scenebreaks and style them with the 'whitespace' class. All remaining blank lines are
        # styled as softbreaks. Multiple sequential blank paragraphs are merged with appropriate
        # margins. If non-blank scene breaks exist they are center aligned and styled with
        # appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
            scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
                if len(scene_break.findall(html)) >= 1:
                    html = scene_break.sub(replacement_break, html)
                else:
                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
            else:
                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
        return html
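

# A minimal usage sketch (not part of the original module). The Options class
# below is hypothetical; its attribute names are simply the ones read via
# getattr() in __call__ above, and anything omitted falls back to the same
# defaults used there.
#
#     class Options(object):
#         verbose = False
#         unwrap_lines = True
#         html_unwrap_factor = 0.4
#         delete_blank_paragraphs = True
#         markup_chapter_headings = True
#         format_scene_breaks = True
#         dehyphenate = True
#
#     preprocessor = HeuristicProcessor(extra_opts=Options())
#     fixed_html = preprocessor(raw_html_unicode_string)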