diff --git a/resources/recipes/ajc.recipe b/resources/recipes/ajc.recipe
index ccd0efebdd..ea989b4b4c 100644
--- a/resources/recipes/ajc.recipe
+++ b/resources/recipes/ajc.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__   = 'GPL v3'
-__author__    = 'Tony Stegall'
+__author__    = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__   = '1.03'
 __date__      = '27, September 2010'
@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'

 import datetime

+from calibre.web.feeds.news import BasicNewsRecipe
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'
@@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 100
     no_stylesheets = True
-
+
     masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
     extra_css = '''
                 h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                 h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-
+
                 p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                 p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
-
-
+
+
                 p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                 '''
-
-
+
+
     keep_only_tags = [
                        dict(name='div', attrs={'class':['cxArticleHeader']})
                       ,dict(attrs={'id':['cxArticleText']})
                      ]
-
-
+
+
     remove_tags = [
                      dict(name='div' , attrs={'class':'cxArticleList'    })
                     ,dict(name='div' , attrs={'class':'cxFeedTease'      })
                     ,dict(name='div' , attrs={'class':'cxElementEnlarge' })
                     ,dict(name='div' , attrs={'id':'cxArticleTools'      })
                   ]
-
-
-
+
+
+
     feeds = [
               ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
               # -------------------------------------------------------------------
-              # Here are the different area feeds. Choose which ever one you wish to
+              # Here are the different area feeds. Choose whichever one you wish to
               # read by simply removing the pound sign from it. I currently have it
               # set to only get the Cobb area
               # --------------------------------------------------------------------
@@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
              ('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
              ('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
              # ------------------------------------------------------------------------
-             # Here are the different sports feeds. I only follow the Falcons, and Highschool
+             # Here are the different sports feeds. I only follow the Falcons and Highschool
              # but again
              # You can enable which ever team you like by removing the pound sign
              # ------------------------------------------------------------------------
@@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
              ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
            ]
-
+
     def postprocess_html(self, soup, first):
         for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
             credit_tag.extract()
-
+
         return soup
-
+
     #def print_version(self, url):
        # return url.partition('?')[0] +'?printArticle=y'
-
-
-
-
-
-
+
+
+
+
+
+
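
An aside on the ajc.recipe hunk: the only functional change is the new BasicNewsRecipe import; the rest is whitespace cleanup. Recipes are ordinary Python modules, so the base class must be in scope before the class statement runs. A minimal sketch (recipe name and feed URL invented):

    from calibre.web.feeds.news import BasicNewsRecipe

    class SampleRecipe(BasicNewsRecipe):   # NameError here without the import
        title = 'Sample'
        feeds = [('News', 'http://example.com/rss.xml')]
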
diff --git a/resources/recipes/boortz.recipe b/resources/recipes/boortz.recipe
index dfb624c4bc..b281798ac8 100644
--- a/resources/recipes/boortz.recipe
+++ b/resources/recipes/boortz.recipe
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 __license__   = 'GPL v3'
-__author__    = 'Tony Stegall'
+__author__    = 'Tony Stegall'
 __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
 __version__   = '1.04'
 __date__      = '27, September 2010'
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'

 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'
@@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     category = 'news, politics, USA, talkshow'
     oldest_article = 1
     max_articles_per_feed = 100
-
+
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
@@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     conversion_options = {'linearize_tables' : True}
     feeds = [
              ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
-
+
            ]
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
index 2bef7e4807..5f66d048a6 100644
--- a/resources/recipes/popscience.recipe
+++ b/resources/recipes/popscience.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re

 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'
@@ -13,35 +13,35 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = True
-
+
     masthead_url = 'http://www.raytheon.com/newsroom/rtnwcm/groups/Public/documents/masthead/rtn08_popscidec_masthead.jpg'
-
-
+
+
     feeds = [
-
+
              ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
              ('Cars', 'http://www.popsci.com/full-feed/cars'),
              ('Science', 'http://www.popsci.com/full-feed/science'),
              ('Technology', 'http://www.popsci.com/full-feed/technology'),
              ('DIY', 'http://www.popsci.com/full-feed/diy'),
-
+
            ]
-
-    #The following will get read of the Gallery: links when found
-
+
+    #The following will get rid of the Gallery: links when found
+
     def preprocess_html(self, soup) :
        print 'SOUP IS: ', soup
        weblinks = soup.findAll(['head','h2'])
        if weblinks is not None:
            for link in weblinks:
                if re.search('(Gallery)(:)',str(link)):
-
+
                    link.parent.extract()
        return soup
-    #-----------------------------------------------------------------
-
-
+    #-----------------------------------------------------------------
+
+
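
The popscience.recipe change swaps the unused BeautifulSoup import for the plain re module that preprocess_html() actually needs. A minimal sketch of what the Gallery filter does, with invented markup (each teaser sits in its own parent block, which is what makes link.parent.extract() safe):

    import re
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div><h2>Gallery: Ten Gadgets</h2></div>'
                         '<div><h2>Tech News</h2></div>')
    for link in soup.findAll(['head', 'h2']):
        if re.search('(Gallery)(:)', str(link)):
            link.parent.extract()   # removes the whole block holding the teaser
    print soup                      # -> <div><h2>Tech News</h2></div>
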
diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index 2b5eb5011e..0310f09242 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -1,3 +1,4 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'

+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []
+
     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi               = 168.451
     fbase             = 12
     fsizes            = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']
+

 class KoboReaderOutput(OutputProfile):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 92c2fe5954..bb5c26a50c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,49 +62,104 @@ def wrap_lines(match):
     else:
         return ital+' '

-def line_length(format, raw, percent):
+class DocAnalysis(object):
     '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to larged and does not include duplicates. 0.5 is the
-    median value.
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
     '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))

+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)

-    if not lengths:
-        return 0
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+        '''
+        lengths = []
+        for line in self.lines:
+            if len(line) > 0:
+                lengths.append(len(line))

-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
+        if not lengths:
+            return 0

-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = avg * 2

-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]

-    index = int(len(lengths) * percent) - 1
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0

-    return lengths[index]
+        index = int(len(lengths) * percent) - 1
+
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks. Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        #print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l/100)
+                #print "adding "+str(l)
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "            percents are: "+str(h)+"\n"
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True

 class Dehyphenator(object):
     '''
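
How the new DocAnalysis class is meant to be driven — a small sketch (sample text invented; the import assumes the patched preprocess.py):

    from calibre.ebooks.conversion.preprocess import DocAnalysis

    # Three similar-length lines, the shape pdftohtml output tends to have
    raw = ('<p>It was a dark and stormy night and the rain fell in torrents,</p>'
           '<p>except at occasional intervals, when it was checked by a gust</p>'
           '<p>of wind which swept up the streets rattling along the rooftops.</p>')
    d = DocAnalysis('html', raw)
    print d.line_length(0.4)    # unwrap threshold: the 40th-percentile line length
    print d.line_histogram(.50) # True here: nearly all lines land in one bucket

When line_histogram() returns False the line lengths are too variable to be hard line breaks, and the caller skips unwrapping entirely (see the utils.py hunk further down).
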
@@ -117,42 +172,62 @@ class Dehyphenator(object):

     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
-            #print "returned dehyphenated word: " + str(dehyphenated)
-            return dehyphenated
-        else:
-            #print "returned hyphenated word: " + str(hyphenated)
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
             return hyphenated
+        if self.format == 'html_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
+
+        else:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            else:
+                #print "          returned hyphenated word: " + str(hyphenated)
+                return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         html = intextmatch.sub(self.dehyphenate, html)
         return html
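
What the reworked Dehyphenator does with a hyphenated line break — a minimal sketch (sample string invented; 'html' mode, length=10 guards against matches too close to a line start):

    from calibre.ebooks.conversion.preprocess import Dehyphenator

    html = '<p>He was quite artis-</p><p>tic, and artistic people are rare.</p>'
    dehyphenator = Dehyphenator()
    print dehyphenator(html, 'html', 10)
    # -> '<p>He was quite artistic, and artistic people are rare.</p>'
    # 'artis-tic' is rejoined because the suffix-stripped lookup word is
    # found elsewhere in the text; otherwise the hyphenated form is kept.
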

-
 class CSSPreProcessor(object):

     PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')

@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),

@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()

-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
             length = -1
             if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-                length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+                docanalysis = DocAnalysis('pdf', html)
+                length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
                 if length:
-                    # print "The pdf line length returned is " + str(length)
+                    #print "The pdf line length returned is " + str(length)
+                    # unwrap em/en dashes
+                    end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                     end_rules.append(
                         # Un wrap using punctuation
-                        (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                        (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                     )

             for rule in self.PREPROCESS + start_rules:
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)

+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        if unsupported_unicode_chars:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = html.replace(char, asciichar)
+
         return html

     def smarten_punctuation(self, html):
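
The new end-of-pipeline step replaces code points a device renders badly; Sony readers get U+201F and U+201B mapped through calibre's bundled Unidecoder, exactly as the hunk above does it. A quick sketch (sample string invented):

    from calibre.ebooks.unidecode.unidecoder import Unidecoder

    unidecoder = Unidecoder()
    html = u'\u201fHello\u201d, she said.'
    for char in [u'\u201f', u'\u201b']:   # SonyReaderOutput's new list
        html = html.replace(char, unidecoder.decode(char))
    print html   # U+201F should come back as a plain ASCII quote
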
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6a5eaa4a34..5f5c12a703 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log

 class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):

     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt procesor to mark up if so
+            # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
@@ -113,47 +118,77 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)

-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, 'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
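
The string-assembly above trades one unreadable regex for named pieces that can be recombined per pass. A reduced sketch of the same idea (one wrapper level instead of the three used in the patch; sample markup invented):

    import re

    lookahead  = r"(?=<(p|div))"
    line_open  = r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
    chap_open  = r"(?P<chap>"
    chap_close = r")\s*"
    line_close = r"(</(?P=inner1)>)?\s*</(?P=outer)>\s*"
    typical    = r".?(Chapter|Prologue|Epilogue)\s*([\d\w-]+\s*){0,4}"

    marker = lookahead + line_open + chap_open + typical + chap_close + line_close
    m = re.search(marker, '<p><b>Chapter 12</b></p>', re.IGNORECASE)
    print m.group('chap')   # -> 'Chapter 12'

Swapping the typical piece for the numeric or uppercase alternative is all the later, more aggressive passes change.
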
-        ###### Unwrap lines ######
-        #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
         # that lines can be un-wrapped across page boundaries
@@ -168,25 +203,40 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
-
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
-        # Dehyphenate
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'html', length)
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+            # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
+            # Unwrap lines using punctuation and line length
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            html = unwrap.sub(' ', html)
+            #check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")

-        # Unwrap lines using punctation and line length
-        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        html = unwrap.sub(' ', html)
+        # delete soft hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10: