diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe index cda9bf83d2..b7f9cd3c6c 100644 --- a/resources/recipes/infobae.recipe +++ b/resources/recipes/infobae.recipe @@ -1,12 +1,8 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' infobae.com ''' -import re -import urllib, urlparse from calibre.web.feeds.news import BasicNewsRecipe @@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - language = 'es' - lang = 'es-AR' - + language = 'es' encoding = 'cp1252' - cover_url = 'http://www.infobae.com/imgs/header/header.gif' + masthead_url = 'http://www.infobae.com/imgs/header/header.gif' remove_javascript = True - preprocess_regexps = [(re.compile( - r''), lambda m:'')] - - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - extra_css = ''' - .col-center{font-family:Arial,Helvetica,sans-serif;} - h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;} - .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;} - ''' - - keep_only_tags = [dict(name='div', attrs={'class':['content']})] - - - remove_tags = [ - dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}), - dict(name='a', attrs={'name' : 'comentario',}), - dict(name='iframe'), - dict(name='img', alt = "Ver galerias de imagenes"), - - ] - + remove_empty_feeds = True + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif;} + .popUpTitulo{color:#0D4261; font-size: xx-large} + ''' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + feeds = [ (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) @@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe): ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' ) ] -# def print_version(self, url): -# main, sep, article_part = url.partition('contenidos/') -# article_id, rsep, rrest = article_part.partition('-') -# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id - - def get_article_url(self, article): - ans = article.get('link').encode('utf-8') - parts = list(urlparse.urlparse(ans)) - parts[2] = urllib.quote(parts[2]) - ans = urlparse.urlunparse(parts) - return ans.decode('utf-8') - - - def preprocess_html(self, soup): - - for tag in soup.head.findAll('strong'): - tag.extract() - for tag in soup.findAll('meta'): - del tag['content'] - tag.extract() - - mtag = '\n\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - - return soup + def print_version(self, url): + article_part = url.rpartition('/')[2] + article_id= article_part.partition('-')[0] + return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id def postprocess_html(self, soup, first): - for tag in soup.findAll(name='strong'): tag.name = 'b' - return soup diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe index 13ff42b277..58b782415b 100644 --- a/resources/recipes/nspm.recipe +++ b/resources/recipes/nspm.recipe @@ -6,6 +6,7 @@ nspm.rs import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import NavigableString class Nspm(BasicNewsRecipe): title = 'Nova srpska politicka misao' @@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe): encoding = 'utf-8' language = 'sr' delay = 2 + remove_empty_feeds = True publication_type = 'magazine' masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg' extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe): dict(name=['link','object','embed','script','meta','base','iframe']) ,dict(attrs={'class':'buttonheading'}) ] - remove_tags_after = dict(attrs={'class':'article_separator'}) - remove_attributes = ['width','height'] + remove_tags_before = dict(attrs={'class':'contentheading'}) + remove_tags_after = dict(attrs={'class':'article_separator'}) + remove_attributes = ['width','height'] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.body.findAll(style=True): del item['style'] + for item in soup.body.findAll('h1'): + nh = NavigableString(item.a.string) + item.a.extract() + item.insert(0,nh) return self.adeify_images(soup) diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe index 312027004e..ad0d420deb 100644 --- a/resources/recipes/xkcd.recipe +++ b/resources/recipes/xkcd.recipe @@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe): (re.compile(r'()'), lambda m: '%s%s

%s

' % (m.group(1), m.group(3), m.group(2))) ] - + def parse_index(self): INDEX = 'http://xkcd.com/archive/' - soup = self.index_to_soup(INDEX) + soup = self.index_to_soup(INDEX) articles = [] for item in soup.findAll('a', title=True): articles.append({ 'date': item['title'], 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1, 'url': 'http://xkcd.com' + item['href'], - 'title': self.tag_to_string(item).encode('UTF-8'), + 'title': self.tag_to_string(item), 'description': '', 'content': '', }) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 256bcce6fc..7742a20a21 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -75,6 +75,8 @@ def line_length(format, raw, percent): linere = re.compile('(?<=)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + elif format == 'spanned_html': + linere = re.compile('(?<=)', re.DOTALL) lines = linere.findall(raw) lengths = [] @@ -223,34 +225,33 @@ class HTMLPreProcessor(object): # ˙ (re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'), - + + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags - (re.compile(r'', re.IGNORECASE), lambda match: '
'), - # Replace

with

- (re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), - - # Remove hyphenation - (re.compile(r'-\n\r?'), lambda match: ''), + (re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove gray background (re.compile(r']+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(()?)?)]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - + (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + # Cover the case where every letter in a chapter title is separated by a space + (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Connect paragraphs split by - - (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), ] # Fix Book Designer markup @@ -327,13 +328,30 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() - + + # unwrap hyphenation - moved here so it's executed after header/footer removal + if is_pdftohtml: + # unwrap visible dashes and hyphens - don't delete they are often hyphens for + # for compound words, formatting, etc + end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens + end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens with formatting + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + + # Make the more aggressive chapter marking regex optional with the preprocess option to + # reduce false positives and move after header/footer removal + if getattr(self.extra_opts, 'preprocess_html', None): + if is_pdftohtml: + end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: + # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py new file mode 100644 index 0000000000..5301f70a16 --- /dev/null +++ b/src/calibre/ebooks/conversion/utils.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import re +from calibre.ebooks.conversion.preprocess import line_length +from calibre.utils.logging import default_log + +class PreProcessor(object): + + def __init__(self, log=None): + self.log = default_log if log is None else log + self.html_preprocess_sections = 0 + self.found_indents = 0 + + def chapter_head(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_break(self, match): + chap = match.group('section') + styles = match.group('styles') + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) + return '<'+styles+' style="page-break-before:always">'+chap + + def insert_indent(self, match): + pstyle = match.group('formatting') + span = match.group('span') + self.found_indents = self.found_indents + 1 + if pstyle: + if not span: + return '<p '+pstyle+' style="text-indent:3%">' + else: + return '<p '+pstyle+' style="text-indent:3%">'+span + else: + if not span: + return '<p style="text-indent:3%">' + else: + return '<p style="text-indent:3%">'+span + + def no_markup(self, raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. + ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup") + if min_lns > tot_htm_ends: + return True + + def __call__(self, html): + self.log("********* Preprocessing HTML *********") + # Replace series of non-breaking spaces with text-indent + txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) + html = txtindent.sub(self.insert_indent, html) + if self.found_indents > 1: + self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles") + # remove remaining non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + html = re.sub(r"\s*<p>\s*", "\n<p>", html) + + # some lit files don't have any <p> tags or equivalent (generally just plain text between + # <pre> tags), check and mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic + heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") + # + # Start with most typical chapter headings, get more aggressive until one works + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + # Unwrap lines + # + self.log("Unwrapping Lines") + # Some OCR sourced files have line breaks in the html using a combination of span & p tags + # span are used for hard line breaks, p for new paragraphs. Determine which is used so + # that lines can be un-wrapped across page boundaries + paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) + spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) + paras = len(paras_reg.findall(html)) + spans = len(spans_reg.findall(html)) + if spans > 1: + if float(paras) / float(spans) < 0.75: + format = 'spanned_html' + else: + format = 'html' + else: + format = 'html' + + # Calculate Length + length = line_length(format, html, 0.4) + self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") + # + # Unwrap and/or delete soft-hyphens, hyphens + html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + + # Unwrap lines using punctation if the median length of all lines is less than 200 + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + html = unwrap.sub(' ', html) + + # If still no sections after unwrapping mark split points on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) + #self.log(html) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_break, html) + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + + return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d57bfddd3e..084d48e54b 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, html): - if not hasattr(self, 'log'): - from calibre.utils.logging import default_log - self.log = default_log - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log.debug("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) - return html + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) + diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 9bf20fb1d4..65f5c607a2 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,10 +6,9 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re - from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor + class LITInput(InputFormatPlugin): @@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) - return html + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + return preprocessor(html) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 487e70c04f..b8dc7a9560 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re from calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin): include_meta_content_type=False)) accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path + + def preprocess_html(self, html): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + return html + diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 3ae9f8ccca..c151551866 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -21,7 +21,7 @@ class Reader(FormatReader): self.options = options setattr(self.options, 'new_pdf_engine', False) setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.5) + setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 64a089281e..14b3552b04 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin): options = set([ OptionRecommendation(name='no_images', recommended_value=False, help=_('Do not extract images from the document')), - OptionRecommendation(name='unwrap_factor', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' - 'default is 0.5, this is the median line length.')), + 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index adda8794ca..000c603c1c 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -7,7 +7,7 @@ import os, glob, re, textwrap from lxml import etree from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class InlineClass(etree.XSLTExtension): @@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE) - res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', res, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - res = unwrap.sub(' ', res) + preprocessor = PreProcessor(log=getattr(self, 'log', None)) + res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) stream.seek(0) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 626c68ea63..b2ee421922 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -46,7 +46,7 @@ <double>0.010000000000000</double> </property> <property name="value"> - <double>0.500000000000000</double> + <double>0.450000000000000</double> </property> </widget> </item> diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index a70cf8b664..8aef350498 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -165,7 +165,9 @@ class Feed(object): if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: self.articles.append(article) else: - self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title)) + t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple()) + self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'% + (title, t, self.title)) d = item.get('date', '') article.formatted_date = d