diff --git a/resources/recipes/new_london_day.recipe b/resources/recipes/new_london_day.recipe new file mode 100644 index 0000000000..bc8c44e40e --- /dev/null +++ b/resources/recipes/new_london_day.recipe @@ -0,0 +1,74 @@ +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1294342201(BasicNewsRecipe): + title = u'New London Day' + __author__ = 'Being' + description = 'State, local and business news from New London, CT' + language = 'en_GB' + oldest_article = 1 + max_articles_per_feed = 200 + + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + remove_tags_before = dict(id='article') + remove_tags_after = dict(id='article') + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), + dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), + dict(name=['script', 'noscript', 'style'])] + remove_tags_after = [ {'class':['photo_article',]} ] + remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]}, + {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]}, + dict(name='font',attrs={'id':["cr-other-headlines"]})] + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} + .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .copyright 
{font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} + .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' + + feeds = [ + (u'All News', u'http://www.theday.com/section/rss'), + (u'Breaking News', u'http://www.theday.com/section/rss01'), + (u'Police and Courts', u'http://www.theday.com/section/rss02'), + (u'State News', u'http://www.theday.com/section/rss03'), + (u'Local Business', u'http://www.theday.com/section/rss04'), + (u'Entertainment', u'http://www.theday.com/section/rss05'), + (u'Opinion', u'http://www.theday.com/section/rss06'), + (u'Casinos', u'http://www.theday.com/section/rss12'), + (u'Defense and Military', u'http://www.theday.com/section/rss14'), + (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'), + (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'), + (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'), + (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),] + + def print_version(self, url): + return url.replace('/index.html', '/print.html') + + def get_article_url(self, article): + return article.get('feedburner_origlink', article.get('guid', article.get('link'))) + + + def postprocess_html(self, soup, first_fetch): + for t in soup.findAll(['table', 'tr', 'td']): + t.name = 'div' + + for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): + tag.extract() + for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): + tag.extract() + + return soup + diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe index 
f2a427072b..996aef2fdf 100644 --- a/resources/recipes/njp.recipe +++ b/resources/recipes/njp.recipe @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = 'Chema Cort閟 - 2011-01-05' +__copyright__ = u'Chema Cort\xe9s - 2011-01-05' __version__ = 'v0.01' __date__ = '2011-01-05' ''' @@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class NewJournalOfPhysics(BasicNewsRecipe): title = u'New Journal of Physics' - __author__ = u'Chema Cort閟' + __author__ = u'Chema Cort\xe9s' description = u'The open-access journal for physics' publisher = u'IOP (Institute of Physics)' category = 'physics, journal, science' diff --git a/resources/recipes/walla.recipe b/resources/recipes/walla.recipe new file mode 100644 index 0000000000..5fbfed7a03 --- /dev/null +++ b/resources/recipes/walla.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1283848012(BasicNewsRecipe): + description = 'The WallaNews.' + cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif' + title = u'Walla' + language = 'he' + __author__ = 'marbs' + extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }' + simultaneous_downloads = 5 +# remove_javascript = True + timefmt = '[%a, %d %b, %Y]' + oldest_article = 1 + max_articles_per_feed = 100 + # remove_attributes = ['width'] + keep_only_tags =dict(name='div', attrs={'class':'wp-0-b w3'}) + remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})] + max_articles_per_feed = 100 +# preprocess_regexps = [ +# (re.compile(r'

 

', re.DOTALL|re.IGNORECASE), lambda match: '') +# ] + + + feeds = [(u'讞讚砖讜转', u'http://rss.walla.co.il/?w=/1/0/1/@rss'), + (u'注住拽讬诐', u'http://rss.walla.co.il/?w=/2/3/1/@rss'), + (u'转专讘讜转', u'http://rss.walla.co.il/?w=/4/249/1/@rss'), + (u'讘专讬讗讜转', u'http://rss.walla.co.il/?w=/5/18/1/@rss'), + (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'), + (u'讗住讟专讜诇讜讙讬讛', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'), + (u'讘注诇讬 讞讬讬诐', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'), + (u'专讻讘', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'), + (u'住诇讘住', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'), + (u'讗讜讻诇', u'http://rss.walla.co.il/?w=/9/903/1/@rss'), + (u'讗讜驻谞讛', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'), + (u'讘专谞讝讛', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'), + (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'), + (u'住驻讜专讟', u'http://rss.walla.co.il/?w=/3/7/1/@rss')] + + def print_version(self, url): + print_url = url + '/@@/item/printer' + return print_url + diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js index d36e7c579a..253524326f 100644 --- a/resources/viewer/bookmarks.js +++ b/resources/viewer/bookmarks.js @@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) { $.scrollTo($(bm[0]), 1000, { over:ratio, + axis: 'y', // Do not scroll in the x direction onAfter:function(){window.py_bridge.animated_scroll_done()} } ); diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3ff816b3bf..29006ffd9b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -51,16 +51,16 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '

'+chap+'


\n' + return '

'+chap+'


\n' else: - return '

'+chap+'

\n

'+title+'

\n' + return '

'+chap+'

\n

'+title+'

\n' def wrap_lines(match): ital = match.group('ital') if not ital: - return ' ' + return ' ' else: - return ital+' ' + return ital+' ' class DocAnalysis(object): ''' @@ -191,7 +191,7 @@ class Dehyphenator(object): dehyphenated = unicode(firsthalf) + unicode(secondhalf) lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: - lookupword = self.removeprefix.sub('', lookupword) + lookupword = self.removeprefix.sub('', lookupword) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) @@ -353,7 +353,7 @@ class HTMLPreProcessor(object): (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), # Center separator lines - (re.compile(u'
\s*(?P([*#鈥+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), + (re.compile(u'
\s*(?P([*#鈥⑩湨]+\s*)+)\s*
'), lambda match: '

\n

' + match.group(1) + '

'), # Remove page links (re.compile(r'', re.IGNORECASE), lambda match: ''), @@ -363,13 +363,11 @@ class HTMLPreProcessor(object): # Remove gray background (re.compile(r']+>'), lambda match : ''), - # Detect Chapters to match default XPATH in GUI - (re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(){0,2})\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), - # Cover the case where every letter in a chapter title is separated by a space - (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), + # Convert line breaks to paragraphs + (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), + (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), + (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'), - # Have paragraphs show better - (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!鈥"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -455,9 +453,9 @@ class HTMLPreProcessor(object): # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap/delete soft hyphens - end_rules.append((re.compile(u'[颅](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[颅](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile(u'[颅]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[颅]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to # reduce false positives and move after header/footer removal @@ -475,7 +473,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[鈥撯擼)\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-z盲毛茂枚眉脿猫矛貌霉谩膰茅铆贸艅艣煤芒锚卯么没莽膮臋偶谋,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + 
(re.compile(u'(?<=.{%i}([a-z盲毛茂枚眉脿猫矛貌霉谩膰茅铆贸艅艣煤芒锚卯么没莽膮臋偶谋茫玫帽忙酶镁冒脽,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -508,7 +506,15 @@ class HTMLPreProcessor(object): if is_pdftohtml and length > -1: # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(html,'pdf', length) + html = dehyphenator(html,'html', length) + + if is_pdftohtml: + from calibre.ebooks.conversion.utils import PreProcessor + pdf_markup = PreProcessor(self.extra_opts, None) + totalwords = 0 + totalwords = pdf_markup.get_word_count(html) + if totalwords > 7000: + html = pdf_markup.markup_chapters(html, totalwords, True) #dump(html, 'post-preprocess') @@ -554,5 +560,9 @@ class HTMLPreProcessor(object): html = smartyPants(html) html = html.replace(start, '<!--') html = html.replace(stop, '-->') + # convert ellipsis to entities to prevent wrapping + html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) + # convert double dashes to em-dash + html = re.sub('\s--\s', u'\u2014', html) return substitute_entites(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 11979b933c..1bb232c911 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' import re +from math import ceil from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.utils.logging import default_log +from calibre.utils.wordcount import get_wordcount_obj class PreProcessor(object): @@ -17,6 +19,9 @@ class PreProcessor(object): self.found_indents = 0 self.extra_opts = extra_opts + def is_pdftohtml(self, src): + return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + def chapter_head(self, match): chap = match.group('chap') title = match.group('title') @@ 
-64,7 +69,7 @@ class PreProcessor(object): inspect. Percent is the minimum percent of line endings which should be marked up to return true. ''' - htm_end_ere = re.compile('</p>', re.DOTALL) + htm_end_ere = re.compile('</(p|div)>', re.DOTALL) line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) htm_end = htm_end_ere.findall(raw) line_end = line_end_ere.findall(raw) @@ -101,36 +106,125 @@ class PreProcessor(object): with open(os.path.join(odir, name), 'wb') as f: f.write(raw.encode('utf-8')) + def get_word_count(self, html): + word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html) + word_count_text = re.sub(r'<[^>]*>', '', word_count_text) + wordcount = get_wordcount_obj(word_count_text) + return wordcount.words + + def markup_chapters(self, html, wordcount, blanks_between_paragraphs): + # Typical chapters are between 2000 and 7000 words, use the larger number to decide the + # minimum of chapters to search for + self.min_chapters = 1 + if wordcount > 7000: + self.min_chapters = int(ceil(wordcount / 7000.)) + #print "minimum chapters required are: "+str(self.min_chapters) + heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") + + # Build the Regular Expressions in pieces + init_lookahead = "(?=<(p|div))" + chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" + title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" + chapter_header_open = r"(?P<chap>" + title_header_open = r"(?P<title>" + chapter_header_close = ")\s*" + title_header_close = ")" + chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" + title_line_close = 
"(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + + is_pdftohtml = self.is_pdftohtml(html) + if is_pdftohtml: + chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" + chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" + title_line_open = "<(?P<outer2>p)[^>]*>\s*" + title_line_close = "\s*</(?P=outer2)>" + + + if blanks_between_paragraphs: + blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" + else: + blank_lines = "" + opt_title_open = "(" + opt_title_close = ")?" + n_lookahead_open = "\s+(?!" + n_lookahead_close = ")" + + default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" + + chapter_types = [ + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#鈥+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines + [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles + [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters + ] + + # Start with most typical chapter headings, get more aggressive until one works + for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: + if self.html_preprocess_sections >= self.min_chapters: + break + full_chapter_line = 
chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + if lookahead_ignorecase: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + else: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + html = chapdetect.sub(self.chapter_head, html) + + words_per_chptr = wordcount + if words_per_chptr > 0 and self.html_preprocess_sections > 0: + words_per_chptr = wordcount / self.html_preprocess_sections + self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") + return html + + + def __call__(self, html): self.log("********* Preprocessing HTML *********") + # Count the words in the document to estimate how many chapters to look for and whether + # other types of processing are attempted + totalwords = 0 + totalwords = self.get_word_count(html) + + if totalwords < 20: + self.log("not enough text, not preprocessing") + return html + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html) + html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html) + html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", 
html) ###### Check Markup ###### # # some lit files don't have any <p> tags or equivalent (generally just plain text between # <pre> tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - # check if content is in pre tags, use txt processor to mark up if so - pre = re.compile(r'<pre>', re.IGNORECASE) - if len(pre.findall(html)) == 1: - self.log("Running Text Processing") - from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ - separate_paragraphs_single_line - outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL) - html = outerhtml.sub('\g<text>', html) - html = separate_paragraphs_single_line(html) - html = preserve_spaces(html) - html = convert_basic(html, epub_split_size_kb=0) - else: - # Add markup naively - # TODO - find out if there are cases where there are more than one <pre> tag or - # other types of unmarked html and handle them in some better fashion - add_markup = re.compile('(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) + self.log("not enough paragraph markers, adding now") + # check if content is in pre tags, use txt processor to mark up if so + pre = re.compile(r'<pre>', re.IGNORECASE) + if len(pre.findall(html)) == 1: + self.log("Running Text Processing") + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line + outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g<text>', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one <pre> tag or + # other types of unmarked html and handle them in some better fashion + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) 
###### Mark Indents/Cleanup ###### # @@ -141,12 +235,17 @@ class PreProcessor(object): self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) + # Get rid of various common microsoft specific tags which can cause issues later # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Delete microsoft 'smart' tags + html = re.sub('(?i)</?st1:\w+>', '', html) # Get rid of empty span, bold, & italics tags html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) + # ADE doesn't render <br />, change to empty paragraphs + #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html) # If more than 40% of the lines are empty paragraphs and the user has enabled remove # paragraph spacing then delete blank lines to clean up spacing @@ -164,63 +263,16 @@ class PreProcessor(object): self.log("deleting blank lines") html = blankreg.sub('', html) elif float(len(blanklines)) / float(len(lines)) > 0.40: - blanks_between_paragraphs = True - #print "blanks between paragraphs is marked True" + blanks_between_paragraphs = True + #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False + #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # - # Build the Regular Expressions in pieces - init_lookahead = "(?=<(p|div))" - chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" - title_line_open = 
"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" - chapter_header_open = r"(?P<chap>" - title_header_open = r"(?P<title>" - chapter_header_close = ")\s*" - title_header_close = ")" - chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" - title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" - if blanks_between_paragraphs: - blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" - else: - blank_lines = "" - opt_title_open = "(" - opt_title_close = ")?" - n_lookahead_open = "\s+(?!" - n_lookahead_close = ")" - - default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" - - min_chapters = 10 - heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) - self.html_preprocess_sections = len(heading.findall(html)) - self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") - - chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters - [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#鈥+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines - [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles - [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters - ] - - # Start with most typical chapter headings, get more aggressive until one works - for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: - if self.html_preprocess_sections >= min_chapters: - break - full_chapter_line = 
chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) - if lookahead_ignorecase: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - else: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - - html = chapdetect.sub(self.chapter_head, html) + html = self.markup_chapters(html, totalwords, blanks_between_paragraphs) ###### Unwrap lines ###### @@ -247,7 +299,7 @@ class PreProcessor(object): # Calculate Length unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) length = docanalysis.line_length(unwrap_factor) - self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***") + self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format") # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor if hardbreaks or unwrap_factor < 0.4: self.log("Unwrapping required, unwrapping Lines") @@ -260,7 +312,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length #unwrap_quotes = 
re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-z盲毛茂枚眉脿猫矛貌霉谩膰茅铆贸艅艣煤芒锚卯么没莽膮臋偶谋,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(u"(?<=.{%i}([a-z盲毛茂枚眉脿猫矛貌霉谩膰茅铆贸艅艣煤芒锚卯么没莽膮臋偶谋茫玫帽忙酶镁冒脽,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator() @@ -276,7 +328,7 @@ class PreProcessor(object): html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) # If still no sections after unwrapping mark split points on lines with no punctuation - if self.html_preprocess_sections < 5: + if self.html_preprocess_sections < self.min_chapters: self.log("Looking for more split points based on punctuation," " currently have " + unicode(self.html_preprocess_sections)) chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#鈥+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f9ce9befb4..4dd6e7c7ae 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ 
b/src/calibre/ebooks/fb2/fb2ml.py @@ -16,6 +16,7 @@ import uuid from lxml import etree +from calibre import guess_type from calibre import prepare_string_for_xml from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -161,6 +162,23 @@ class FB2MLizer(object): text.append('<section>') self.section_level += 1 + # Insert the title page / cover into the spine if it is not already referenced. + title_name = u'' + if 'titlepage' in self.oeb_book.guide: + title_name = 'titlepage' + elif 'cover' in self.oeb_book.guide: + title_name = 'cover' + if title_name: + title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] + if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': + self.oeb_book.spine.insert(0, title_item, True) + # Create xhtml page to reference cover image so it can be used. + if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + id = unicode(self.oeb_book.metadata.cover[0]) + cover_item = self.oeb_book.manifest.ids[id] + if cover_item.media_type in OEB_RASTER_IMAGES: + self.insert_image_cover(cover_item.href) + for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) @@ -185,6 +203,17 @@ class FB2MLizer(object): return ''.join(text) + '</body>' + def insert_image_cover(self, image_href): + from calibre.ebooks.oeb.base import RECOVER_PARSER + try: + root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER) + except: + root = etree.fromstring(u'', parser=RECOVER_PARSER) + + id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') + item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) + self.oeb_book.spine.insert(0, 
item, True) + def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 6850c48b16..1b665bf94e 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.conversion.utils import PreProcessor @@ -18,19 +18,6 @@ class PDBInput(InputFormatPlugin): description = 'Convert PDB to HTML' file_types = set(['pdb']) - options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): header = PdbHeaderReader(stream) @@ -49,4 +36,4 @@ class PDBInput(InputFormatPlugin): def preprocess_html(self, options, html): self.options = options preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) - return preprocessor(html) \ No newline at end of file + return preprocessor(html) diff --git a/src/calibre/ebooks/pdb/output.py b/src/calibre/ebooks/pdb/output.py index 4e76a2d298..7bca4e5c5d 100644 --- a/src/calibre/ebooks/pdb/output.py +++ b/src/calibre/ebooks/pdb/output.py @@ -22,7 +22,7 @@ class PDBOutput(OutputFormatPlugin): short_switch='f', choices=FORMAT_WRITERS.keys(), help=(_('Format to use inside the pdb container. Choices are:')+\ ' %s' % FORMAT_WRITERS.keys())), - OptionRecommendation(name='output_encoding', recommended_value='cp1252', + OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. ' \ 'The default is cp1252. 
Note: This option is not honored by all ' \ diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 52b8d1361f..945e31559a 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -8,12 +8,11 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os import struct +from cStringIO import StringIO + from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' @@ -33,9 +32,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -48,34 +45,29 @@ class Reader(FormatReader): def decompress_text(self, number): if self.header_record.compression == 1: - return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + return self.section_data(number) if self.header_record.compression == 2 or self.header_record.compression == 258: from calibre.ebooks.compression.palmdoc import decompress_doc - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return decompress_doc(self.section_data(number)) return '' def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if 
self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(output_dir, 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.option.name, option.recommended_value) + stream.seek(0) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/palmdoc/writer.py b/src/calibre/ebooks/pdb/palmdoc/writer.py index 3f4a92fbed..5e9b77d75c 100644 --- a/src/calibre/ebooks/pdb/palmdoc/writer.py +++ b/src/calibre/ebooks/pdb/palmdoc/writer.py @@ -50,7 +50,8 @@ class Writer(FormatWriter): txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, + txt).encode(self.opts.pdb_output_encoding, 'replace') txt_length = len(txt) diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index c151551866..30b0c4c57c 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -19,9 +19,6 @@ class Reader(FormatReader): self.stream = stream self.log = log self.options = options - setattr(self.options, 'new_pdf_engine', False) - setattr(self.options, 'no_images', False) - setattr(self.options, 
'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') @@ -31,7 +28,12 @@ class Reader(FormatReader): for x in xrange(self.header.section_count()): pdf.write(self.header.section_data(x)) - from calibre.customize.ui import plugin_for_input_format - pdf.seek(0) - return plugin_for_input_format('pdf').convert(pdf, self.options, - 'pdf', self.log, []) + from calibre.customize.ui import plugin_for_input_format + + pdf_plugin = plugin_for_input_format('pdf') + for option in pdf_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.option.name, option.recommended_value) + + pdf.seek(0) + return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 5cac283264..6e7f5dd923 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,12 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import struct +import zlib + +from cStringIO import StringIO from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) @@ -38,9 +39,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -68,30 +67,25 @@ class Reader(FormatReader): def decompress_text(self, number): if number == 1: self.uncompressor = zlib.decompressobj() - return 
self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return self.uncompressor.decompress(self.section_data(number)) def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(output_dir, 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.option.name, option.recommended_value) + stream.seek(0) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/writer.py b/src/calibre/ebooks/pdb/ztxt/writer.py index ee4c5752c3..7c9056fe69 100644 --- a/src/calibre/ebooks/pdb/ztxt/writer.py +++ b/src/calibre/ebooks/pdb/ztxt/writer.py @@ -22,12 +22,12 @@ class Writer(FormatWriter): def __init__(self, opts, log): self.opts = opts self.log = log - + def write_content(self, oeb_book, out_stream, metadata=None): title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown') txt_records, txt_length = 
self._generate_text(oeb_book) - + crc32 = 0 section_lengths = [] compressor = zlib.compressobj(9) @@ -41,32 +41,33 @@ class Writer(FormatWriter): header_record = self._header_record(txt_length, len(txt_records), crc32) section_lengths.insert(0, len(header_record)) - + out_stream.seek(0) hb = PdbHeaderBuilder('zTXTGPlm', title) hb.build_header(section_lengths, out_stream) for record in [header_record]+txt_records: out_stream.write(record) - + def _generate_text(self, oeb_book): writer = TXTMLizer(self.log) txt = writer.extract_content(oeb_book, self.opts) self.log.debug('\tReplacing newlines with selected type...') - txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace') + txt = specified_newlines(TxtNewlines('windows').newline, + txt).encode(self.opts.pdb_output_encoding, 'replace') txt_length = len(txt) - + txt_records = [] for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1): txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE]) - + return txt_records, txt_length - + def _header_record(self, txt_length, record_count, crc32): record = '' - + record += struct.pack('>H', 0x012c) # [0:2], version. 0x012c = 1.44 record += struct.pack('>H', record_count) # [2:4], Number of PDB records used for the text of the book. record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book. @@ -79,6 +80,6 @@ class Writer(FormatWriter): record += struct.pack('>B', 0) # [19:20], Reserved. 
record += struct.pack('>L', crc32) # [20:24], crc32 record += struct.pack('>LL', 0, 0) # [24:32], padding - + return record - + diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 7e3729aa4a..58dc9a2138 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -28,7 +28,7 @@ class PMLOutput(OutputFormatPlugin): file_type = 'pmlz' options = set([ - OptionRecommendation(name='output_encoding', recommended_value='cp1252', + OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. ' \ 'The default is cp1252.')), @@ -48,7 +48,7 @@ class PMLOutput(OutputFormatPlugin): pmlmlizer = PMLMLizer(log) pml = unicode(pmlmlizer.extract_content(oeb_book, opts)) with open(os.path.join(tdir, 'index.pml'), 'wb') as out: - out.write(pml.encode(opts.output_encoding, 'replace')) + out.write(pml.encode(opts.pml_output_encoding, 'replace')) self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 67fa6ac66e..aac72da7a8 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -4,11 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os +from cStringIO import StringIO -from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted +from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -18,37 +16,20 @@ class TCRInput(InputFormatPlugin): description = 'Convert TCR files to HTML' file_types = set(['tcr']) - options = set([ - 
OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = decompress(stream).decode(ienc, 'replace') + raw_txt = decompress(stream) log.info('Converting text to OEB...') - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) + stream = StringIO(raw_txt) - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(stream, 'tcr') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) + from calibre.customize.ui import plugin_for_input_format - return os.path.join(os.getcwd(), 'metadata.opf') + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(options, option.option.name): + setattr(options, option.option.name, option.recommended_value) + + stream.seek(0) + return txt_plugin.convert(stream, options, + 'txt', log, accelerators) diff --git a/src/calibre/ebooks/tcr/output.py b/src/calibre/ebooks/tcr/output.py index 3ca82730cc..97c9cae26c 100644 --- a/src/calibre/ebooks/tcr/output.py +++ 
b/src/calibre/ebooks/tcr/output.py @@ -18,7 +18,7 @@ class TCROutput(OutputFormatPlugin): file_type = 'tcr' options = set([ - OptionRecommendation(name='output_encoding', recommended_value='utf-8', + OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. ' \ 'The default is utf-8.')), @@ -40,7 +40,7 @@ class TCROutput(OutputFormatPlugin): setattr(opts, 'indent_paras', False) writer = TXTMLizer(log) - txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace') + txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace') log.info('Compressing text...') txt = compress(txt) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 44b98304ea..47e92a45a9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces + preserve_spaces, detect_paragraph_type, detect_formatting_type from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -20,45 +21,57 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. 
' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\']\n' + '* auto: Try to auto detect paragraph type.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.\n' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' + '* markdown: Run the input through the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): - ienc = stream.encoding if stream.encoding else 'utf-8' + log.debug('Reading text from file...') + + txt = stream.read() + # Get the encoding of the document. 
if options.input_encoding: ienc = options.input_encoding - log.debug('Reading text from file...') - txt = stream.read().decode(ienc, 'replace') - - # Adjust paragraph formatting as requested - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - if options.preserve_spaces: - txt = preserve_spaces(txt) + log.debug('Using user specified input encoding of %s' % ienc) + else: + det_encoding = detect(txt) + ienc = det_encoding['encoding'] + log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) + if not ienc: + ienc = 'utf-8' + log.debug('No input encoding specified and could not auto detect using %s' % ienc) + txt = txt.decode(ienc, 'replace') txt = _ent_pat.sub(xml_entity_to_unicode, txt) + # Preserve spaces will replace multiple spaces to a space + # followed by the   entity. + if options.preserve_spaces: + txt = preserve_spaces(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) - if options.markdown: + if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) @@ -66,6 +79,22 @@ class TXTInput(InputFormatPlugin): raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. See http://daringfireball.net/projects/markdown/syntax') else: + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # We don't check for block because the processor assumes block. 
+ # single and print at transformed to block for processing. + if options.paragraph_type == 'single': + txt = separate_paragraphs_single_line(txt) + elif options.paragraph_type == 'print': + txt = separate_paragraphs_print_formatted(txt) + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) @@ -85,11 +114,10 @@ class TXTInput(InputFormatPlugin): htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) - cwd = os.getcwdu() odi = options.debug_pipeline options.debug_pipeline = None - oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, - {}, cwd) + oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, + {}) options.debug_pipeline = odi os.remove(htmlfile.name) return oeb diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 0e077672d8..4d0d176fe4 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -26,7 +26,7 @@ class TXTOutput(OutputFormatPlugin): 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())), - OptionRecommendation(name='output_encoding', recommended_value='utf-8', + OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8', level=OptionRecommendation.LOW, help=_('Specify the character encoding of the output document. 
' \ 'The default is utf-8.')), @@ -64,7 +64,7 @@ class TXTOutput(OutputFormatPlugin): writer = MarkdownMLizer(log) else: writer = TXTMLizer(log) - + txt = writer.extract_content(oeb_book, opts) log.debug('\tReplacing newlines with selected type...') @@ -81,7 +81,7 @@ class TXTOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(txt.encode(opts.output_encoding, 'replace')) + out_stream.write(txt.encode(opts.txt_output_encoding, 'replace')) if close: out_stream.close() diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 50d8419110..2fbf1a384a 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -48,7 +48,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') - lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -93,3 +92,54 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt +def detect_paragraph_type(txt): + ''' + Tries to determine the paragraph structure of the document. + + block: Paragraphs are separated by a blank line. + single: Each line is a paragraph. + print: Each paragraph starts with 2+ spaces or a tab + and ends when a new paragraph is reached. + A document with no text lines at all is reported as block. + + returns block, single or print + ''' + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt_line_count = max(1, len(re.findall('(?mu)^\s*.+$', txt))) + + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched so assume single. 
+ return 'single' + +def detect_formatting_type(txt): + # Check for markdown + # Headings + if len(re.findall('(?mu)^#+', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^=+$', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^-+$', txt)) >= 5: + return 'markdown' + # Images + if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: + return 'markdown' + # Links + if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5: + return 'markdown' + # Escaped characters + md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!'] + for c in md_escapted_characters: + if txt.count('\\'+c) > 10: + return 'markdown' + + return 'none' diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py index 8e3e8b10de..2f7892692c 100644 --- a/src/calibre/gui2/book_details.py +++ b/src/calibre/gui2/book_details.py @@ -256,8 +256,10 @@ class BookInfo(QWebView): % (left_pane, right_pane))) def mouseDoubleClickEvent(self, ev): - if self.width() - ev.x() < 25 or \ - self.height() - ev.y() < 25: + swidth = self.page().mainFrame().scrollBarGeometry(Qt.Vertical).width() + sheight = self.page().mainFrame().scrollBarGeometry(Qt.Horizontal).height() + if self.width() - ev.x() < swidth or \ + self.height() - ev.y() < sheight: # Filter out double clicks on the scroll bar ev.accept() else: diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index 1557ce8939..ea7a24510a 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import textwrap +import textwrap, codecs from functools import partial from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, QLineEdit, QTextEdit, \ @@ -128,6 +128,7 @@ class Widget(QWidget): def get_value(self, g): from calibre.gui2.convert.xpath_wizard import XPathEdit from 
calibre.gui2.convert.regex_builder import RegexEdit + from calibre.gui2.widgets import EncodingComboBox ret = self.get_value_handler(g) if ret != 'this is a dummy return value, xcswx1avcx4x': return ret @@ -139,6 +140,13 @@ class Widget(QWidget): if not ans: ans = None return ans + elif isinstance(g, EncodingComboBox): + ans = unicode(g.currentText()).strip() + try: + codecs.lookup(ans) + except: + ans = '' + return ans elif isinstance(g, QComboBox): return unicode(g.currentText()) elif isinstance(g, QCheckBox): @@ -192,6 +200,11 @@ class Widget(QWidget): if not val: val = '' getattr(g, 'setPlainText', g.setText)(val) getattr(g, 'setCursorPosition', lambda x: x)(0) + elif isinstance(g, EncodingComboBox): + if val: + g.setEditText(val) + else: + g.setCurrentIndex(0) elif isinstance(g, QComboBox) and val: idx = g.findText(val, Qt.MatchFixedString) if idx < 0: @@ -202,8 +215,6 @@ class Widget(QWidget): g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked) elif isinstance(g, (XPathEdit, RegexEdit)): g.edit.setText(val if val else '') - elif isinstance(g, EncodingComboBox): - g.setEditText(val if val else '') else: raise Exception('Can\'t set value %s in %s'%(repr(val), unicode(g.objectName()))) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py deleted file mode 100644 index 4510cf81ba..0000000000 --- a/src/calibre/gui2/convert/pdb_input.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2009, John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.convert.pdb_input_ui import Ui_Form -from calibre.gui2.convert import Widget - -class PluginWidget(Widget, Ui_Form): - - TITLE = _('PDB Input') - HELP = _('Options specific to')+' PDB '+_('input') - COMMIT_NAME = 'pdb_input' - ICON = I('mimetypes/unknown.png') - - def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, - 
['single_line_paras', 'print_formatted_paras']) - self.db, self.book_id = db, book_id - self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui deleted file mode 100644 index 2b632b1a33..0000000000 --- a/src/calibre/gui2/convert/pdb_input.ui +++ /dev/null @@ -1,48 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<ui version="4.0"> - <class>Form</class> - <widget class="QWidget" name="Form"> - <property name="geometry"> - <rect> - <x>0</x> - <y>0</y> - <width>400</width> - <height>300</height> - </rect> - </property> - <property name="windowTitle"> - <string>Form</string> - </property> - <layout class="QGridLayout" name="gridLayout"> - <item row="2" column="0"> - <spacer name="verticalSpacer"> - <property name="orientation"> - <enum>Qt::Vertical</enum> - </property> - <property name="sizeHint" stdset="0"> - <size> - <width>20</width> - <height>213</height> - </size> - </property> - </spacer> - </item> - <item row="0" column="0"> - <widget class="QCheckBox" name="opt_single_line_paras"> - <property name="text"> - <string>Treat each &line as a paragraph</string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QCheckBox" name="opt_print_formatted_paras"> - <property name="text"> - <string>Assume print formatting</string> - </property> - </widget> - </item> - </layout> - </widget> - <resources/> - <connections/> -</ui> diff --git a/src/calibre/gui2/convert/pdb_output.py b/src/calibre/gui2/convert/pdb_output.py index 51c202cb03..ec6b7abb08 100644 --- a/src/calibre/gui2/convert/pdb_output.py +++ b/src/calibre/gui2/convert/pdb_output.py @@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form): ICON = I('mimetypes/unknown.png') def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, ['format', 'inline_toc', 'output_encoding']) + Widget.__init__(self, parent, ['format', 'inline_toc', 
'pdb_output_encoding']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pdb_output.ui b/src/calibre/gui2/convert/pdb_output.ui index 17bdc0a984..fcca83cc2e 100644 --- a/src/calibre/gui2/convert/pdb_output.ui +++ b/src/calibre/gui2/convert/pdb_output.ui @@ -55,10 +55,21 @@ </widget> </item> <item row="1" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <widget class="EncodingComboBox" name="opt_pdb_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/convert/pml_output.py b/src/calibre/gui2/convert/pml_output.py index f7905194ca..56197ecde0 100644 --- a/src/calibre/gui2/convert/pml_output.py +++ b/src/calibre/gui2/convert/pml_output.py @@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, ['inline_toc', 'full_image_depth', - 'output_encoding']) + 'pml_output_encoding']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pmlz_output.ui b/src/calibre/gui2/convert/pmlz_output.ui index 9754752c8a..162cfbb831 100644 --- a/src/calibre/gui2/convert/pmlz_output.ui +++ b/src/calibre/gui2/convert/pmlz_output.ui @@ -14,7 +14,7 @@ <string>Form</string> </property> <layout class="QGridLayout" name="gridLayout"> - <item row="3" column="0"> + <item row="4" column="0"> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -27,32 +27,47 @@ </property> </spacer> </item> - <item row="1" column="0"> + <item row="2" column="0"> <widget class="QCheckBox" 
name="opt_inline_toc"> <property name="text"> <string>&Inline TOC</string> </property> </widget> </item> - <item row="2" column="0"> + <item row="3" column="0"> <widget class="QCheckBox" name="opt_full_image_depth"> <property name="text"> <string>Do not reduce image size and depth</string> </property> </widget> </item> - <item row="0" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string>Output Encoding:</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <item row="1" column="0"> + <layout class="QHBoxLayout" name="horizontalLayout"> + <item> + <widget class="QLabel" name="label"> + <property name="text"> + <string>Output Encoding:</string> + </property> + </widget> + </item> + <item> + <widget class="EncodingComboBox" name="opt_pml_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> + </item> + </layout> </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 31019251e2..62672cc0f9 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,7 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['single_line_paras', 'print_formatted_paras', 'markdown', - 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) 
self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 186783c277..6cbd68135f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ <rect> <x>0</x> <y>0</y> - <width>470</width> + <width>518</width> <height>300</height> </rect> </property> @@ -15,47 +15,23 @@ </property> <layout class="QGridLayout" name="gridLayout"> <item row="0" column="0"> - <widget class="QCheckBox" name="opt_single_line_paras"> + <widget class="QLabel" name="label_2"> <property name="text"> - <string>Treat each &line as a paragraph</string> + <string>Paragraph style:</string> </property> </widget> </item> - <item row="1" column="0"> - <widget class="QCheckBox" name="opt_print_formatted_paras"> + <item row="0" column="1"> + <widget class="QComboBox" name="opt_paragraph_type"/> + </item> + <item row="5" column="0" colspan="2"> + <widget class="QCheckBox" name="opt_preserve_spaces"> <property name="text"> - <string>Assume print formatting</string> + <string>Preserve &spaces</string> </property> </widget> </item> - <item row="2" column="0"> - <widget class="QCheckBox" name="opt_markdown"> - <property name="text"> - <string>Process using markdown</string> - </property> - </widget> - </item> - <item row="3" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. 
To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - <property name="openExternalLinks"> - <bool>true</bool> - </property> - </widget> - </item> - <item row="4" column="0"> - <widget class="QCheckBox" name="opt_markdown_disable_toc"> - <property name="text"> - <string>Do not insert Table of Contents into output text when using markdown</string> - </property> - </widget> - </item> - <item row="6" column="0"> + <item row="6" column="0" colspan="2"> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -68,32 +44,47 @@ </property> </spacer> </item> - <item row="5" column="0"> - <widget class="QCheckBox" name="opt_preserve_spaces"> + <item row="1" column="1"> + <widget class="QComboBox" name="opt_formatting_type"/> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_3"> <property name="text"> - <string>Preserve &spaces</string> + <string>Formatting style:</string> </property> </widget> </item> + <item row="2" column="0" rowspan="2" colspan="2"> + <widget class="QGroupBox" name="groupBox"> + <property name="title"> + <string>Markdown Options</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout"> + <item> + <widget class="QLabel" name="label"> + <property name="text"> + <string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. 
To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string> + </property> + <property name="wordWrap"> + <bool>true</bool> + </property> + <property name="openExternalLinks"> + <bool>true</bool> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="opt_markdown_disable_toc"> + <property name="text"> + <string>Do not insert Table of Contents into output text when using markdown</string> + </property> + </widget> + </item> + </layout> + </widget> + </item> </layout> </widget> <resources/> - <connections> - <connection> - <sender>opt_markdown</sender> - <signal>toggled(bool)</signal> - <receiver>opt_markdown_disable_toc</receiver> - <slot>setEnabled(bool)</slot> - <hints> - <hint type="sourcelabel"> - <x>76</x> - <y>80</y> - </hint> - <hint type="destinationlabel"> - <x>418</x> - <y>105</y> - </hint> - </hints> - </connection> - </connections> + <connections/> </ui> diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py index 9f30e0d83f..9a228bd4cf 100644 --- a/src/calibre/gui2/convert/txt_output.py +++ b/src/calibre/gui2/convert/txt_output.py @@ -22,7 +22,7 @@ class PluginWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['newline', 'max_line_length', 'force_max_line_length', 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references', - 'output_encoding']) + 'txt_output_encoding']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 6290a096c8..57fe702db7 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -96,10 +96,21 @@ </widget> </item> <item row="2" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <widget class="EncodingComboBox" name="opt_txt_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> 
</item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index f77f23c154..13469f5622 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -449,7 +449,7 @@ class Document(QWebPage): # {{{ return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results def set_bottom_padding(self, amount): - s = QSize(-1, -1) if amount == 0 else QSize(self.width, + s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(), self.height+amount) self.setPreferredContentsSize(s) @@ -820,6 +820,7 @@ class DocumentView(QWebView): # {{{ self.flipper.initialize(self.current_page_image()) self.manager.next_document() return + #oheight = self.document.height lower_limit = opos + delta_y # Max value of top y co-ord after scrolling max_y = self.document.height - window_height # The maximum possible top y co-ord if max_y < lower_limit: @@ -835,6 +836,7 @@ class DocumentView(QWebView): # {{{ if epf: self.flipper.initialize(self.current_page_image()) #print 'Document height:', self.document.height + #print 'Height change:', (self.document.height - oheight) max_y = self.document.height - window_height lower_limit = min(max_y, lower_limit) #print 'Scroll to:', lower_limit @@ -842,6 +844,7 @@ class DocumentView(QWebView): # {{{ self.document.scroll_to(self.document.xpos, lower_limit) actually_scrolled = self.document.ypos - opos #print 'After scroll pos:', self.document.ypos + #print 'Scrolled by:', self.document.ypos - opos self.find_next_blank_line(window_height - actually_scrolled) #print 'After blank line pos:', self.document.ypos if epf: diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py new file mode 100644 index 
#!/usr/bin/python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
"""
Get word, character, and Asian character counts

1. Get a word count as a dictionary:
    wc = get_wordcount(text)
    words = wc['words'] # etc.

2. Get a word count as an object
    wc = get_wordcount_obj(text)
    words = wc.words # etc.

properties counted:
    * characters
    * chars_no_spaces
    * asian_chars
    * non_asian_words
    * words

Sourced from:
http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
"""
__version__ = 0.1
__author__ = "Ryan Ginstrom"

# U+3000 IDEOGRAPHIC SPACE (the "double-byte" space); used as the threshold
# above which a character is treated as Asian.
IDEOGRAPHIC_SPACE = 0x3000


def is_asian(char):
    """Return True if the single character {char} is counted as Asian.

    The test is a plain code point threshold: any character whose ordinal
    is strictly greater than IDEOGRAPHIC_SPACE (0x3000) is treated as Asian.
    The ideographic space itself is therefore not Asian.

    NOTE: this is a heuristic. Every code point above U+3000 matches, not
    only CJK ideographs and kana.

    @param char: A single character
    """
    return ord(char) > IDEOGRAPHIC_SPACE


def filter_jchars(c):
    """Return a space if {c} is an Asian character, otherwise {c} unchanged

    @param c: A single character
    """
    if is_asian(c):
        return ' '
    return c


def nonj_len(word):
    u"""Returns number of non-Asian words in {word}
    - 日本語AアジアンB -> 2
    - hello -> 1
    @param word: A word, possibly containing Asian characters
    """
    # Here are the steps:
    # 本spam日eggs
    # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
    # -> ' spam eggs'
    # -> ['spam', 'eggs']
    # The length of which is 2!
    blanked = u''.join(filter_jchars(c) for c in word)
    return len(blanked.split())


def get_wordcount(text):
    """Get the word/character count for text

    Every Asian character counts as one word; runs of non-Asian,
    non-whitespace characters count as one word each.

    @param text: The text of the segment
    @return: A dict with the integer keys ``characters``,
        ``chars_no_spaces``, ``asian_chars``, ``non_asian_words``
        and ``words``
    """
    characters = len(text)
    # Summing booleans counts the characters for which the test is True.
    chars_no_spaces = sum(not x.isspace() for x in text)
    asian_chars = sum(is_asian(x) for x in text)
    non_asian_words = nonj_len(text)
    words = non_asian_words + asian_chars

    return dict(characters=characters,
                chars_no_spaces=chars_no_spaces,
                asian_chars=asian_chars,
                non_asian_words=non_asian_words,
                words=words)


def dict2obj(dictionary):
    """Transform a dictionary into an object

    Each key of {dictionary} becomes an attribute of the returned object.

    @param dictionary: A mapping of attribute names to values
    @return: An object exposing the mapping's items as attributes
    """
    class Obj(object):
        def __init__(self, dictionary):
            # Copy the items straight into the instance namespace.
            self.__dict__.update(dictionary)
    return Obj(dictionary)


def get_wordcount_obj(text):
    """Get the wordcount as an object rather than a dictionary

    @param text: The text of the segment
    @return: An object with the same fields as the get_wordcount() dict
    """
    return dict2obj(get_wordcount(text))