From 15e6c1d212ebe9c4f1a935914ff9dccf2f93e5da Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 23 Nov 2010 13:54:45 +0800 Subject: [PATCH 01/35] blockquotes and paragraph tags also have a conflict in mobi files - lxml allows

inside blocquote, but not the other way around --- src/calibre/ebooks/conversion/utils.py | 28 ++++++--- src/calibre/ebooks/mobi/reader.py | 3 + src/calibre/utils/wordcount.py | 83 ++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 9 deletions(-) create mode 100644 src/calibre/utils/wordcount.py diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 11979b933c..8baeefcd1a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en' import re from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.utils.logging import default_log +from calibre.utils.wordcount import get_wordcount_obj class PreProcessor(object): @@ -107,7 +108,7 @@ class PreProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*[^>]*)>\s*", "\n"+">", html) - + ###### Check Markup ###### # # some lit files don't have any

tags or equivalent (generally just plain text between @@ -168,9 +169,21 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False + + # Count the words in the document to estimate how many chapters to look for + word_count_text = re.sub(r'(?s)]*>.*?', '', html) + word_count_text = re.sub(r'<[^>]*>', '', word_count_text) + wordcount = get_wordcount_obj(word_count_text) + + #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # + min_chapters = 10 + heading = re.compile(']*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") + # Build the Regular Expressions in pieces init_lookahead = "(?=<(p|div))" chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" @@ -192,12 +205,7 @@ class PreProcessor(object): n_lookahead_close = ")" default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" - - min_chapters = 10 - heading = re.compile(']*>', re.IGNORECASE) - self.html_preprocess_sections = len(heading.findall(html)) - self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") - + chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters @@ -219,9 +227,11 @@ class PreProcessor(object): else: chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close chapdetect = re.compile(r'%s' % chapter_marker, 
re.UNICODE) - + html = chapdetect.sub(self.chapter_head, html) - + + words_per_chptr = wordcount.words / self.html_preprocess_sections + print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters" ###### Unwrap lines ###### # diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index c4845f9443..309023ede9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -480,6 +480,9 @@ class MobiReader(object): # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec self.processed_html = re.sub(r'(?i)(?P(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P]*>)', '\g'+'\g', self.processed_html) self.processed_html = re.sub(r'(?i)(?P]*>)\s*(?P(\s*){1,})', '\g'+'\g', self.processed_html) + self.processed_html = re.sub(r'(?i)(?P

(]*>\s*){1,})(?P]*>)', '\g'+'\g
', self.processed_html) + self.processed_html = re.sub(r'(?i)(?P]*>)\s*(?P
(]*>\s*){1,})', '\g
'+'\g', self.processed_html) + def remove_random_bytes(self, html): return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08', diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py new file mode 100644 index 0000000000..2bc91f4014 --- /dev/null +++ b/src/calibre/utils/wordcount.py @@ -0,0 +1,83 @@ +#!/usr/bin/python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +""" +Get word, character, and Asian character counts + +1. Get a word count as a dictionary: + wc = get_wordcount(text) + words = wc['words'] # etc. + +2. Get a word count as an object + wc = get_wordcount_obj(text) + words = wc.words # etc. + +properties counted: + * characters + * chars_no_spaces + * asian_chars + * non_asian_words + * words + +Python License +""" +__version__ = 0.1 +__author__ = "Ryan Ginstrom" + +IDEOGRAPHIC_SPACE = 0x3000 + +def is_asian(char): + """Is the character Asian?""" + + # 0x3000 is ideographic space (i.e. double-byte space) + # Anything over is an Asian character + return ord(char) > IDEOGRAPHIC_SPACE + +def filter_jchars(c): + """Filters Asian characters to spaces""" + if is_asian(c): + return ' ' + return c + +def nonj_len(word): + u"""Returns number of non-Asian words in {word} + - 日本語AアジアンB -> 2 + - hello -> 1 + @param word: A word, possibly containing Asian characters + """ + # Here are the steps: + # 本spam日eggs + # -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's'] + # -> ' spam eggs' + # -> ['spam', 'eggs'] + # The length of which is 2! 
+ chars = [filter_jchars(c) for c in word] + return len(u''.join(chars).split()) + +def get_wordcount(text): + """Get the word/character count for text + + @param text: The text of the segment + """ + + characters = len(text) + chars_no_spaces = sum([not x.isspace() for x in text]) + asian_chars = sum([is_asian(x) for x in text]) + non_asian_words = nonj_len(text) + words = non_asian_words + asian_chars + + return dict(characters=characters, + chars_no_spaces=chars_no_spaces, + asian_chars=asian_chars, + non_asian_words=non_asian_words, + words=words) + +def dict2obj(dictionary): + """Transform a dictionary into an object""" + class Obj(object): + def __init__(self, dictionary): + self.__dict__.update(dictionary) + return Obj(dictionary) + +def get_wordcount_obj(text): + """Get the wordcount as an object rather than a dictionary""" + return dict2obj(get_wordcount(text)) From a1dcbb33c1c9ff12c8dbc2092ef4172c014dc827 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 29 Nov 2010 16:38:17 +0800 Subject: [PATCH 02/35] moved chapter markup to a function, tied preprocessing into word count --- src/calibre/ebooks/conversion/utils.py | 149 +++++++++++++++---------- 1 file changed, 87 insertions(+), 62 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 8baeefcd1a..c42068cfe0 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -65,7 +65,7 @@ class PreProcessor(object): inspect. Percent is the minimum percent of line endings which should be marked up to return true. ''' - htm_end_ere = re.compile('

', re.DOTALL) + htm_end_ere = re.compile('', re.DOTALL) line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) htm_end = htm_end_ere.findall(raw) line_end = line_end_ere.findall(raw) @@ -102,12 +102,93 @@ class PreProcessor(object): with open(os.path.join(odir, name), 'wb') as f: f.write(raw.encode('utf-8')) + def get_word_count(self, html): + totalwords = 0 + word_count_text = re.sub(r'(?s)]*>.*?', '', html) + word_count_text = re.sub(r'<[^>]*>', '', word_count_text) + wordcount = get_wordcount_obj(word_count_text) + return wordcount.words + + def markup_chapters(self, html, wordcount, blanks_between_paragraphs): + # Typical chapters are between 2000 and 7000 words, use the larger number to decide the + # minimum of chapters to search for + self.min_chapters = 1 + if wordcount > 7000: + self.min_chapters = wordcount / 7000 + print "minimum chapters required are: "+str(self.min_chapters) + heading = re.compile(']*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") + + # Build the Regular Expressions in pieces + init_lookahead = "(?=<(p|div))" + chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" + title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" + chapter_header_open = r"(?P" + title_header_open = r"(?P" + chapter_header_close = ")\s*" + title_header_close = ")" + chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" + title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + + if blanks_between_paragraphs: + blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" + else: + blank_lines = "" + opt_title_open = "(" + opt_title_close = ")?" + n_lookahead_open = "\s+(?!" 
+ n_lookahead_close = ")" + + default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" + + chapter_types = [ + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters + ] + + # Start with most typical chapter headings, get more aggressive until one works + for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: + if self.html_preprocess_sections >= self.min_chapters: + break + full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + if lookahead_ignorecase: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + else: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + + html = 
chapdetect.sub(self.chapter_head, html) + + words_per_chptr = wordcount + if words_per_chptr > 0 and self.html_preprocess_sections > 0: + words_per_chptr = wordcount / self.html_preprocess_sections + print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters" + + return html + + + def __call__(self, html): self.log("********* Preprocessing HTML *********") + # Count the words in the document to estimate how many chapters to look for and whether + # other types of processing are attempted + totalwords = self.get_word_count(html) + + if totalwords < 10: + print "not enough text, not preprocessing" + return html + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html) + html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html) + html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html) ###### Check Markup ###### # @@ -170,68 +251,12 @@ class PreProcessor(object): else: blanks_between_paragraphs = False - # Count the words in the document to estimate how many chapters to look for - word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html) - word_count_text = re.sub(r'<[^>]*>', '', word_count_text) - wordcount = get_wordcount_obj(word_count_text) - - #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # - min_chapters = 10 - heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) - self.html_preprocess_sections = len(heading.findall(html)) - self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") - # Build the Regular Expressions in pieces - init_lookahead = "(?=<(p|div))" - chapter_line_open = 
"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" - title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" - chapter_header_open = r"(?P<chap>" - title_header_open = r"(?P<title>" - chapter_header_close = ")\s*" - title_header_close = ")" - chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" - title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + self.markup_chapters(html, totalwords, blanks_between_paragraphs) - if blanks_between_paragraphs: - blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" - else: - blank_lines = "" - opt_title_open = "(" - opt_title_close = ")?" - n_lookahead_open = "\s+(?!" - n_lookahead_close = ")" - - default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" - - chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters - [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines - [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles - [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters - ] - - # Start with most typical chapter headings, get more aggressive until one works - for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: - if self.html_preprocess_sections >= min_chapters: - break - 
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) - if lookahead_ignorecase: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - else: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - - html = chapdetect.sub(self.chapter_head, html) - - words_per_chptr = wordcount.words / self.html_preprocess_sections - print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters" ###### Unwrap lines ###### # @@ -257,7 +282,7 @@ class PreProcessor(object): # Calculate Length unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) length = docanalysis.line_length(unwrap_factor) - self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***") + self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format") # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor if hardbreaks or unwrap_factor < 0.4: self.log("Unwrapping required, unwrapping Lines") @@ -286,7 +311,7 @@ class PreProcessor(object): html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) # If still no sections 
after unwrapping mark split points on lines with no punctuation - if self.html_preprocess_sections < 5: + if self.html_preprocess_sections < self.min_chapters: self.log("Looking for more split points based on punctuation," " currently have " + unicode(self.html_preprocess_sections)) chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) From e9f5cb683db7425c8fdf6c01523d69e085f221e4 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 10 Dec 2010 13:29:36 -0800 Subject: [PATCH 03/35] tweaked chapter markup function --- src/calibre/ebooks/conversion/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index c42068cfe0..0665cccb14 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -103,7 +103,6 @@ class PreProcessor(object): f.write(raw.encode('utf-8')) def get_word_count(self, html): - totalwords = 0 word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html) word_count_text = re.sub(r'<[^>]*>', '', word_count_text) wordcount = get_wordcount_obj(word_count_text) @@ -162,15 +161,13 @@ class PreProcessor(object): chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) else: chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect.sub(self.chapter_head, html) words_per_chptr = wordcount if words_per_chptr > 0 and 
self.html_preprocess_sections > 0: words_per_chptr = wordcount / self.html_preprocess_sections print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters" - return html @@ -180,10 +177,11 @@ class PreProcessor(object): # Count the words in the document to estimate how many chapters to look for and whether # other types of processing are attempted + totalwords = 0 totalwords = self.get_word_count(html) - if totalwords < 10: - print "not enough text, not preprocessing" + if totalwords < 20: + self.log("not enough text, not preprocessing") return html # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly @@ -255,7 +253,7 @@ class PreProcessor(object): # detect chapters/sections to match xpath or splitting logic # - self.markup_chapters(html, totalwords, blanks_between_paragraphs) + html = self.markup_chapters(html, totalwords, blanks_between_paragraphs) ###### Unwrap lines ###### From 2aa0a8d38aae8e9f8ee312954698808782b4c884 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Dec 2010 15:08:28 -0500 Subject: [PATCH 04/35] handle br tags to render in ADE --- src/calibre/ebooks/conversion/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 0665cccb14..cda9d9cbba 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -227,6 +227,8 @@ class PreProcessor(object): html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) + # ADE doesn't render <br />, change to empty paragraphs + html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html) # If more than 40% of the lines are empty 
paragraphs and the user has enabled remove # paragraph spacing then delete blank lines to clean up spacing From 13dbd42f35d3b5bc36ee3fa46a8ec2cab19e1d71 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 13 Dec 2010 16:19:50 -0500 Subject: [PATCH 05/35] Added ellipsis substitutions to the smarten punctuation option --- src/calibre/ebooks/conversion/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3ff816b3bf..bc4df4233a 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -554,5 +554,8 @@ class HTMLPreProcessor(object): html = smartyPants(html) html = html.replace(start, '<!--') html = html.replace(stop, '-->') + # convert ellipsis to entities to prevent unwrapping + html = re.sub('(?u)(?<=\w)(\.\s?){3}', '…', html) + html = re.sub('(?u)(?<=\w)\s(\.\s?){3}', ' …', html) return substitute_entites(html) From 10b3353f57cea9f3bc18d29b367acedd05832162 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Tue, 14 Dec 2010 12:16:32 -0500 Subject: [PATCH 06/35] tweaked the ellipsis substitution --- src/calibre/ebooks/conversion/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index bc4df4233a..3385771228 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -554,8 +554,8 @@ class HTMLPreProcessor(object): html = smartyPants(html) html = html.replace(start, '<!--') html = html.replace(stop, '-->') - # convert ellipsis to entities to prevent unwrapping - html = re.sub('(?u)(?<=\w)(\.\s?){3}', '…', html) - html = re.sub('(?u)(?<=\w)\s(\.\s?){3}', ' …', html) + # convert ellipsis to entities to prevent wrapping + html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) + # nbsp gets changed to space: html = 
re.sub('(?u)(?<=\w)\s(\.\s?){2}\.', ' …', html) return substitute_entites(html) From 0c2ab9e32838933e0b3731f8cca72a0e98c36730 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 17 Dec 2010 02:09:25 -0500 Subject: [PATCH 07/35] merged pdf chapter markup with preprocess markup --- src/calibre/ebooks/conversion/preprocess.py | 27 +++++++++++++++------ src/calibre/ebooks/conversion/utils.py | 18 ++++++++++++-- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3385771228..310a636022 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -364,12 +364,15 @@ class HTMLPreProcessor(object): (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + #(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), # Cover the case where every letter in a chapter title is separated by a space - (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), + #(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), - # Have paragraphs show better - (re.compile(r'<br.*?>'), lambda match : '<p>'), + # 
Convert line breaks to paragraphs + (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), + (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), + (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'), + # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -455,9 +458,9 @@ class HTMLPreProcessor(object): # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap/delete soft hyphens - end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to # reduce false positives and move after header/footer removal @@ -475,7 +478,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -508,7 +511,15 @@ class HTMLPreProcessor(object): if is_pdftohtml and length > -1: # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(html,'pdf', length) + html = dehyphenator(html,'html', length) + + if is_pdftohtml: + 
from calibre.ebooks.conversion.utils import PreProcessor + pdf_markup = PreProcessor(self.extra_opts, None) + totalwords = 0 + totalwords = pdf_markup.get_word_count(html) + if totalwords > 7000: + html = pdf_markup.markup_chapters(html, totalwords, True) #dump(html, 'post-preprocess') diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index cda9d9cbba..3fd7f88434 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -18,6 +18,9 @@ class PreProcessor(object): self.found_indents = 0 self.extra_opts = extra_opts + def is_pdftohtml(self, src): + return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + def chapter_head(self, match): chap = match.group('chap') title = match.group('title') @@ -130,6 +133,15 @@ class PreProcessor(object): chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + is_pdftohtml = self.is_pdftohtml(html) + if is_pdftohtml: + print "this is a pdf" + chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" + chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" + title_line_open = "<(?P<outer2>p)[^>]*>\s*" + title_line_close = "\s*</(?P=outer2)>" + + if blanks_between_paragraphs: blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" else: @@ -139,11 +151,13 @@ class PreProcessor(object): n_lookahead_open = "\s+(?!" 
n_lookahead_close = ")" - default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" + default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering + [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters From 24da52303d96e3417d5f347e9d0248abb9af6970 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Thu, 23 Dec 2010 13:52:13 -0500 Subject: [PATCH 08/35] added more non-ascii lower-case characters to the unwrap expressions --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 310a636022..ca74b04e8d 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -478,7 +478,7 @@ class 
HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3fd7f88434..2176f0811a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -309,7 +309,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match dehyphenator = 
Dehyphenator() From 7008e2a23a8697c6418fe56501f631fbc3e1c63d Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Tue, 28 Dec 2010 10:39:45 -0500 Subject: [PATCH 09/35] fixed some indents --- src/calibre/ebooks/conversion/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2176f0811a..56c9c9673e 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -135,11 +135,10 @@ class PreProcessor(object): is_pdftohtml = self.is_pdftohtml(html) if is_pdftohtml: - print "this is a pdf" - chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" - chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" - title_line_open = "<(?P<outer2>p)[^>]*>\s*" - title_line_close = "\s*</(?P=outer2)>" + chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*" + chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>" + title_line_open = "<(?P<outer2>p)[^>]*>\s*" + title_line_close = "\s*</(?P=outer2)>" if blanks_between_paragraphs: From 38a82b049dade612732287cd15e9716b56b5f995 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 1 Jan 2011 21:39:57 -0500 Subject: [PATCH 10/35] GUI: Editable combo box with most common character encodings instead of fully free form text entry. This still allows users to specify encodings that are not part of the common list. 
--- src/calibre/gui2/convert/__init__.py | 4 ++- src/calibre/gui2/convert/look_and_feel.ui | 22 +++++++++---- src/calibre/gui2/convert/pdb_output.ui | 13 +++++++- src/calibre/gui2/convert/pmlz_output.ui | 39 ++++++++++++++++------- src/calibre/gui2/convert/txt_output.ui | 13 +++++++- src/calibre/gui2/widgets.py | 26 +++++++++++++++ 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index c1efe5b9af..6b977afc19 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -191,7 +191,9 @@ class Widget(QWidget): if not val: val = '' getattr(g, 'setPlainText', g.setText)(val) getattr(g, 'setCursorPosition', lambda x: x)(0) - elif isinstance(g, QComboBox) and val: + elif isinstance(g, QComboBox): + if not val: + val = '' idx = g.findText(val, Qt.MatchFixedString) if idx < 0: g.addItem(val) diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index 367233e2c0..cd0426ac53 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -84,7 +84,7 @@ <string>...</string> </property> <property name="icon"> - <iconset resource="../../../../resources/images.qrc"> + <iconset> <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset> </property> <property name="iconSize"> @@ -122,14 +122,8 @@ <property name="text"> <string>Input character &encoding:</string> </property> - <property name="buddy"> - <cstring>opt_input_encoding</cstring> - </property> </widget> </item> - <item row="5" column="1" colspan="3"> - <widget class="QLineEdit" name="opt_input_encoding"/> - </item> <item row="6" column="0" colspan="2"> <widget class="QCheckBox" name="opt_remove_paragraph_spacing"> <property name="text"> @@ -244,8 +238,22 @@ </property> </widget> </item> + <item row="5" column="1" colspan="3"> + <widget class="EncodingComboBox" name="opt_input_encoding"> + <property 
name="editable"> + <bool>true</bool> + </property> + </widget> + </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources> <include location="../../../../resources/images.qrc"/> <include location="../../../../resources/images.qrc"/> diff --git a/src/calibre/gui2/convert/pdb_output.ui b/src/calibre/gui2/convert/pdb_output.ui index 17bdc0a984..a571a0035b 100644 --- a/src/calibre/gui2/convert/pdb_output.ui +++ b/src/calibre/gui2/convert/pdb_output.ui @@ -55,10 +55,21 @@ </widget> </item> <item row="1" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <widget class="EncodingComboBox" name="opt_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/convert/pmlz_output.ui b/src/calibre/gui2/convert/pmlz_output.ui index 9754752c8a..bd70cf1039 100644 --- a/src/calibre/gui2/convert/pmlz_output.ui +++ b/src/calibre/gui2/convert/pmlz_output.ui @@ -14,7 +14,7 @@ <string>Form</string> </property> <layout class="QGridLayout" name="gridLayout"> - <item row="3" column="0"> + <item row="4" column="0"> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -27,32 +27,47 @@ </property> </spacer> </item> - <item row="1" column="0"> + <item row="2" column="0"> <widget class="QCheckBox" name="opt_inline_toc"> <property name="text"> <string>&Inline TOC</string> </property> </widget> </item> - <item row="2" column="0"> + <item row="3" column="0"> <widget class="QCheckBox" name="opt_full_image_depth"> <property name="text"> <string>Do not reduce image size and depth</string> </property> </widget> 
</item> - <item row="0" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string>Output Encoding:</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <item row="1" column="0"> + <layout class="QHBoxLayout" name="horizontalLayout"> + <item> + <widget class="QLabel" name="label"> + <property name="text"> + <string>Output Encoding:</string> + </property> + </widget> + </item> + <item> + <widget class="EncodingComboBox" name="opt_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> + </item> + </layout> </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/convert/txt_output.ui b/src/calibre/gui2/convert/txt_output.ui index 6290a096c8..3a2516b98e 100644 --- a/src/calibre/gui2/convert/txt_output.ui +++ b/src/calibre/gui2/convert/txt_output.ui @@ -96,10 +96,21 @@ </widget> </item> <item row="2" column="1"> - <widget class="QLineEdit" name="opt_output_encoding"/> + <widget class="EncodingComboBox" name="opt_output_encoding"> + <property name="editable"> + <bool>true</bool> + </property> + </widget> </item> </layout> </widget> + <customwidgets> + <customwidget> + <class>EncodingComboBox</class> + <extends>QComboBox</extends> + <header>widgets.h</header> + </customwidget> + </customwidgets> <resources/> <connections/> </ui> diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py index bc3c23876f..cab2e2d4df 100644 --- a/src/calibre/gui2/widgets.py +++ b/src/calibre/gui2/widgets.py @@ -616,6 +616,32 @@ class ComboBoxWithHelp(QComboBox): QComboBox.hidePopup(self) self.set_state() + +class EncodingComboBox(QComboBox): + ''' + A combobox that holds text encodings support + by Python. 
This is only populated with the most + common and standard encodings. There is no good + way to programatically list all supported encodings + using encodings.aliases.aliases.keys(). It + will not work. + ''' + + ENCODINGS = ['', 'ascii', 'big5', 'cp1250', 'cp1251', 'cp1252', 'cp1253', + 'cp1254', 'cp1255', 'cp1256', 'euc_jp', 'euc_kr', 'gb2312', 'gb18030', + 'hz', 'iso2022_jp', 'iso2022_kr', 'iso8859_5', 'latin_1', 'shift_jis', + 'utf_8', + ] + + def __init__(self, parent=None): + QComboBox.__init__(self, parent) + self.setEditable(True) + self.setLineEdit(EnLineEdit(self)) + + for item in self.ENCODINGS: + self.addItem(item) + + class PythonHighlighter(QSyntaxHighlighter): Rules = [] From 47aeaf10b67498bb8c8c4399abe0ab60f2d0401b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 1 Jan 2011 23:03:58 -0500 Subject: [PATCH 11/35] TXT Input: Attempt to detect the input encoding when not specified. TCR, PDB Input: Use TXT Input conversion plugin for conversion, adds encoding detection and allows for all of TXT Input options to be used (eReader PDB ignores options that do not apply to it). --- src/calibre/ebooks/pdb/input.py | 14 +++++++-- src/calibre/ebooks/pdb/palmdoc/reader.py | 37 +++++++--------------- src/calibre/ebooks/pdb/ztxt/reader.py | 40 +++++++++--------------- src/calibre/ebooks/tcr/input.py | 35 +++++++++------------ src/calibre/ebooks/txt/input.py | 20 ++++++++---- 5 files changed, 67 insertions(+), 79 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 6850c48b16..9edf381f1e 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -22,13 +22,23 @@ class PDBInput(InputFormatPlugin): OptionRecommendation(name='single_line_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line represents ' - 'a paragraph instead.')), + 'a paragraph instead.
This option is ignored by eReader format.')), OptionRecommendation(name='print_formatted_paras', recommended_value=False, help=_('Normally calibre treats blank lines as paragraph markers. ' 'With this option it will assume that every line starting with ' 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + 'is reached. This option is ignored by eReader format.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed. This option ' + 'is ignored by eReader format.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text. 
' + 'This option is ignored by eReader format.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index 52b8d1361f..f1f00ea8e3 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -11,9 +11,9 @@ __docformat__ = 'restructuredtext en' import os import struct +from cStringIO import StringIO + from calibre.ebooks.pdb.formatreader import FormatReader -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted class HeaderRecord(object): ''' @@ -33,9 +33,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -48,34 +46,23 @@ class Reader(FormatReader): def decompress_text(self, number): if self.header_record.compression == 1: - return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding) + return self.section_data(number) if self.header_record.compression == 2 or self.header_record.compression == 258: from calibre.ebooks.compression.palmdoc import decompress_doc - return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return decompress_doc(self.section_data(number)) return '' def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) + raw_txt += self.decompress_text(i) self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = 
separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 5cac283264..7e51dae1fd 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -8,12 +8,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os, struct, zlib +import struct +import zlib + +from cStringIO import StringIO from calibre.ebooks.pdb.formatreader import FormatReader from calibre.ebooks.pdb.ztxt import zTXTError -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted SUPPORTED_VERSION = (1, 40) @@ -38,9 +39,7 @@ class Reader(FormatReader): def __init__(self, header, stream, log, options): self.stream = stream self.log = log - self.encoding = options.input_encoding - self.single_line_paras = options.single_line_paras - self.print_formatted_paras = options.print_formatted_paras + self.options = options self.sections = [] for i in range(header.num_sections): @@ -68,30 +67,19 @@ class Reader(FormatReader): def decompress_text(self, number): if number == 1: self.uncompressor = zlib.decompressobj() - return 
self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace') + return self.uncompressor.decompress(self.section_data(number)) def extract_content(self, output_dir): - txt = '' + raw_txt = '' self.log.info('Decompressing text...') for i in range(1, self.header_record.num_records + 1): self.log.debug('\tDecompressing text section %i' % i) - txt += self.decompress_text(i) - + raw_txt += self.decompress_text(i) + self.log.info('Converting text to OEB...') - if self.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if self.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(output_dir, 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(self.stream, 'pdb') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(output_dir, 'metadata.opf', manifest, spine, mi) - - return os.path.join(output_dir, 'metadata.opf') - + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, self.options, + 'txt', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 67fa6ac66e..47154988a0 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -4,11 +4,9 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -import os +from cStringIO import StringIO from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.ebooks.txt.processor import convert_basic, opf_writer, \ - separate_paragraphs_single_line, separate_paragraphs_print_formatted from calibre.ebooks.compression.tcr import decompress class TCRInput(InputFormatPlugin): @@ -29,26 +27,23 @@ class 
TCRInput(InputFormatPlugin): 'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'Paragraphs end when the next line that starts with an indent ' 'is reached.')), + OptionRecommendation(name='preserve_spaces', recommended_value=False, + help=_('Normally extra spaces are condensed into a single space. ' + 'With this option all spaces will be displayed.')), + OptionRecommendation(name='markdown', recommended_value=False, + help=_('Run the text input through the markdown pre-processor. To ' + 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + OptionRecommendation(name="markdown_disable_toc", recommended_value=False, + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') - ienc = options.input_encoding if options.input_encoding else 'utf-8' - txt = decompress(stream).decode(ienc, 'replace') + raw_txt = decompress(stream) log.info('Converting text to OEB...') - if options.single_line_paras: - txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: - txt = separate_paragraphs_print_formatted(txt) - html = convert_basic(txt) - with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index: - index.write(html.encode('utf-8')) - - from calibre.ebooks.metadata.meta import get_metadata - mi = get_metadata(stream, 'tcr') - manifest = [('index.html', None)] - spine = ['index.html'] - opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi) - - return os.path.join(os.getcwd(), 'metadata.opf') + stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + stream.seek(0) + return plugin_for_input_format('txt').convert(stream, options, + 'txt', log, accelerators) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 44b98304ea..1a732535b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ 
__docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces @@ -42,11 +43,19 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): - ienc = stream.encoding if stream.encoding else 'utf-8' + log.debug('Reading text from file...') + + txt = stream.read() if options.input_encoding: ienc = options.input_encoding - log.debug('Reading text from file...') - txt = stream.read().decode(ienc, 'replace') + log.debug('Using user specified input encoding of %s' % ienc) + else: + ienc = detect(txt)['encoding'] + log.debug('Detected input encoding as %s' % ienc) + if not ienc: + ienc = 'utf-8' + log.debug('No input encoding specified and could not auto detect using %s' % ienc) + txt = txt.decode(ienc, 'replace') # Adjust paragraph formatting as requested if options.single_line_paras: @@ -85,11 +94,10 @@ class TXTInput(InputFormatPlugin): htmlfile = open(fname, 'wb') with htmlfile: htmlfile.write(html.encode('utf-8')) - cwd = os.getcwdu() odi = options.debug_pipeline options.debug_pipeline = None - oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log, - {}, cwd) + oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, + {}) options.debug_pipeline = odi os.remove(htmlfile.name) return oeb From 089d3679420b087c09dce06b3ea80ac1faf194c0 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 2 Jan 2011 09:59:41 -0500 Subject: [PATCH 12/35] PDF Output: Change call to get_printer to correct get_pdf_printer. 
--- src/calibre/ebooks/pdf/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index 4ff10290c9..8938dd66c1 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -175,7 +175,7 @@ class PDFWriter(QObject): # {{{ if self.cover_data is None: return item_path = os.path.join(self.tmp_path, 'cover.pdf') - printer = self.get_printer() + printer = self.get_pdf_printer() printer.setOutputFileName(item_path) self.combine_queue.insert(0, item_path) p = QPixmap() From d9195c0632ac823e0e581e417596d1d2039aef9d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 2 Jan 2011 17:32:16 -0500 Subject: [PATCH 13/35] TXT Input: Add confidence of detected encoding to debug log. --- src/calibre/ebooks/txt/input.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 1a732535b3..5e406216d6 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -50,8 +50,9 @@ class TXTInput(InputFormatPlugin): ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) else: - ienc = detect(txt)['encoding'] - log.debug('Detected input encoding as %s' % ienc) + det_encoding = detect(txt) + ienc = det_encoding['encoding'] + log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100)) if not ienc: ienc = 'utf-8' log.debug('No input encoding specified and could not auto detect using %s' % ienc) From 9ec91639197e2e1dec38525984787b317c0296c9 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 2 Jan 2011 19:05:35 -0500 Subject: [PATCH 14/35] TXT Input: Auto detect paragraph structure. 
--- src/calibre/ebooks/pdb/input.py | 30 ++++++++--------- src/calibre/ebooks/tcr/input.py | 24 +++++++------- src/calibre/ebooks/txt/input.py | 51 ++++++++++++++++++----------- src/calibre/ebooks/txt/processor.py | 50 +++++++++++++++++++++++++++- 4 files changed, 104 insertions(+), 51 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 9edf381f1e..b8b4b93ca1 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,26 +19,22 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead. This option is ignored by eReader format.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. ' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached. This option is ignored by eReader format.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. 
' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' - 'With this option all spaces will be displayed. This option ' - 'is ignored by eReader format.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), + 'With this option all spaces will be displayed.')), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text. ' - 'This option is ignored by eReader format.')), + help=_('Do not insert a Table of Contents into the output text.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 47154988a0..47fe7e7337 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,22 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. 
To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5e406216d6..e68c47e9b3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces + preserve_spaces, detect_paragraph_formatting from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,22 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='single_line_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line represents ' - 'a paragraph instead.')), - OptionRecommendation(name='print_formatted_paras', recommended_value=False, - help=_('Normally calibre treats blank lines as paragraph markers. ' - 'With this option it will assume that every line starting with ' - 'an indent (either a tab or 2+ spaces) represents a paragraph. 
' - 'Paragraphs end when the next line that starts with an indent ' - 'is reached.')), + OptionRecommendation(name='paragraph_format', recommended_value='auto', + choices=['auto', 'block', 'single', 'print', 'markdown'], + help=_('How calibre splits text into paragraphs.\n' + 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' + '* auto: Try to auto detect paragraph format.\n' + '* block: Treat a blank line as a paragraph break.\n' + '* single: Assume every line is a paragraph.\n' + '* print: Assume every line starting with 2+ spaces or a tab ' + 'starts a paragraph.\n' + '* markdown: Run the input though the markdown pre-processor. ' + 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, help=_('Normally extra spaces are condensed into a single space. ' 'With this option all spaces will be displayed.')), - OptionRecommendation(name='markdown', recommended_value=False, - help=_('Run the text input through the markdown pre-processor. To ' - 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name="markdown_disable_toc", recommended_value=False, help=_('Do not insert a Table of Contents into the output text.')), ]) @@ -46,6 +44,7 @@ class TXTInput(InputFormatPlugin): log.debug('Reading text from file...') txt = stream.read() + # Get the encoding of the document. if options.input_encoding: ienc = options.input_encoding log.debug('Using user specified input encoding of %s' % ienc) @@ -58,17 +57,29 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Adjust paragraph formatting as requested - if options.single_line_paras: + # Determine the formatting of the document. 
+ if options.paragraph_format == 'auto': + options.paragraph_format = detect_paragraph_formatting(txt) + if options.paragraph_format == 'unknown': + log.debug('Could not reliably determine paragraph format using block format') + options.paragraph_format = 'block' + else: + log.debug('Auto detected paragraph format as %s' % options.paragraph_format) + + # We don't check for block because the processor assumes block. + # single and print at transformed to block for processing. + if options.paragraph_format == 'single': txt = separate_paragraphs_single_line(txt) - if options.print_formatted_paras: + elif options.paragraph_format == 'print': txt = separate_paragraphs_print_formatted(txt) + + txt = _ent_pat.sub(xml_entity_to_unicode, txt) + # Preserve spaces will replace multiple spaces to a space + # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) - - if options.markdown: + if options.paragraph_format == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index dac1e34df7..e1014b0c7b 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -49,7 +49,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0): if isbytestring(txt): txt = txt.decode('utf-8') - lines = [] # Split into paragraphs based on having a blank line between text. for line in txt.split('\n\n'): @@ -94,3 +93,52 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt +def detect_paragraph_formatting(txt): + ''' + Tries to determine the formatting of the document. + + block: Paragraphs are separated by a blank line. + single: Each line is a paragraph. + print: Each paragraph starts with a 2+ spaces or a tab + and ends when a new paragraph is reached. 
+ markdown: Markdown formatting is in the document. + + returns block, single, print, markdown + ''' + txt = txt.replace('\r\n', '\n') + txt = txt.replace('\r', '\n') + txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + + # Check for markdown + # Headings + if len(re.findall('(?mu)^#+', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^=+$', txt)) >= 5: + return 'markdown' + if len(re.findall('(?mu)^-+$', txt)) >= 5: + return 'markdown' + # Images + if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5: + return 'markdown' + # Links + if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5: + return 'markdown' + # Escaped characters + md_escapted_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!'] + for c in md_escapted_characters: + if txt.count('\\'+c) > 10: + return 'markdown' + + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched to assume single. + return 'single' + From 521e41973aa09d00bf3a495507b03a21e4257165 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sun, 2 Jan 2011 19:18:52 -0500 Subject: [PATCH 15/35] GUI: TXT, TCR, PDB Inputs gui conversion options updated. 
--- src/calibre/gui2/convert/pdb_input.py | 10 +++-- src/calibre/gui2/convert/pdb_input.ui | 48 --------------------- src/calibre/gui2/convert/tcr_input.py | 23 ++++++++++ src/calibre/gui2/convert/txt_input.py | 5 ++- src/calibre/gui2/convert/txt_input.ui | 60 +++++++-------------------- 5 files changed, 48 insertions(+), 98 deletions(-) delete mode 100644 src/calibre/gui2/convert/pdb_input.ui create mode 100644 src/calibre/gui2/convert/tcr_input.py diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index 4510cf81ba..655f4025a7 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- __license__ = 'GPL 3' -__copyright__ = '2009, John Schember <john@nachtimwald.com>' +__copyright__ = '2011, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' -from calibre.gui2.convert.pdb_input_ui import Ui_Form +from calibre.gui2.convert.txt_input_ui import Ui_Form from calibre.gui2.convert import Widget class PluginWidget(Widget, Ui_Form): @@ -12,10 +12,12 @@ class PluginWidget(Widget, Ui_Form): TITLE = _('PDB Input') HELP = _('Options specific to')+' PDB '+_('input') COMMIT_NAME = 'pdb_input' - ICON = I('mimetypes/unknown.png') + ICON = I('mimetypes/txt.png') def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['single_line_paras', 'print_formatted_paras']) + ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id + for x in get_option('paragraph_format').option.choices: + self.opt_paragraph_format.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/pdb_input.ui b/src/calibre/gui2/convert/pdb_input.ui deleted file mode 100644 index 2b632b1a33..0000000000 --- a/src/calibre/gui2/convert/pdb_input.ui +++ /dev/null @@ -1,48 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<ui version="4.0"> 
- <class>Form</class> - <widget class="QWidget" name="Form"> - <property name="geometry"> - <rect> - <x>0</x> - <y>0</y> - <width>400</width> - <height>300</height> - </rect> - </property> - <property name="windowTitle"> - <string>Form</string> - </property> - <layout class="QGridLayout" name="gridLayout"> - <item row="2" column="0"> - <spacer name="verticalSpacer"> - <property name="orientation"> - <enum>Qt::Vertical</enum> - </property> - <property name="sizeHint" stdset="0"> - <size> - <width>20</width> - <height>213</height> - </size> - </property> - </spacer> - </item> - <item row="0" column="0"> - <widget class="QCheckBox" name="opt_single_line_paras"> - <property name="text"> - <string>Treat each &line as a paragraph</string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QCheckBox" name="opt_print_formatted_paras"> - <property name="text"> - <string>Assume print formatting</string> - </property> - </widget> - </item> - </layout> - </widget> - <resources/> - <connections/> -</ui> diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py new file mode 100644 index 0000000000..2aa877ce4d --- /dev/null +++ b/src/calibre/gui2/convert/tcr_input.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember <john@nachtimwald.com>' +__docformat__ = 'restructuredtext en' + +from calibre.gui2.convert.txt_input_ui import Ui_Form +from calibre.gui2.convert import Widget + +class PluginWidget(Widget, Ui_Form): + + TITLE = _('TCR Input') + HELP = _('Options specific to')+' TCR '+_('input') + COMMIT_NAME = 'tcr_input' + ICON = I('mimetypes/txt.png') + + def __init__(self, parent, get_option, get_help, db=None, book_id=None): + Widget.__init__(self, parent, + ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + self.db, self.book_id = db, book_id + for x in get_option('paragraph_format').option.choices: + self.opt_paragraph_format.addItem(x) + 
self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 31019251e2..99d04fe2f4 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['single_line_paras', 'print_formatted_paras', 'markdown', - 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id + for x in get_option('paragraph_format').option.choices: + self.opt_paragraph_format.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index 186783c277..b45297fdf2 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ <rect> <x>0</x> <y>0</y> - <width>470</width> + <width>488</width> <height>300</height> </rect> </property> @@ -15,27 +15,16 @@ </property> <layout class="QGridLayout" name="gridLayout"> <item row="0" column="0"> - <widget class="QCheckBox" name="opt_single_line_paras"> + <widget class="QLabel" name="label_2"> <property name="text"> - <string>Treat each &line as a paragraph</string> + <string>Document structure detection</string> </property> </widget> </item> - <item row="1" column="0"> - <widget class="QCheckBox" name="opt_print_formatted_paras"> - <property name="text"> - <string>Assume print formatting</string> - </property> - </widget> + <item row="0" column="1"> + <widget class="QComboBox" name="opt_paragraph_format"/> </item> - <item row="2" column="0"> - <widget class="QCheckBox" name="opt_markdown"> - <property name="text"> - <string>Process using markdown</string> - </property> - </widget> - </item> - <item row="3" column="0"> + <item row="1" 
column="0" colspan="2"> <widget class="QLabel" name="label"> <property name="text"> <string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string> @@ -48,14 +37,21 @@ </property> </widget> </item> - <item row="4" column="0"> + <item row="2" column="0" colspan="2"> <widget class="QCheckBox" name="opt_markdown_disable_toc"> <property name="text"> <string>Do not insert Table of Contents into output text when using markdown</string> </property> </widget> </item> - <item row="6" column="0"> + <item row="3" column="0" colspan="2"> + <widget class="QCheckBox" name="opt_preserve_spaces"> + <property name="text"> + <string>Preserve &spaces</string> + </property> + </widget> + </item> + <item row="4" column="0" colspan="2"> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -68,32 +64,8 @@ </property> </spacer> </item> - <item row="5" column="0"> - <widget class="QCheckBox" name="opt_preserve_spaces"> - <property name="text"> - <string>Preserve &spaces</string> - </property> - </widget> - </item> </layout> </widget> <resources/> - <connections> - <connection> - <sender>opt_markdown</sender> - <signal>toggled(bool)</signal> - <receiver>opt_markdown_disable_toc</receiver> - <slot>setEnabled(bool)</slot> - <hints> - <hint type="sourcelabel"> - <x>76</x> - <y>80</y> - </hint> - <hint type="destinationlabel"> - <x>418</x> - <y>105</y> - </hint> - </hints> - </connection> - </connections> + <connections/> </ui> From a2e47dae8fc92312af7481a4546ca6fee698f7ad Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 3 Jan 2011 01:11:34 -0500 Subject: [PATCH 16/35] reordered chapter priority, added em-dashes to smarten punctuation --- src/calibre/ebooks/conversion/preprocess.py | 5 +++-- src/calibre/ebooks/conversion/utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ca74b04e8d..7f27d7a465 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -353,7 +353,7 @@ class HTMLPreProcessor(object): (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), # Center separator lines - (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), + (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), @@ -567,6 +567,7 @@ class HTMLPreProcessor(object): html = html.replace(stop, '-->') # convert ellipsis to entities to prevent wrapping html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html) - # nbsp gets changed to space: html = re.sub('(?u)(?<=\w)\s(\.\s?){2}\.', ' …', html) + # convert double dashes to em-dash + html = re.sub('\s--\s', u'\u2014', html) return substitute_entites(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 56c9c9673e..51f81978cf 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -156,9 +156,9 @@ class PreProcessor(object): [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering - [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # 
Numeric Chapters, no dot or colon [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles + [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters ] From 2427c5bdd01d9c94abd3e887dd9d1cfcc3e2f5fc Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 3 Jan 2011 20:53:41 -0500 Subject: [PATCH 17/35] FB2 Output: Fix bug #8172, Include cover page in output when it is not referenced in the oeb spine. --- src/calibre/ebooks/fb2/fb2ml.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f9ce9befb4..8d23a5f0b2 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -161,6 +161,17 @@ class FB2MLizer(object): text.append('<section>') self.section_level += 1 + # Insert the title page / cover into the spine if it is not already referenced. 
+ title_name = u'' + if 'titlepage' in self.oeb_book.guide: + title_name = 'titlepage' + elif 'cover' in self.oeb_book.guide: + title_name = 'cover' + if title_name: + title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] + if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': + self.oeb_book.spine.insert(0, title_item, True) + for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) From d23ce51b98629014b0d4ba899b89d74d9ba51812 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 5 Jan 2011 18:30:50 -0500 Subject: [PATCH 18/35] FB2 Ouput: Insert image based covers into document. --- src/calibre/ebooks/fb2/fb2ml.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 8d23a5f0b2..f6deab677a 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -16,6 +16,7 @@ import uuid from lxml import etree +from calibre import guess_type from calibre import prepare_string_for_xml from calibre.constants import __appname__, __version__ from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace @@ -171,6 +172,12 @@ class FB2MLizer(object): title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href] if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': self.oeb_book.spine.insert(0, title_item, True) + # Create xhtml page to reference cover image so it can be used. 
+ if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + id = unicode(self.oeb_book.metadata.cover[0]) + cover_item = self.oeb_book.manifest.ids[id] + if cover_item.media_type in OEB_RASTER_IMAGES: + self.insert_image_cover(cover_item.href) for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) @@ -196,6 +203,17 @@ class FB2MLizer(object): return ''.join(text) + '</body>' + def insert_image_cover(self, image_href): + from calibre.ebooks.oeb.base import RECOVER_PARSER + try: + root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER) + except: + root = etree.fromstring(u'', parser=RECOVER_PARSER) + + id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml') + item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root) + self.oeb_book.spine.insert(0, item, True) + def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. From 3bb40c9911b8cae50cedaa4490d6c5d731f8ddc3 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 5 Jan 2011 18:39:55 -0500 Subject: [PATCH 19/35] TCR, PDB (PalmDoc, zTXT) Input: Call TXT plugin while setting default values for options that are not set by calling plugin. 
--- src/calibre/ebooks/pdb/palmdoc/reader.py | 10 ++++++++-- src/calibre/ebooks/pdb/ztxt/reader.py | 10 ++++++++-- src/calibre/ebooks/tcr/input.py | 9 ++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/palmdoc/reader.py b/src/calibre/ebooks/pdb/palmdoc/reader.py index f1f00ea8e3..dd9706f00c 100644 --- a/src/calibre/ebooks/pdb/palmdoc/reader.py +++ b/src/calibre/ebooks/pdb/palmdoc/reader.py @@ -62,7 +62,13 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, self.options, - 'txt', self.log, {}) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/pdb/ztxt/reader.py b/src/calibre/ebooks/pdb/ztxt/reader.py index 7e51dae1fd..8d51c07e97 100644 --- a/src/calibre/ebooks/pdb/ztxt/reader.py +++ b/src/calibre/ebooks/pdb/ztxt/reader.py @@ -79,7 +79,13 @@ class Reader(FormatReader): self.log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, self.options, - 'txt', self.log, {}) + return txt_plugin.convert(stream, self.options, 'txt', self.log, {}) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 47fe7e7337..5f9554665b 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -41,7 +41,14 @@ class 
TCRInput(InputFormatPlugin): log.info('Converting text to OEB...') stream = StringIO(raw_txt) + from calibre.customize.ui import plugin_for_input_format + + txt_plugin = plugin_for_input_format('txt') + for option in txt_plugin.options: + if not hasattr(options, option.option.name): + setattr(options, option.name, option.recommend_val) + stream.seek(0) - return plugin_for_input_format('txt').convert(stream, options, + return txt_plugin.convert(stream, options, 'txt', log, accelerators) From dea9ae683217159626407e622848c1481c1dcbef Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Wed, 5 Jan 2011 20:03:49 -0500 Subject: [PATCH 20/35] TXT Input: Split pargarph and formatting into two different options. --- src/calibre/ebooks/pdb/input.py | 15 ++++-- src/calibre/ebooks/tcr/input.py | 15 ++++-- src/calibre/ebooks/txt/input.py | 54 ++++++++++++--------- src/calibre/ebooks/txt/processor.py | 30 ++++++------ src/calibre/gui2/convert/pdb_input.py | 8 ++-- src/calibre/gui2/convert/tcr_input.py | 8 ++-- src/calibre/gui2/convert/txt_input.py | 8 ++-- src/calibre/gui2/convert/txt_input.ui | 69 +++++++++++++++++---------- 8 files changed, 126 insertions(+), 81 deletions(-) diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index b8b4b93ca1..3688abff3f 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -19,15 +19,20 @@ class PDBInput(InputFormatPlugin): file_types = set(['pdb']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph 
type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index 5f9554665b..e4118c1c0a 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -17,15 +17,20 @@ class TCRInput(InputFormatPlugin): file_types = set(['tcr']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + help=_('Formatting used within the document.' 
+ '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e68c47e9b3..47e92a45a9 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -10,7 +10,7 @@ from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ - preserve_spaces, detect_paragraph_formatting + preserve_spaces, detect_paragraph_type, detect_formatting_type from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -21,15 +21,20 @@ class TXTInput(InputFormatPlugin): file_types = set(['txt']) options = set([ - OptionRecommendation(name='paragraph_format', recommended_value='auto', - choices=['auto', 'block', 'single', 'print', 'markdown'], - help=_('How calibre splits text into paragraphs.\n' + OptionRecommendation(name='paragraph_type', recommended_value='auto', + choices=['auto', 'block', 'single', 'print'], + help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph format.\n' + '* auto: Try to auto detect paragraph type.\n' '* block: Treat a blank line as a paragraph break.\n' '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.\n' + 'starts a paragraph.')), + OptionRecommendation(name='formatting_type', recommended_value='auto', + choices=['auto', 'none', 'markdown'], + 
help=_('Formatting used within the document.' + '* auto: Try to auto detect the document formatting.\n' + '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' '* markdown: Run the input though the markdown pre-processor. ' 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), OptionRecommendation(name='preserve_spaces', recommended_value=False, @@ -57,29 +62,16 @@ class TXTInput(InputFormatPlugin): log.debug('No input encoding specified and could not auto detect using %s' % ienc) txt = txt.decode(ienc, 'replace') - # Determine the formatting of the document. - if options.paragraph_format == 'auto': - options.paragraph_format = detect_paragraph_formatting(txt) - if options.paragraph_format == 'unknown': - log.debug('Could not reliably determine paragraph format using block format') - options.paragraph_format = 'block' - else: - log.debug('Auto detected paragraph format as %s' % options.paragraph_format) - - # We don't check for block because the processor assumes block. - # single and print at transformed to block for processing. - if options.paragraph_format == 'single': - txt = separate_paragraphs_single_line(txt) - elif options.paragraph_format == 'print': - txt = separate_paragraphs_print_formatted(txt) - txt = _ent_pat.sub(xml_entity_to_unicode, txt) # Preserve spaces will replace multiple spaces to a space # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) - if options.paragraph_format == 'markdown': + if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: html = convert_markdown(txt, disable_toc=options.markdown_disable_toc) @@ -87,6 +79,22 @@ class TXTInput(InputFormatPlugin): raise ValueError('This txt file has malformed markup, it cannot be' ' converted by calibre. 
See http://daringfireball.net/projects/markdown/syntax') else: + # Determine the paragraph type of the document. + if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # We don't check for block because the processor assumes block. + # single and print at transformed to block for processing. + if options.paragraph_type == 'single': + txt = separate_paragraphs_single_line(txt) + elif options.paragraph_type == 'print': + txt = separate_paragraphs_print_formatted(txt) + flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e1014b0c7b..f6d628e7c5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -93,7 +93,7 @@ def split_string_separator(txt, size) : xrange(0, len(txt), size)]) return txt -def detect_paragraph_formatting(txt): +def detect_paragraph_type(txt): ''' Tries to determine the formatting of the document. @@ -109,6 +109,20 @@ def detect_paragraph_formatting(txt): txt = txt.replace('\r', '\n') txt_line_count = len(re.findall('(?mu)^\s*.+$', txt)) + # Check for print + tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) + if tab_line_count / float(txt_line_count) >= .25: + return 'print' + + # Check for block + empty_line_count = len(re.findall('(?mu)^\s*$', txt)) + if empty_line_count / float(txt_line_count) >= .25: + return 'block' + + # Nothing else matched to assume single. 
+ return 'single' + +def detect_formatting_type(txt): # Check for markdown # Headings if len(re.findall('(?mu)^#+', txt)) >= 5: @@ -129,16 +143,4 @@ def detect_paragraph_formatting(txt): if txt.count('\\'+c) > 10: return 'markdown' - # Check for print - tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .25: - return 'print' - - # Check for block - empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .25: - return 'block' - - # Nothing else matched to assume single. - return 'single' - + return 'none' diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py index 655f4025a7..16ff1ff236 100644 --- a/src/calibre/gui2/convert/pdb_input.py +++ b/src/calibre/gui2/convert/pdb_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py index 2aa877ce4d..366643ad5b 100644 --- a/src/calibre/gui2/convert/tcr_input.py +++ b/src/calibre/gui2/convert/tcr_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 
'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py index 99d04fe2f4..62672cc0f9 100644 --- a/src/calibre/gui2/convert/txt_input.py +++ b/src/calibre/gui2/convert/txt_input.py @@ -16,8 +16,10 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, - ['paragraph_format', 'markdown_disable_toc', 'preserve_spaces']) + ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) self.db, self.book_id = db, book_id - for x in get_option('paragraph_format').option.choices: - self.opt_paragraph_format.addItem(x) + for x in get_option('paragraph_type').option.choices: + self.opt_paragraph_type.addItem(x) + for x in get_option('formatting_type').option.choices: + self.opt_formatting_type.addItem(x) self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui index b45297fdf2..6cbd68135f 100644 --- a/src/calibre/gui2/convert/txt_input.ui +++ b/src/calibre/gui2/convert/txt_input.ui @@ -6,7 +6,7 @@ <rect> <x>0</x> <y>0</y> - <width>488</width> + <width>518</width> <height>300</height> </rect> </property> @@ -17,41 +17,21 @@ <item row="0" column="0"> <widget class="QLabel" name="label_2"> <property name="text"> - <string>Document structure detection</string> + <string>Paragraph style:</string> </property> </widget> </item> <item row="0" column="1"> - <widget class="QComboBox" name="opt_paragraph_format"/> + 
<widget class="QComboBox" name="opt_paragraph_type"/> </item> - <item row="1" column="0" colspan="2"> - <widget class="QLabel" name="label"> - <property name="text"> - <string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - <property name="openExternalLinks"> - <bool>true</bool> - </property> - </widget> - </item> - <item row="2" column="0" colspan="2"> - <widget class="QCheckBox" name="opt_markdown_disable_toc"> - <property name="text"> - <string>Do not insert Table of Contents into output text when using markdown</string> - </property> - </widget> - </item> - <item row="3" column="0" colspan="2"> + <item row="5" column="0" colspan="2"> <widget class="QCheckBox" name="opt_preserve_spaces"> <property name="text"> <string>Preserve &spaces</string> </property> </widget> </item> - <item row="4" column="0" colspan="2"> + <item row="6" column="0" colspan="2"> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -64,6 +44,45 @@ </property> </spacer> </item> + <item row="1" column="1"> + <widget class="QComboBox" name="opt_formatting_type"/> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>Formatting style:</string> + </property> + </widget> + </item> + <item row="2" column="0" rowspan="2" colspan="2"> + <widget class="QGroupBox" name="groupBox"> + <property name="title"> + <string>Markdown Options</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout"> + <item> + <widget class="QLabel" name="label"> + <property name="text"> + <string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. 
To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string> + </property> + <property name="wordWrap"> + <bool>true</bool> + </property> + <property name="openExternalLinks"> + <bool>true</bool> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="opt_markdown_disable_toc"> + <property name="text"> + <string>Do not insert Table of Contents into output text when using markdown</string> + </property> + </widget> + </item> + </layout> + </widget> + </item> </layout> </widget> <resources/> From 760d4d2fd35b2dc4284c2798a184a89b241438b6 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 7 Jan 2011 01:48:23 +0800 Subject: [PATCH 21/35] added preface to the list of common chapter headings --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 51f81978cf..ec83d36cfc 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -153,7 +153,7 @@ class PreProcessor(object): default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" chapter_types = [ - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, 
"Searching for emphasized lines"], # Emphasized lines From 93bd1df11adc6fb33ed518fe898696f99e7ed3d1 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 7 Jan 2011 01:57:00 +0800 Subject: [PATCH 22/35] cleaned up comments --- src/calibre/ebooks/conversion/preprocess.py | 5 ----- src/calibre/utils/wordcount.py | 4 +++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 7f27d7a465..67be59083e 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -363,11 +363,6 @@ class HTMLPreProcessor(object): # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), - # Detect Chapters to match default XPATH in GUI - #(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), - # Cover the case where every letter in a chapter title is separated by a space - #(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), - # Convert line breaks to paragraphs (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'), (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'), diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py index 2bc91f4014..b317f99469 100644 --- a/src/calibre/utils/wordcount.py +++ b/src/calibre/utils/wordcount.py @@ -18,7 +18,9 @@ properties counted: * non_asian_words * words -Python License +Sourced from: +http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/ 
+http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/ """ __version__ = 0.1 __author__ = "Ryan Ginstrom" From 482c15e16ec8b7ce373c6048684b7565548fb62e Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 7 Jan 2011 02:34:52 +0800 Subject: [PATCH 23/35] removed debug statements --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ec83d36cfc..2e1ee5852e 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -117,7 +117,7 @@ class PreProcessor(object): self.min_chapters = 1 if wordcount > 7000: self.min_chapters = wordcount / 7000 - print "minimum chapters required are: "+str(self.min_chapters) + #print "minimum chapters required are: "+str(self.min_chapters) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -180,7 +180,7 @@ class PreProcessor(object): words_per_chptr = wordcount if words_per_chptr > 0 and self.html_preprocess_sections > 0: words_per_chptr = wordcount / self.html_preprocess_sections - print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters" + self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters") return html From 439b8c0f213d3b27888086b67619198c0722705f Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Fri, 7 Jan 2011 03:40:47 +0800 Subject: [PATCH 24/35] delete microsoft smart tags during preprocess --- src/calibre/ebooks/conversion/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6d2d123b10..4bb96ac088 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -234,8 +234,11 @@ class PreProcessor(object): self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) + # Get rid of various common microsoft specific tags which can cause issues later # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Delete microsoft 'smart' tags + html = re.sub('(?i)</?st1:\w+>', '', html) # Get rid of empty span, bold, & italics tags html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) From 9af7ba996f40dc4979df720f9d5bdcf36a8c14da Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 14:40:01 -0700 Subject: [PATCH 25/35] If the user specifies an unknown encoding, automatically change it to None --- src/calibre/gui2/convert/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/__init__.py b/src/calibre/gui2/convert/__init__.py index e5f72099fe..ea7a24510a 100644 --- a/src/calibre/gui2/convert/__init__.py +++ b/src/calibre/gui2/convert/__init__.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import textwrap +import textwrap, codecs from functools import partial from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, QLineEdit, QTextEdit, \ @@ -128,6 +128,7 @@ class Widget(QWidget): def get_value(self, g): from calibre.gui2.convert.xpath_wizard import XPathEdit from calibre.gui2.convert.regex_builder import RegexEdit + from calibre.gui2.widgets import EncodingComboBox ret 
= self.get_value_handler(g) if ret != 'this is a dummy return value, xcswx1avcx4x': return ret @@ -139,6 +140,13 @@ class Widget(QWidget): if not ans: ans = None return ans + elif isinstance(g, EncodingComboBox): + ans = unicode(g.currentText()).strip() + try: + codecs.lookup(ans) + except: + ans = '' + return ans elif isinstance(g, QComboBox): return unicode(g.currentText()) elif isinstance(g, QCheckBox): From df602343b4caf21493f56997b1068d03b306ca84 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 17:33:17 -0700 Subject: [PATCH 26/35] Walla by marbs --- resources/recipes/njp.recipe | 2 +- resources/recipes/walla.recipe | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 resources/recipes/walla.recipe diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe index ed202512f2..996aef2fdf 100644 --- a/resources/recipes/njp.recipe +++ b/resources/recipes/njp.recipe @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- __license__ = 'GPL v3' -__copyright__ = 'Chema Corts - 2011-01-05' +__copyright__ = u'Chema Cort\xe9s - 2011-01-05' __version__ = 'v0.01' __date__ = '2011-01-05' ''' diff --git a/resources/recipes/walla.recipe b/resources/recipes/walla.recipe new file mode 100644 index 0000000000..5fbfed7a03 --- /dev/null +++ b/resources/recipes/walla.recipe @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1283848012(BasicNewsRecipe): + description = 'The WallaNews.' 
+ cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif' + title = u'Walla' + language = 'he' + __author__ = 'marbs' + extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }' + simultaneous_downloads = 5 +# remove_javascript = True + timefmt = '[%a, %d %b, %Y]' + oldest_article = 1 + max_articles_per_feed = 100 + # remove_attributes = ['width'] + keep_only_tags =dict(name='div', attrs={'class':'wp-0-b w3'}) + remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})] + max_articles_per_feed = 100 +# preprocess_regexps = [ +# (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: '') +# ] + + + feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'), + (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'), + (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'), + (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'), + (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'), + (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'), + (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'), + (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'), + (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'), + (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'), + (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'), + (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'), + (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'), + (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')] + + def print_version(self, url): + print_url = url + '/@@/item/printer' + return print_url + From 332c80aa54f73aa25fb31a5e2dd0482560bff384 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 17:35:00 -0700 Subject: [PATCH 27/35] New London Day by Being --- resources/recipes/new_london_day.recipe | 74 +++++++++++++++++++++++++ 1 file 
changed, 74 insertions(+) create mode 100644 resources/recipes/new_london_day.recipe diff --git a/resources/recipes/new_london_day.recipe b/resources/recipes/new_london_day.recipe new file mode 100644 index 0000000000..bc8c44e40e --- /dev/null +++ b/resources/recipes/new_london_day.recipe @@ -0,0 +1,74 @@ +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1294342201(BasicNewsRecipe): + title = u'New London Day' + __author__ = 'Being' + description = 'State, local and business news from New London, CT' + language = 'en_GB' + oldest_article = 1 + max_articles_per_feed = 200 + + use_embedded_content = False + no_stylesheets = True + remove_javascript = True + remove_tags_before = dict(id='article') + remove_tags_after = dict(id='article') + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), + dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), + dict(name=['script', 'noscript', 'style'])] + remove_tags_after = [ {'class':['photo_article',]} ] + remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]}, + {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]}, + dict(name='font',attrs={'id':["cr-other-headlines"]})] + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} + .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .date {font-family:Arial,Helvetica,sans-serif; 
font-size:xx-small;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} + .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} + .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' + + feeds = [ + (u'All News', u'http://www.theday.com/section/rss'), + (u'Breaking News', u'http://www.theday.com/section/rss01'), + (u'Police and Courts', u'http://www.theday.com/section/rss02'), + (u'State News', u'http://www.theday.com/section/rss03'), + (u'Local Business', u'http://www.theday.com/section/rss04'), + (u'Entertainment', u'http://www.theday.com/section/rss05'), + (u'Opinion', u'http://www.theday.com/section/rss06'), + (u'Casinos', u'http://www.theday.com/section/rss12'), + (u'Defense and Military', u'http://www.theday.com/section/rss14'), + (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'), + (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'), + (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'), + (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),] + + def print_version(self, url): + return url.replace('/index.html', '/print.html') + + def get_article_url(self, article): + return article.get('feedburner_origlink', article.get('guid', article.get('link'))) + + + def postprocess_html(self, soup, first_fetch): + for t in soup.findAll(['table', 'tr', 'td']): + t.name = 'div' + + for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): + tag.extract() + for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): + tag.extract() + + 
return soup + From 7343d48a37227ed8e9093e2ccf4aead31aac614d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 6 Jan 2011 20:04:11 -0500 Subject: [PATCH 28/35] Remove PDB and TCR input options. TXT auto options are default and should suffice. --- src/calibre/ebooks/fb2/fb2ml.py | 2 +- src/calibre/ebooks/pdb/input.py | 24 ------------------------ src/calibre/ebooks/tcr/input.py | 24 ------------------------ src/calibre/gui2/convert/pdb_input.py | 25 ------------------------- src/calibre/gui2/convert/tcr_input.py | 25 ------------------------- 5 files changed, 1 insertion(+), 99 deletions(-) delete mode 100644 src/calibre/gui2/convert/pdb_input.py delete mode 100644 src/calibre/gui2/convert/tcr_input.py diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f6deab677a..4dd6e7c7ae 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -173,7 +173,7 @@ class FB2MLizer(object): if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml': self.oeb_book.spine.insert(0, title_item, True) # Create xhtml page to reference cover image so it can be used. 
- if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: id = unicode(self.oeb_book.metadata.cover[0]) cover_item = self.oeb_book.manifest.ids[id] if cover_item.media_type in OEB_RASTER_IMAGES: diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index 3688abff3f..8c754782a2 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -18,30 +18,6 @@ class PDBInput(InputFormatPlugin): description = 'Convert PDB to HTML' file_types = set(['pdb']) - options = set([ - OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], - help=_('Paragraph structure.\n' - 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph type.\n' - '* block: Treat a blank line as a paragraph break.\n' - '* single: Assume every line is a paragraph.\n' - '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.')), - OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], - help=_('Formatting used within the document.' - '* auto: Try to auto detect the document formatting.\n' - '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' - '* markdown: Run the input though the markdown pre-processor. ' - 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), - OptionRecommendation(name='preserve_spaces', recommended_value=False, - help=_('Normally extra spaces are condensed into a single space. 
' - 'With this option all spaces will be displayed.')), - OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): header = PdbHeaderReader(stream) diff --git a/src/calibre/ebooks/tcr/input.py b/src/calibre/ebooks/tcr/input.py index c1dcef235d..4c759c5be2 100644 --- a/src/calibre/ebooks/tcr/input.py +++ b/src/calibre/ebooks/tcr/input.py @@ -16,30 +16,6 @@ class TCRInput(InputFormatPlugin): description = 'Convert TCR files to HTML' file_types = set(['tcr']) - options = set([ - OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], - help=_('Paragraph structure.\n' - 'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n' - '* auto: Try to auto detect paragraph type.\n' - '* block: Treat a blank line as a paragraph break.\n' - '* single: Assume every line is a paragraph.\n' - '* print: Assume every line starting with 2+ spaces or a tab ' - 'starts a paragraph.')), - OptionRecommendation(name='formatting_type', recommended_value='auto', - choices=['auto', 'none', 'markdown'], - help=_('Formatting used within the document.' - '* auto: Try to auto detect the document formatting.\n' - '* none: Do not modify the paragraph formatting. Everything is a paragraph.\n' - '* markdown: Run the input though the markdown pre-processor. ' - 'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), - OptionRecommendation(name='preserve_spaces', recommended_value=False, - help=_('Normally extra spaces are condensed into a single space. 
' - 'With this option all spaces will be displayed.')), - OptionRecommendation(name="markdown_disable_toc", recommended_value=False, - help=_('Do not insert a Table of Contents into the output text.')), - ]) - def convert(self, stream, options, file_ext, log, accelerators): log.info('Decompressing text...') raw_txt = decompress(stream) diff --git a/src/calibre/gui2/convert/pdb_input.py b/src/calibre/gui2/convert/pdb_input.py deleted file mode 100644 index 16ff1ff236..0000000000 --- a/src/calibre/gui2/convert/pdb_input.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.convert.txt_input_ui import Ui_Form -from calibre.gui2.convert import Widget - -class PluginWidget(Widget, Ui_Form): - - TITLE = _('PDB Input') - HELP = _('Options specific to')+' PDB '+_('input') - COMMIT_NAME = 'pdb_input' - ICON = I('mimetypes/txt.png') - - def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) - self.db, self.book_id = db, book_id - for x in get_option('paragraph_type').option.choices: - self.opt_paragraph_type.addItem(x) - for x in get_option('formatting_type').option.choices: - self.opt_formatting_type.addItem(x) - self.initialize_options(get_option, get_help, db, book_id) diff --git a/src/calibre/gui2/convert/tcr_input.py b/src/calibre/gui2/convert/tcr_input.py deleted file mode 100644 index 366643ad5b..0000000000 --- a/src/calibre/gui2/convert/tcr_input.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- coding: utf-8 -*- - -__license__ = 'GPL 3' -__copyright__ = '2011, John Schember <john@nachtimwald.com>' -__docformat__ = 'restructuredtext en' - -from calibre.gui2.convert.txt_input_ui import Ui_Form -from calibre.gui2.convert import Widget - -class PluginWidget(Widget, Ui_Form): - - TITLE = _('TCR 
Input') - HELP = _('Options specific to')+' TCR '+_('input') - COMMIT_NAME = 'tcr_input' - ICON = I('mimetypes/txt.png') - - def __init__(self, parent, get_option, get_help, db=None, book_id=None): - Widget.__init__(self, parent, - ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces']) - self.db, self.book_id = db, book_id - for x in get_option('paragraph_type').option.choices: - self.opt_paragraph_type.addItem(x) - for x in get_option('formatting_type').option.choices: - self.opt_formatting_type.addItem(x) - self.initialize_options(get_option, get_help, db, book_id) From 1786820728f1d69d2f5c5bf2ffd4d8f50f4b0219 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 6 Jan 2011 20:07:09 -0500 Subject: [PATCH 29/35] PDB PDF Input: Dynamically set options based on PDF plugin. --- src/calibre/ebooks/pdb/pdf/reader.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index c151551866..30b0c4c57c 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -19,9 +19,6 @@ class Reader(FormatReader): self.stream = stream self.log = log self.options = options - setattr(self.options, 'new_pdf_engine', False) - setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') @@ -31,7 +28,12 @@ class Reader(FormatReader): for x in xrange(self.header.section_count()): pdf.write(self.header.section_data(x)) - from calibre.customize.ui import plugin_for_input_format - pdf.seek(0) - return plugin_for_input_format('pdf').convert(pdf, self.options, - 'pdf', self.log, []) + from calibre.customize.ui import plugin_for_input_format + + pdf_plugin = plugin_for_input_format('pdf') + for option in pdf_plugin.options: + if not hasattr(self.options, option.option.name): + setattr(self.options, 
option.name, option.recommended_value) + + pdf.seek(0) + return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {}) From b5599f8ff2a9006d4312a9c88451afaf6001e41d Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 6 Jan 2011 20:51:28 -0500 Subject: [PATCH 30/35] Fix indents. --- src/calibre/ebooks/conversion/preprocess.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3ff816b3bf..9a27274dd8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -51,16 +51,16 @@ def chap_head(match): chap = match.group('chap') title = match.group('title') if not title: - return '<h1>'+chap+'</h1><br/>\n' + return '<h1>'+chap+'</h1><br/>\n' else: - return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n' + return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n' def wrap_lines(match): ital = match.group('ital') if not ital: - return ' ' + return ' ' else: - return ital+' ' + return ital+' ' class DocAnalysis(object): ''' @@ -191,7 +191,7 @@ class Dehyphenator(object): dehyphenated = unicode(firsthalf) + unicode(secondhalf) lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: - lookupword = self.removeprefix.sub('', lookupword) + lookupword = self.removeprefix.sub('', lookupword) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) From c7332d3651a54bf3d9a5890f08af0c6de6776acb Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Thu, 6 Jan 2011 20:57:03 -0500 Subject: [PATCH 31/35] Fix indents. 
--- src/calibre/ebooks/conversion/utils.py | 40 +++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 11979b933c..a76ec8675d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -113,24 +113,24 @@ class PreProcessor(object): # some lit files don't have any <p> tags or equivalent (generally just plain text between # <pre> tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - # check if content is in pre tags, use txt processor to mark up if so - pre = re.compile(r'<pre>', re.IGNORECASE) - if len(pre.findall(html)) == 1: - self.log("Running Text Processing") - from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ - separate_paragraphs_single_line - outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL) - html = outerhtml.sub('\g<text>', html) - html = separate_paragraphs_single_line(html) - html = preserve_spaces(html) - html = convert_basic(html, epub_split_size_kb=0) - else: - # Add markup naively - # TODO - find out if there are cases where there are more than one <pre> tag or - # other types of unmarked html and handle them in some better fashion - add_markup = re.compile('(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) + self.log("not enough paragraph markers, adding now") + # check if content is in pre tags, use txt processor to mark up if so + pre = re.compile(r'<pre>', re.IGNORECASE) + if len(pre.findall(html)) == 1: + self.log("Running Text Processing") + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line + outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g<text>', html) + html = 
separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one <pre> tag or + # other types of unmarked html and handle them in some better fashion + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) ###### Mark Indents/Cleanup ###### # @@ -164,8 +164,8 @@ class PreProcessor(object): self.log("deleting blank lines") html = blankreg.sub('', html) elif float(len(blanklines)) / float(len(lines)) > 0.40: - blanks_between_paragraphs = True - #print "blanks between paragraphs is marked True" + blanks_between_paragraphs = True + #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False #self.dump(html, 'before_chapter_markup') From 6a407327118744fe93aef1b0cd45e4368ff6f017 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 20:34:23 -0700 Subject: [PATCH 32/35] ... 
--- src/calibre/gui2/book_details.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py index 8e3e8b10de..2f7892692c 100644 --- a/src/calibre/gui2/book_details.py +++ b/src/calibre/gui2/book_details.py @@ -256,8 +256,10 @@ class BookInfo(QWebView): % (left_pane, right_pane))) def mouseDoubleClickEvent(self, ev): - if self.width() - ev.x() < 25 or \ - self.height() - ev.y() < 25: + swidth = self.page().mainFrame().scrollBarGeometry(Qt.Vertical).width() + sheight = self.page().mainFrame().scrollBarGeometry(Qt.Horizontal).height() + if self.width() - ev.x() < swidth or \ + self.height() - ev.y() < sheight: # Filter out double clicks on the scroll bar ev.accept() else: From 32c1ef8ef6a964d5e42528953c35d413c5c0d9c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 20:41:38 -0700 Subject: [PATCH 33/35] E-book viewer: Fix next page skipping the bottom of chapters when the content is wider than the window. Fixes #8153 (Viewer in 0.7.36 cutting off end of chapters.) 
--- src/calibre/gui2/viewer/documentview.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index f77f23c154..13469f5622 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -449,7 +449,7 @@ class Document(QWebPage): # {{{ return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results def set_bottom_padding(self, amount): - s = QSize(-1, -1) if amount == 0 else QSize(self.width, + s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(), self.height+amount) self.setPreferredContentsSize(s) @@ -820,6 +820,7 @@ class DocumentView(QWebView): # {{{ self.flipper.initialize(self.current_page_image()) self.manager.next_document() return + #oheight = self.document.height lower_limit = opos + delta_y # Max value of top y co-ord after scrolling max_y = self.document.height - window_height # The maximum possible top y co-ord if max_y < lower_limit: @@ -835,6 +836,7 @@ class DocumentView(QWebView): # {{{ if epf: self.flipper.initialize(self.current_page_image()) #print 'Document height:', self.document.height + #print 'Height change:', (self.document.height - oheight) max_y = self.document.height - window_height lower_limit = min(max_y, lower_limit) #print 'Scroll to:', lower_limit @@ -842,6 +844,7 @@ class DocumentView(QWebView): # {{{ self.document.scroll_to(self.document.xpos, lower_limit) actually_scrolled = self.document.ypos - opos #print 'After scroll pos:', self.document.ypos + #print 'Scrolled by:', self.document.ypos - opos self.find_next_blank_line(window_height - actually_scrolled) #print 'After blank line pos:', self.document.ypos if epf: From dcb425ebbf81b068b1e4c679c060ee205d967d71 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 20:56:32 -0700 Subject: [PATCH 34/35] E-book viewer: When scrolling to a bookmark and the content is wider 
than the window, do not scroll in the horizontal direction --- resources/viewer/bookmarks.js | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js index d36e7c579a..467e4d9d38 100644 --- a/resources/viewer/bookmarks.js +++ b/resources/viewer/bookmarks.js @@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) { $.scrollTo($(bm[0]), 1000, { over:ratio, + axis: 'y', onAfter:function(){window.py_bridge.animated_scroll_done()} } ); From cff2e9b34793c11e4c2e677848f2a719e95dcfc9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Thu, 6 Jan 2011 20:57:05 -0700 Subject: [PATCH 35/35] ... --- resources/viewer/bookmarks.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js index 467e4d9d38..253524326f 100644 --- a/resources/viewer/bookmarks.js +++ b/resources/viewer/bookmarks.js @@ -41,7 +41,7 @@ function scroll_to_bookmark(bookmark) { $.scrollTo($(bm[0]), 1000, { over:ratio, - axis: 'y', + axis: 'y', // Do not scroll in the x direction onAfter:function(){window.py_bridge.animated_scroll_done()} } );