From 0ed2f3fceb3cc4cc06319d199109b7647e5c9af4 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 1 Oct 2010 17:45:47 +0800 Subject: [PATCH 01/32] partial/potential fix for mobi problem --- src/calibre/ebooks/mobi/mobiml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 231ad51eee..31b1ac5834 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -188,7 +188,7 @@ class MobiMLizer(object): para = wrapper emleft = int(round(left / self.profile.fbase)) - 1 emleft = min((emleft, 10)) - while emleft > 0: + while emleft > 1: para = etree.SubElement(para, XHTML('blockquote')) emleft -= 1 else: From 3900216da0896ff1d372714f074ea71e39885054 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 1 Oct 2010 19:17:32 +0800 Subject: [PATCH 02/32] revert mobi change --- src/calibre/ebooks/mobi/mobiml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 31b1ac5834..231ad51eee 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -188,7 +188,7 @@ class MobiMLizer(object): para = wrapper emleft = int(round(left / self.profile.fbase)) - 1 emleft = min((emleft, 10)) - while emleft > 1: + while emleft > 0: para = etree.SubElement(para, XHTML('blockquote')) emleft -= 1 else: From 4a044b8e9d6b5f0168ef4e65d6a3e9aa47f182b4 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 4 Oct 2010 16:16:33 +0800 Subject: [PATCH 03/32] small tweak --- src/calibre/ebooks/conversion/utils.py | 8 +++++--- src/calibre/ebooks/mobi/mobiml.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2faec27b68..976ed6a8f4 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -146,7 +146,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") + self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces @@ -166,13 +166,13 @@ class PreProcessor(object): title_line_close = "()?\s*()?\s*(]*>)?\s*" opt_title_close = ")?" - default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)" + default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print chapter_marker + print chapter_marker heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -184,12 +184,14 @@ class PreProcessor(object): if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) ###### Unwrap lines ###### diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 231ad51eee..d4801e637e 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -184,12 +184,12 @@ class MobiMLizer(object): elif tag in NESTABLE_TAGS and istate.rendered: para = wrapper = bstate.nested[-1] elif left > 0 and indent >= 0: - para = wrapper = etree.SubElement(parent, XHTML('blockquote')) + para = wrapper = etree.SubElement(parent, XHTML('div')) para = wrapper emleft = int(round(left / self.profile.fbase)) - 1 emleft = min((emleft, 10)) while emleft > 0: - para = etree.SubElement(para, XHTML('blockquote')) + para = etree.SubElement(para, XHTML('div')) emleft -= 1 else: para = wrapper = etree.SubElement(parent, XHTML('p')) From b45dc837830b0e4f61b9cc19dfcc5f214589eb83 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 11 Oct 2010 00:14:00 +1000 Subject: [PATCH 04/32] preprocessing tweaks, fixed division by zero error in line_histogram --- src/calibre/ebooks/conversion/preprocess.py | 2 ++ src/calibre/ebooks/conversion/utils.py | 6 +++++- src/calibre/ebooks/pdb/input.py | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c5ebae4bba..de01188829 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -144,6 +144,8 @@ class DocAnalysis(object): # Normalize the histogram into percents totalLines = len(self.lines) + if totalLines == 0: + return False h = [ float(count)/totalLines for count in hRaw ] #print "\nhRaw histogram lengths are: "+str(hRaw) #print " percents are: "+str(h)+"\n" diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 976ed6a8f4..a01c29f2fb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -146,7 +146,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces @@ -230,6 +230,7 @@ class PreProcessor(object): html = dehyphenator(html,'html', length) self.log("Done dehyphenating") # Unwrap lines using punctation and line length + unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match @@ -259,5 +260,8 @@ class PreProcessor(object): # put back non-breaking spaces in empty paragraphs to preserve original formatting html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) + + # Center separator lines + html = re.sub(u'

\s*(?P([*#•]+\s*)+)\s*

', '

' + '\g' + '

', html) return html diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index dfe5b653dd..6850c48b16 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -9,6 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader +from calibre.ebooks.conversion.utils import PreProcessor class PDBInput(InputFormatPlugin): @@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin): opf = reader.extract_content(os.getcwd()) return opf + + def preprocess_html(self, options, html): + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) + return preprocessor(html) \ No newline at end of file From 87b615f81f694f2dffa23a07afe6e87d6e90497f Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 18 Oct 2010 04:55:53 +0800 Subject: [PATCH 05/32] added new chapter heading type --- src/calibre/ebooks/conversion/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index a01c29f2fb..6002509013 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -154,7 +154,7 @@ class PreProcessor(object): chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" chapter_header_open = r"(?P" chapter_header_close = ")\s*" - chapter_line_close = "()?\s*()?\s*(]*>)?\s*\s*" + chapter_line_close = "()?\s*()?\s*()?\s*\s*" if blanks_between_paragraphs: blank_lines = "(\s*]*>\s*

){0,2}\s*" else: @@ -170,6 +170,7 @@ class PreProcessor(object): typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" + numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close print chapter_marker @@ -194,6 +195,14 @@ class PreProcessor(object): print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) + + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") + chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + print chapter_marker + chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + html = chapdetect2.sub(self.chapter_head, html) + ###### Unwrap lines ###### # # Some OCR sourced files have line breaks in the html using a combination of span & p tags From ccb683ef09ded0d708f8eeda5269c2be6a1b3ba7 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 18 Oct 2010 09:02:07 +0800 Subject: [PATCH 06/32] added unicode hyphens to dehyphenation function --- src/calibre/ebooks/conversion/preprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index de01188829..4a77f58df4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -219,13 +219,13 @@ class Dehyphenator(object): self.html = html self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) elif format == 'pdf': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') html = intextmatch.sub(self.dehyphenate, html) return html From f31a055135908f2d79efa4fd3c176a5a5e9e9a52 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 16 Nov 2010 14:39:53 +0800 Subject: [PATCH 07/32] work in progress on chapter detection improvements --- src/calibre/ebooks/conversion/utils.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6002509013..724428fec0 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -165,6 +165,8 @@ class PreProcessor(object): title_header_close = ")\s*" title_line_close = "()?\s*()?\s*(]*>)?\s*" opt_title_close = ")?" + n_lookahead_open = "(?!=" + n_lookahead_close = ")" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" @@ -172,7 +174,11 @@ class PreProcessor(object): uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" - chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + print "n_lookahead is " + n_lookahead + print "Chapter line is " + full_chapter_line + "\n\n" + chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close print chapter_marker heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -184,21 +190,33 @@ class PreProcessor(object): html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") - chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + print "n_lookahead is " + n_lookahead + print "Chapter line is " + full_chapter_line + "\n\n" + chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") - chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + print "n_lookahead is " + n_lookahead + print "Chapter line is " + full_chapter_line + "\n\n" + chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") - chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + print "n_lookahead is " + n_lookahead + print "Chapter line is " + full_chapter_line + "\n\n" + chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) From f7e99d2e86fbcae5fbdae3428905836256fae687 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 16 Nov 2010 15:54:49 +0800 Subject: [PATCH 08/32] added accented characters to the line unwrap patterns, since they're not covered under a-z character classes using the unicode option. --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e1d73dcfd9..ef092f7954 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -475,7 +475,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 724428fec0..143ece4b79 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -258,7 +258,7 @@ class PreProcessor(object): self.log("Done dehyphenating") # Unwrap lines using punctation and line length unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) - unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator() From 4526ced6d1257a34b9f3c093f02f373291657ab8 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 16 Nov 2010 18:34:21 +0800 Subject: [PATCH 09/32] made conversion of nbsp to indent a bit smarter --- src/calibre/ebooks/conversion/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 143ece4b79..51139d3a18 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -44,10 +44,14 @@ class PreProcessor(object): span = match.group('span') self.found_indents = self.found_indents + 1 if pstyle: - if not span: - return '

' + if pstyle.lower().find('style'): + pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle) else: - return '

'+span + pstyle = pstyle+' style="text-indent:3%"' + if not span: + return '

' + else: + return '

'+span else: if not span: return '

' From 26ba75f76cc1db12439fb6f3a7c6bc9fbd049507 Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 17 Nov 2010 10:25:51 +0800 Subject: [PATCH 10/32] added a search for emphasized lines during chapter markup --- src/calibre/ebooks/conversion/utils.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 51139d3a18..bec15924d6 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -177,9 +177,10 @@ class PreProcessor(object): numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" + emphasized_lines = r"]*>\s*(]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) print "n_lookahead is " + n_lookahead print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close @@ -195,7 +196,7 @@ class PreProcessor(object): if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) print "n_lookahead is " + n_lookahead print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close @@ -203,10 +204,21 @@ class PreProcessor(object): chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines") + full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + print "n_lookahead is " + n_lookahead + print "Chapter line is " + full_chapter_line + "\n\n" + chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + print chapter_marker + chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + html = chapdetect2.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) print "n_lookahead is " + n_lookahead print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close @@ -217,7 +229,7 @@ class PreProcessor(object): if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line) + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) print "n_lookahead is " + n_lookahead print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close From a55b4dbbac65ef083d3af4943243f9b6e092d227 Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 17 Nov 2010 13:49:12 +0800 Subject: [PATCH 11/32] remove extra line feeds from html comments when sanitizing --- src/calibre/library/comments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py index 83eec89abe..00a6ef55ae 100644 --- a/src/calibre/library/comments.py +++ b/src/calibre/library/comments.py @@ -131,7 +131,8 @@ def comments_to_html(comments): def sanitize_comments_html(html): text = html2text(html) md = markdown.Markdown(safe_mode=True) - return md.convert(text) + cleansed = re.sub('\n+', '', md.convert(text)) + return cleansed def test(): for pat, val in [ From b03b8023943417dc544f70bd470ba5f61c59d848 Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 17 Nov 2010 14:12:14 +0800 Subject: [PATCH 12/32] adjusted css to compact the comments display --- src/calibre/gui2/book_details.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py index e193fe10b2..8cc2965171 100644 --- a/src/calibre/gui2/book_details.py +++ b/src/calibre/gui2/book_details.py @@ -221,6 +221,8 @@ class BookInfo(QWebView): From fb124c50a767956abcadec577fe10ad1e0e4ae80 Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 17 Nov 2010 17:55:50 +0800 Subject: [PATCH 13/32] added negative lookahead to reduce false positive matches during chapter marking --- src/calibre/ebooks/conversion/utils.py | 54 +++++++++++++++++--------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index bec15924d6..ac38a0097d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -83,6 +83,24 @@ class PreProcessor(object): if min_lns > tot_htm_ends: return True + def dump(self, raw, where): + import os + dp = getattr(self.extra_opts, 'debug_pipeline', None) + if dp and os.path.exists(dp): + odir = os.path.join(dp, 'preprocess') + if not os.path.exists(odir): + os.makedirs(odir) + if os.path.exists(odir): + odir = os.path.join(odir, where) + if not os.path.exists(odir): + os.makedirs(odir) + name, i = None, 0 + while not name or os.path.exists(os.path.join(odir, name)): + i += 1 + name = '%04d.html'%i + with open(os.path.join(odir, name), 'wb') as f: + f.write(raw.encode('utf-8')) + def __call__(self, html): self.log("********* Preprocessing HTML *********") @@ -150,7 +168,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") + #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces @@ -158,7 +176,7 @@ class PreProcessor(object): chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" chapter_header_open = r"(?P" chapter_header_close = ")\s*" - chapter_line_close = "()?\s*()?\s*()?\s*\s*" + chapter_line_close = "()?\s*()?\s*()?\s*" if blanks_between_paragraphs: blank_lines = "(\s*]*>\s*

){0,2}\s*" else: @@ -169,7 +187,7 @@ class PreProcessor(object): title_header_close = ")\s*" title_line_close = "()?\s*()?\s*(]*>)?\s*" opt_title_close = ")?" - n_lookahead_open = "(?!=" + n_lookahead_open = "\s+(?!" n_lookahead_close = ")" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" @@ -181,10 +199,10 @@ class PreProcessor(object): full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is:\n" + n_lookahead + "\n\n" + #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n" heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -197,10 +215,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) @@ -208,10 +226,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines") full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) @@ -219,10 +237,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) @@ -230,10 +248,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) From b238903ba81af83b3a7246cdf5d4d839a48f0d9b Mon Sep 17 00:00:00 2001 From: ldolse Date: Wed, 17 Nov 2010 19:27:51 +0800 Subject: [PATCH 14/32] minor tweaks to chapter marking --- src/calibre/ebooks/conversion/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ac38a0097d..fffb0d75d4 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -195,7 +195,7 @@ class PreProcessor(object): numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" - emphasized_lines = r"]*>\s*(]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" + emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) @@ -308,10 +308,10 @@ class PreProcessor(object): html = re.sub(u'\xad\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) # If still no sections after unwrapping mark split points on lines with no punctuation - if self.html_preprocess_sections < 10: + if self.html_preprocess_sections < 5: self.log("Looking for more split points based on punctuation," " currently have " + unicode(self.html_preprocess_sections)) - chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) + chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter From 2b888a4add647821774fbf92ea7807bbdf435af9 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 19 Nov 2010 10:03:56 +0800 Subject: [PATCH 15/32] fix a problem with pdf unwrap_factor getting set to 0.0 --- src/calibre/gui2/convert/pdf_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index 967a0fe234..f1ef7d24ee 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form): def set_value_handler(self, g, val): if val is None and isinstance(g, QDoubleSpinBox): - g.setValue(0.0) + g.setValue(0.45) return True From 2a40afbd8e819e8fee0261e1f35ba54af235be8d Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 19 Nov 2010 12:54:25 +0800 Subject: [PATCH 16/32] blanklines are preserved in rtf2xml, then converted to empty html paragraphs to preserver softbreaks --- src/calibre/ebooks/rtf/input.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 078b30627f..d7619d471a 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -84,7 +84,7 @@ class RTFInput(InputFormatPlugin): group_borders = 1, # Write or do not write paragraphs. Default is 0. - empty_paragraphs = 0, + empty_paragraphs = 1, ) parser.parse_rtf() ans = open('out.xml').read() @@ -228,6 +228,10 @@ class RTFInput(InputFormatPlugin): with open(html, 'wb') as f: res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines + if not getattr(self.options, 'remove_paragraph_spacing', False): + res = re.sub('\s*', '', res) + res = re.sub('\n{4}', u'\n

\u00a0

\n', res) if self.options.preprocess_html: preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) From f8f908ecd670f63ac07573d9ea330abfbca4ff3a Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 19 Nov 2010 13:23:32 +0800 Subject: [PATCH 17/32] ... --- src/calibre/ebooks/rtf/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index d7619d471a..d0ef19ecd9 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -231,7 +231,7 @@ class RTFInput(InputFormatPlugin): # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines if not getattr(self.options, 'remove_paragraph_spacing', False): res = re.sub('\s*', '', res) - res = re.sub('\n{4}', u'\n

\u00a0

\n', res) + res = re.sub('(?<=\n)\n{2}', u'

\u00a0

\n', res) if self.options.preprocess_html: preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) From 25c93421fb38455a4b57eb4e84bb9c55eb507299 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 20 Nov 2010 12:25:56 +0800 Subject: [PATCH 18/32] merge from trunk --- src/calibre/ebooks/conversion/plumber.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 9a863d7e66..d0e9aa2e99 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -957,6 +957,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, ''' Create an OEBBook. ''' + if input_plugin == 'LITInput': + print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****" from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html, opts) From 9c2dcfd5aff2b6e521677bf8afeac68fb81c7816 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 20 Nov 2010 12:26:57 +0800 Subject: [PATCH 19/32] ... --- src/calibre/ebooks/conversion/plumber.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index d0e9aa2e99..9a863d7e66 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -957,8 +957,6 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, ''' Create an OEBBook. ''' - if input_plugin == 'LITInput': - print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****" from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, opts.preprocess_html, opts) From 267eebb9aa489cc443e57e90a9353730345af0c3 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 21 Nov 2010 15:38:18 +0800 Subject: [PATCH 20/32] adjusted preprocessing regexes for hyphen removal and chapter marking --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ef092f7954..3ff816b3bf 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -220,7 +220,7 @@ class Dehyphenator(object): self.html = html self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*

\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'individual_words': diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 20689c6950..feb74324e8 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -107,7 +107,7 @@ class PreProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*

\s*", "\n

", html) - + ###### Check Markup ###### # # some lit files don't have any

tags or equivalent (generally just plain text between @@ -191,10 +191,10 @@ class PreProcessor(object): n_lookahead_close = ")" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" - typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" - numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" - uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" - numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" + typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" + numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*" + uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*" + numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close From 73278a8cd65dc780155154712ecdb77048fbacb0 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 21 Nov 2010 15:40:02 +0800 Subject: [PATCH 21/32] ... --- src/calibre/gui2/convert/pdf_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index f1ef7d24ee..967a0fe234 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form): def set_value_handler(self, g, val): if val is None and isinstance(g, QDoubleSpinBox): - g.setValue(0.45) + g.setValue(0.0) return True From c378a90a927bd9e1d075699226353ac05ccd9422 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 21 Nov 2010 16:43:06 +0800 Subject: [PATCH 22/32] reworked chapter marking code --- src/calibre/ebooks/conversion/utils.py | 86 ++++++++------------------ 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index feb74324e8..acd8d3f02a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -172,7 +172,7 @@ class PreProcessor(object): # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces - lookahead = "(?=<(p|div))" + init_lookahead = "(?=<(p|div))" chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" chapter_header_open = r"(?P" chapter_header_close = ")\s*" @@ -191,69 +191,35 @@ class PreProcessor(object): n_lookahead_close = ")" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" - typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" - numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*" - uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*" - numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*" - emphasized_lines = r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*" - - full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - #print "n_lookahead is:\n" + n_lookahead + "\n\n" - #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n" - chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n" + + min_chapters = 10 heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") - # - # Start with most typical chapter headings, get more aggressive until one works - if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + + chapter_types = [ + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)()?\s*", True, "Searching for emphasized lines"], # Emphasized lines + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters + ] + + for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: + if self.html_preprocess_sections >= min_chapters: + break + full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + if lookahead_ignorecase: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) + else: + chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + html = chapdetect.sub(self.chapter_head, html) - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") - full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - #print "n_lookahead is " + n_lookahead - #print "Chapter line is " + full_chapter_line + "\n\n" - chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print chapter_marker - chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - html = chapdetect2.sub(self.chapter_head, html) - - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines") - full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - #print "n_lookahead is " + n_lookahead - #print "Chapter line is " + full_chapter_line + "\n\n" - chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print chapter_marker - chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - html = chapdetect2.sub(self.chapter_head, html) - - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") - full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - #print "n_lookahead is " + n_lookahead - #print "Chapter line is " + full_chapter_line + "\n\n" - chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print chapter_marker - chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) - html = chapdetect2.sub(self.chapter_head, html) - - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") - full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - #print "n_lookahead is " + n_lookahead - #print "Chapter line is " + full_chapter_line + "\n\n" - chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - #print chapter_marker - chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - html = chapdetect2.sub(self.chapter_head, html) + ###### Unwrap lines ###### # From fae3252d50f3316458dad2606a3362e2345f5326 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 21 Nov 2010 17:37:56 +0800 Subject: [PATCH 23/32] further cleanup to chapter markup --- src/calibre/ebooks/conversion/utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index acd8d3f02a..af3d83da4a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -168,29 +168,30 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - #self.dump(html, 'before_chapter_markup') + self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces init_lookahead = "(?=<(p|div))" chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" + title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" chapter_header_open = r"(?P" + title_header_open = r"(?P" chapter_header_close = ")\s*" + title_header_close = ")" chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>" + title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>" + if blanks_between_paragraphs: blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" else: blank_lines = "" opt_title_open = "(" - title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" - title_header_open = "(?P<title>" - title_header_close = ")\s*" - title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>" opt_title_close = ")?" n_lookahead_open = "\s+(?!" n_lookahead_close = ")" - default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" + default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" min_chapters = 10 heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) @@ -204,7 +205,8 @@ class PreProcessor(object): [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters ] - + + # Start with most typical chapter headings, get more aggressive until one works for [chapter_type, lookahead_ignorecase, log_message] in chapter_types: if self.html_preprocess_sections >= min_chapters: break @@ -215,7 +217,9 @@ class PreProcessor(object): chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) else: + print "Chapter line is:\n"+full_chapter_line chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close + print "\nFull regex is:\n"+chapter_marker chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect.sub(self.chapter_head, html) From cf4f9e41c273fe5f63db22a33120cd3a380bc730 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 21 Nov 2010 17:49:46 +0800 Subject: [PATCH 24/32] ... --- src/calibre/ebooks/conversion/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index af3d83da4a..26c8d23e0c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -217,9 +217,7 @@ class PreProcessor(object): chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) else: - print "Chapter line is:\n"+full_chapter_line chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - print "\nFull regex is:\n"+chapter_marker chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect.sub(self.chapter_head, html) From e1602dc31a2ac0b3f8f4367fd5d881369906c7e1 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 21 Nov 2010 18:03:25 +0800 Subject: [PATCH 25/32] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 26c8d23e0c..ea78808d08 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -168,7 +168,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - self.dump(html, 'before_chapter_markup') + #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces From caabf1d814a419c79aafb1a78a372afc894420de Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 21 Nov 2010 18:29:59 +0800 Subject: [PATCH 26/32] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ea78808d08..4f3e2ed90a 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -200,7 +200,7 @@ class PreProcessor(object): chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters From 7b35480ce2acf9a947193f504fe26ac78fb8ca94 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 21 Nov 2010 19:40:17 +0800 Subject: [PATCH 27/32] fixed a problem with some formats and line unwrapping --- src/calibre/ebooks/conversion/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 4f3e2ed90a..2039a545ca 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -22,12 +22,12 @@ class PreProcessor(object): title = match.group('title') if not title: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("found " + unicode(self.html_preprocess_sections) + + self.log("marked " + unicode(self.html_preprocess_sections) + " chapters. - " + unicode(chap)) return '<h2>'+chap+'</h2>\n' else: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("found " + unicode(self.html_preprocess_sections) + + self.log("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' @@ -106,7 +106,7 @@ class PreProcessor(object): # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*</p>", "</p>\n", html) - html = re.sub(r"\s*<p>\s*", "\n<p>", html) + html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html) ###### Check Markup ###### # @@ -200,7 +200,7 @@ class PreProcessor(object): chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], - [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters + [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters @@ -241,6 +241,7 @@ class PreProcessor(object): format = 'html' # Check Line histogram to determine if the document uses hard line breaks, If 50% or # more of the lines break in the same region of the document then unwrapping is required + self.dump(html, 'before_doc_analysis_zipped_http') docanalysis = DocAnalysis(format, html) hardbreaks = docanalysis.line_histogram(.50) self.log("Hard line breaks check returned "+unicode(hardbreaks)) From 6c3fc5280f33434366ad38fc0c1128fda8810d3e Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 21 Nov 2010 19:52:55 +0800 Subject: [PATCH 28/32] ... --- src/calibre/ebooks/conversion/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2039a545ca..cca3679d14 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -241,7 +241,6 @@ class PreProcessor(object): format = 'html' # Check Line histogram to determine if the document uses hard line breaks, If 50% or # more of the lines break in the same region of the document then unwrapping is required - self.dump(html, 'before_doc_analysis_zipped_http') docanalysis = DocAnalysis(format, html) hardbreaks = docanalysis.line_histogram(.50) self.log("Hard line breaks check returned "+unicode(hardbreaks)) From 1f720bec773166ded27ced0a311f656723738bb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 21 Nov 2010 08:04:11 -0700 Subject: [PATCH 29/32] Speedup database initialization be using exists instead of a permissions check for covers --- src/calibre/library/database2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 6d18a2d663..8e7002097a 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except: # Can happen if path has not yet been set return False - return os.access(path, os.R_OK) + return os.path.exists(path) def remove_cover(self, id, notify=True): path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg') From cf2a9008ed6fc24c55ffd4002557560ef4699d46 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 21 Nov 2010 08:27:48 -0700 Subject: [PATCH 30/32] Improved Brand Eins --- resources/recipes/brand_eins.recipe | 37 +++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe index be5b98ffe6..c69dd693b2 100644 --- a/resources/recipes/brand_eins.recipe +++ b/resources/recipes/brand_eins.recipe @@ -1,18 +1,22 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 mode: python -*- + +# Find the newest version of this recipe here: +# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe __license__ = 'GPL v3' -__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>' -__version__ = '0.95' +__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>' +__version__ = '0.96' ''' http://brandeins.de - Wirtschaftsmagazin ''' import re import string from calibre.web.feeds.recipes import BasicNewsRecipe + class BrandEins(BasicNewsRecipe): - title = u'Brand Eins' + title = u'brand eins' __author__ = 'Constantin Hofstetter' description = u'Wirtschaftsmagazin' publisher ='brandeins.de' @@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' language = 'de' + publication_type = 'magazine' + needs_subscription = True # 2 is the last full magazine (default) # 1 is the newest (but not full) # 3 is one before 2 etc. - which_ausgabe = 2 + # This value can be set via the username field. + default_issue = 2 keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})] @@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe): return soup + def get_cover(self, soup): + cover_url = None + cover_item = soup.find('div', attrs = {'class': 'cover_image'}) + if cover_item: + cover_url = 'http://www.brandeins.de/' + cover_item.img['src'] + return cover_url + def parse_index(self): feeds = [] archive = "http://www.brandeins.de/archiv.html" + issue = self.default_issue + if self.username: + try: + issue = int(self.username) + except: + pass + soup = self.index_to_soup(archive) latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0] - pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe] + pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue] url = pre_latest_issue.get('href', False) # Get the title for the magazin - build it out of the title of the cover - take the issue and year; - self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date') + self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') url = 'http://brandeins.de/'+url # url = "http://www.brandeins.de/archiv/magazin/tierisch.html" @@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe): def brand_eins_parse_latest_issue(self, url): soup = self.index_to_soup(url) + self.cover_url = self.get_cover(soup) article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})] titles_and_articles = [] @@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe): current_articles.append({'title': title, 'url': url, 'description': description, 'date':''}) titles_and_articles.append([chapter_title, current_articles]) return titles_and_articles + From a2fb4d4149b222afe150431bbfe6d49cb8109d89 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 21 Nov 2010 08:41:33 -0700 Subject: [PATCH 31/32] Nikkei by Hiroshi Miura --- resources/recipes/nikkei_sub.recipe | 125 ++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 resources/recipes/nikkei_sub.recipe diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe new file mode 100644 index 0000000000..95b0017339 --- /dev/null +++ b/resources/recipes/nikkei_sub.recipe @@ -0,0 +1,125 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_subscription(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- ")) + response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + return br + + + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), + (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), + (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), + (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), + (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') + ] + + + From fc524ee7d4eccbe6f8a0ae63f56a06caaf248fa0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 21 Nov 2010 09:52:40 -0700 Subject: [PATCH 32/32] Metadata and cover download plugins from Nicebooks --- src/calibre/customize/builtins.py | 6 +- src/calibre/customize/ui.py | 2 +- src/calibre/ebooks/metadata/nicebooks.py | 424 +++++++++++++++++++++++ src/calibre/gui2/wizard/__init__.py | 12 +- 4 files changed, 437 insertions(+), 7 deletions(-) create mode 100644 src/calibre/ebooks/metadata/nicebooks.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 681d953c9b..87946706cf 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,6 +483,7 @@ from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX @@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] + LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, + NiceBooksCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 844269e453..c360122842 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Douban Books', 'Douban.com covers', + 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers' ]) def is_disabled(plugin): diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py new file mode 100644 index 0000000000..4d19e9611b --- /dev/null +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -0,0 +1,424 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian <sengian1@gmail.com>' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re, traceback, socket +from urllib import urlencode +from math import ceil +from copy import deepcopy + +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata.covers import CoverDownload +from calibre.utils.config import OptionParser + +class NiceBooks(MetadataSource): + + name = 'Nicebooks' + description = _('Downloads metadata from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class NiceBooksCovers(CoverDownload): + + name = 'Nicebooks covers' + description = _('Downloads covers from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + type = _('Cover download') + version = (1, 0, 0) + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + if Covers(mi.isbn)(entry).check_cover(): + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout) + if not ext: + ext = 'jpg' + result_queue.put((True, cover_data, ext, self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL = 'http://fr.nicebooks.com/' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + if isbn is not None: + q = isbn + else: + q = ' '.join([i for i in (title, author, publisher, keywords) \ + if i is not None]) + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'}) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + #nb of page to call + try: + nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) + except: + #direct hit + return [feed] + + nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10)) + pages =[feed] + if nbpagetoquery > 1: + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&p=' + str(i) + raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.find_class('title')[0].get('href') \ + for i in x.xpath("//ul[@id='results']/li")]) + return results[:self.max_results] + +class ResultList(list): + + BASE_URL = 'http://fr.nicebooks.com' + + def __init__(self): + self.repub = re.compile(u'\s*.diteur\s*', re.I) + self.reauteur = re.compile(u'\s*auteur.*', re.I) + self.reautclean = re.compile(u'\s*\(.*\)\s*') + + def get_title(self, entry): + # title = deepcopy(entry.find("div[@id='book-info']")) + title = deepcopy(entry) + title.remove(title.find("dl[@title='Informations sur le livre']")) + title = ' '.join([i.text_content() for i in title.iterchildren()]) + return unicode(title.replace('\n', '')) + + def get_authors(self, entry): + # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + author = entry.find("dl[@title='Informations sur le livre']") + authortext = [] + for x in author.getiterator('dt'): + if self.reauteur.match(x.text): + elt = x.getnext() + while elt.tag == 'dd': + authortext.append(unicode(elt.text_content())) + elt = elt.getnext() + break + if len(authortext) == 1: + authortext = [self.reautclean.sub('', authortext[0])] + return authortext + + def get_description(self, entry, verbose): + try: + return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) + except: + report(verbose) + return None + + def get_book_info(self, entry, mi, verbose): + entry = entry.find("dl[@title='Informations sur le livre']") + for x in entry.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content().replace('-', '') + if check_isbn(isbntext): + mi.isbn = unicode(isbntext) + elif self.repub.match(x.text): + mi.publisher = unicode(x.getnext().text_content()) + elif x.text == 'Langue': + mi.language = unicode(x.getnext().text_content()) + elif x.text == 'Date de parution': + d = x.getnext().text_content() + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + return mi + + def fill_MI(self, entry, title, authors, verbose): + mi = MetaInformation(title, authors) + mi.author_sort = authors_to_sort_string(authors) + mi.comments = self.get_description(entry, verbose) + # entry = entry.find("dl[@title='Informations sur le livre']") + # mi.publisher = self.get_publisher(entry) + # mi.pubdate = self.get_date(entry, verbose) + # mi.isbn = self.get_ISBN(entry) + # mi.language = self.get_language(entry) + return self.get_book_info(entry, mi, verbose) + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//div[@id='container']")[0] + + def populate(self, entries, browser, verbose=False): + #single entry + if len(entries) == 1 and not isinstance(entries[0], str): + try: + entry = entries[0].xpath("//div[@id='container']")[0] + entry = entry.find("div[@id='book-info']") + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + entry = entry.find("div[@id='book-info']") + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) + + +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + +class Covers(object): + + def __init__(self, isbn = None): + assert isbn is not None + self.urlimg = '' + self.isbn = isbn + self.isbnf = False + + def __call__(self, entry = None): + try: + self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href') + except: + return self + isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") + for x in isbno.getiterator('dt'): + if x.text == 'ISBN' and check_isbn(x.getnext().text_content()): + self.isbnf = True + break + return self + + def check_cover(self): + return True if self.urlimg else False + + def get_cover(self, browser, timeout = 5.): + try: + cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + self.urlimg.rpartition('.')[-1] + return cover, ext if ext else 'jpg' + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise err + if not len(self.urlimg): + if not self.isbnf: + raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.')) + raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher')) + + +def search(title=None, author=None, publisher=None, isbn=None, + max_results=5, verbose=False, keywords=None): + br = browser() + entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + if entries is None or len(entries) == 0: + return + + #List of entry + ans = ResultList() + ans.populate(entries, br, verbose) + return ans + +def check_for_cover(isbn): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False)[0] + return Covers(isbn)(entry).check_cover() + +def cover_from_isbn(isbn, timeout = 5.): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0] + return Covers(isbn)(entry).get_cover(br, timeout) + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Nicebooks. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + It can also get covers if the option is activated. + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-i', '--isbn', help='Book ISBN') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-c', '--covers', default=0, + help='Covers: 1-Check/ 2-Download') + parser.add_option('-p', '--coverspath', default='', + help='Covers files path') + parser.add_option('-m', '--max-results', default=20, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + import os + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + if results is None or len(results) == 0: + print 'No result found for this search!' + return 0 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + covact = int(opts.covers) + if covact == 1: + textcover = 'No cover found!' + if check_for_cover(result.isbn): + textcover = 'A cover was found for this book' + print textcover + elif covact == 2: + cover_data, ext = cover_from_isbn(result.isbn) + cpath = result.isbn + if len(opts.coverspath): + cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) + oname = os.path.abspath(cpath+'.'+ext) + open(oname, 'wb').write(cover_data) + print 'Cover saved to file ', oname + print + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index e2f463b80b..4f418d34d5 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI): self.emit(SIGNAL('retranslate()')) self.init_languages() try: - if prefs['language'].lower().startswith('zh'): - from calibre.customize.ui import enable_plugin - for name in ('Douban Books', 'Douban.com covers'): - enable_plugin(name) + lang = prefs['language'].lower()[:2] + metadata_plugins = { + 'zh' : ('Douban Books', 'Douban.com covers'), + 'fr' : ('Nicebooks', 'Nicebooks covers'), + }.get(lang, []) + from calibre.customize.ui import enable_plugin + for name in metadata_plugins: + enable_plugin(name) except: pass