From 9bbff15c27c2be0b6101f17ddaa7f53a504824ea Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 02:12:09 +0800 Subject: [PATCH 01/15] text processing tweaks --- src/calibre/ebooks/conversion/utils.py | 4 ++-- src/calibre/ebooks/txt/input.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 27dacdf5fb..52d1bcc619 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -190,7 +190,7 @@ class PreProcessor(object): line_ending = "\s*\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" - txt_line_wrap = u"(\u0020|\u0009)*\n" + txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" unwrap_regex = lookahead+line_ending+blanklines+line_opening if format == 'txt': @@ -357,6 +357,6 @@ class PreProcessor(object): html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) # Center separator lines - html = re.sub(u'
<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html) + html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>
', html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 98756c5fa1..eac46385a7 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin): # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single' or 'unformatted': + if options.paragraph_type in ('single', 'unformatted'): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) From e9130241603a99f7e8dddfb8ff7df6edf4faacb5 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 10:40:30 +0800 Subject: [PATCH 02/15] ... --- src/calibre/ebooks/txt/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e2405de617..34a702cc55 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -23,7 +23,7 @@ class TXTInput(InputFormatPlugin): options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', - choices=['auto', 'block', 'single', 'print'], + choices=['auto', 'block', 'single', 'print', 'unformatted'], help=_('Paragraph structure.\n' 'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n' '* auto: Try to auto detect paragraph type.\n' From 289cdf33925dc4f80c08889e941becc9c3862471 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 10:43:24 +0800 Subject: [PATCH 03/15] changed unformatted description --- src/calibre/ebooks/txt/input.py | 2 +- src/calibre/ebooks/txt/processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 34a702cc55..9bc9323a4c 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -31,7 +31,7 @@ class TXTInput(InputFormatPlugin): '* single: Assume every line is a paragraph.\n' '* print: Assume every line starting with 2+ spaces or a tab ' 'starts a paragraph.' - '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')), + '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')), OptionRecommendation(name='formatting_type', recommended_value='auto', choices=['auto', 'none', 'heuristic', 'markdown'], help=_('Formatting used within the document.' diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 9dc29e45dd..e26f0a9d07 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -117,7 +117,7 @@ def detect_paragraph_type(txt): single: Each line is a paragraph. print: Each paragraph starts with a 2+ spaces or a tab and ends when a new paragraph is reached. 
- unformatted: most lines have hard line breaks, few/no spaces or indents + unformatted: most lines have hard line breaks, few/no blank lines or indents returns block, single, print, unformatted ''' From f3a9f3f83f7da4821bdc1fca2ba0df66aca714e1 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 17:27:24 +0800 Subject: [PATCH 04/15] added dehyphenation to txt input --- src/calibre/ebooks/conversion/preprocess.py | 15 +++++++++++---- src/calibre/ebooks/txt/input.py | 18 ++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index ae111355e4..df9fd66407 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,6 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') + raw = raw.replace('\r\n', '\n') + raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*
</p>).*?(?=</p>
)', re.DOTALL) elif format == 'pdf': @@ -79,7 +81,7 @@ class DocAnalysis(object): elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) elif format == 'txt': - linere = re.compile('.*?\n', re.DOTALL) + linere = re.compile('.*?\n') self.lines = linere.findall(raw) def line_length(self, percent): @@ -177,7 +179,7 @@ class Dehyphenator(object): def __init__(self): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' - self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) @@ -194,7 +196,7 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) except: @@ -225,8 +227,13 @@ class Dehyphenator(object): intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*(\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
</p>
\s*)?\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length) elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P
<wraptags><p>|</[iub]>\s*<p>
\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'txt': + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet + elif format == 'individual_words_txt': + intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') + elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 9bc9323a4c..f6adb617c3 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation +from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ @@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin): log.debug('Could not reliably determine paragraph type using block') options.paragraph_type = 'block' else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(txt,'txt', length) + # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. 
@@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin): if options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import PreProcessor - from calibre.ebooks.conversion.preprocess import DocAnalysis # get length - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + # unwrap lines based on punctuation preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') @@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin): html = convert_heuristic(txt, epub_split_size_kb=flow_size) else: html = convert_basic(txt, epub_split_size_kb=flow_size) - from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') From 696d9252324a5fa31ae91f8a3c5d472b5d5d953c Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 18:14:49 +0800 Subject: [PATCH 05/15] normalized line endings to simplify line length and dehyphenation, fixes print formatted output for certain line endings --- src/calibre/ebooks/conversion/preprocess.py | 10 +++++----- src/calibre/ebooks/txt/input.py | 8 ++++++-- src/calibre/ebooks/txt/processor.py | 5 ++++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index df9fd66407..d9d735e391 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,8 +72,8 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') - raw = raw.replace('\r\n', '\n') - raw = raw.replace('\r', '\n') + #raw = raw.replace('\r\n', '\n') + #raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*
</p>).*?(?=</p>
)', re.DOTALL) elif format == 'pdf': @@ -214,10 +214,10 @@ class Dehyphenator(object): else: if self.html.find(lookupword) != -1 or searchresult != -1: - #print "returned dehyphenated word: " + str(dehyphenated) + print "returned dehyphenated word: " + str(dehyphenated) return dehyphenated else: - #print " returned hyphenated word: " + str(hyphenated) + print " returned hyphenated word: " + str(hyphenated) return hyphenated def __call__(self, html, format, length=1): @@ -228,7 +228,7 @@ class Dehyphenator(object): elif format == 'pdf': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P
<wraptags><p>|</[iub]>\s*<p>
\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'txt': - intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'individual_words_txt': diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index f6adb617c3..2e35e8e345 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - convert_heuristic + convert_heuristic, normalize_line_endings from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + # Normalize line endings + txt = normalize_line_endings(txt) + # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) + print "length is "+str(length) # Dehyphenate dehyphenator = Dehyphenator() - html = dehyphenator(txt,'txt', length) + txt = dehyphenator(txt,'txt', length) # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. 
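The txt/input.py hunk above reorders the TXT pipeline to normalize line endings first, then measure a typical line length, then dehyphenate before any paragraph unwrapping. A minimal self-contained sketch of that flow (the helper names and the percentile heuristic are illustrative stand-ins, not calibre's actual DocAnalysis/Dehyphenator API):

import re

def normalize_line_endings(txt):
    # Collapse Windows/Mac line endings so later regexes only see '\n'
    return txt.replace('\r\n', '\n').replace('\r', '\n')

def typical_line_length(txt, percentile=0.5):
    # Crude stand-in for DocAnalysis.line_length(): length of the line at the
    # given percentile, ignoring blank lines
    lengths = sorted(len(line) for line in txt.split('\n') if line.strip())
    return lengths[int((len(lengths) - 1) * percentile)] if lengths else 0

def dehyphenate(txt):
    # Rejoin words split by a trailing hyphen at a hard line break, e.g.
    # "con-\nverter" -> "converter"; the patch's regex additionally requires a
    # minimum number of characters before the hyphen (the measured length)
    return re.sub(r'(\w+)-[ \t]*\n[ \t]*(\w+)', r'\1\2', txt)

raw = "An old con-\r\nverter wrapped and hyphen-\rated every line.\r\n"
txt = normalize_line_endings(raw)
length = typical_line_length(txt)
print(length)              # 16 for this sample text
print(dehyphenate(txt))    # An old converter wrapped and hyphenated every line.

The real Dehyphenator is more conservative than this sketch: as the surrounding hunks show, it strips common suffixes/prefixes from the candidate and only joins the two halves if the resulting word already occurs elsewhere in the document.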
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index e26f0a9d07..ebdadebda2 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False): safe_mode=False) return HTML_TEMPLATE % (title, md.convert(txt)) -def separate_paragraphs_single_line(txt): +def normalize_line_endings(txt): txt = txt.replace('\r\n', '\n') txt = txt.replace('\r', '\n') + return txt + +def separate_paragraphs_single_line(txt): txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt) return txt From 0f109d699f06967394370150a0a35bf671a283c6 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 18:38:52 +0800 Subject: [PATCH 06/15] tweaked the auto-detection to handle cases where the vast majority of the lines are formatted as block or print --- src/calibre/ebooks/txt/processor.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index ebdadebda2..6a1a106681 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -133,15 +133,21 @@ def detect_paragraph_type(txt): hardbreaks = docanalysis.line_histogram(.55) if hardbreaks: - # Check for print + # Determine print percentage tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt)) - if tab_line_count / float(txt_line_count) >= .15: - return 'print' - - # Check for block + print_percent = tab_line_count / float(txt_line_count) + + # Determine block percentage empty_line_count = len(re.findall('(?mu)^\s*$', txt)) - if empty_line_count / float(txt_line_count) >= .15: - return 'block' + block_percent = empty_line_count / float(txt_line_count) + + # Compare the two types - the type with the larger number of instances wins + # in cases where only one or the other represents the vast majority of the document neither wins + if print_percent >= block_percent: + if .15 <= print_percent <= .75: + return 'print' + elif .15 <= block_percent <= .75: + return 'block' # Assume unformatted text with hardbreaks if nothing else matches return 'unformatted' From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 19:34:02 +0800 Subject: [PATCH 07/15] added partial dehyphenation for markdown --- src/calibre/ebooks/conversion/preprocess.py | 16 +++++++-------- src/calibre/ebooks/txt/input.py | 22 +++++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d9d735e391..e2c51846a4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -201,15 +201,15 @@ class Dehyphenator(object): searchresult = self.html.find(lookupword.lower()) except: return hyphenated - if self.format == 'html_cleanup': + if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: - #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated elif self.html.find(hyphenated) != -1: - #print "Cleanup:returned hyphenated word: " + str(hyphenated) + print "Cleanup:returned hyphenated word: " + str(hyphenated) return hyphenated else: - #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + print "Cleanup:returning original text 
"+str(firsthalf)+" + linefeed "+str(secondhalf) return firsthalf+u'\u2014'+wraptags+secondhalf else: @@ -230,12 +230,12 @@ class Dehyphenator(object): elif format == 'txt': intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet - elif format == 'individual_words_txt': - intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b') - + intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet elif format == 'html_cleanup': intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') + elif format == 'txt_cleanup': + intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)') + html = intextmatch.sub(self.dehyphenate, html) return html diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 2e35e8e345..5fbdc7131a 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin): # followed by the   entity. if options.preserve_spaces: txt = preserve_spaces(txt) + + # Normalize line endings + txt = normalize_line_endings(txt) + + # Get length for hyphen removal and punctuation unwrap + docanalysis = DocAnalysis('txt', txt) + length = docanalysis.line_length(.5) + print "length is "+str(length) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin): else: log.debug('Auto detected paragraph type as %s' % options.paragraph_type) - # Normalize line endings - txt = normalize_line_endings(txt) - - # Get length for hyphen removal and punctuation unwrap - docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) - print "length is "+str(length) - # Dehyphenate dehyphenator = Dehyphenator() txt = dehyphenator(txt,'txt', length) @@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin): else: html = convert_basic(txt, epub_split_size_kb=flow_size) + # Dehyphenate in cleanup mode for missed txt and markdown conversion + print "going through final dehyphenation" + dehyphenator = Dehyphenator() + html = dehyphenator(html,'txt_cleanup', length) + html = dehyphenator(html,'html_cleanup', length) + from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: From 9751f99db95185a9a6cdf66029f1d46e4a9d90d8 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 19:57:15 +0800 Subject: [PATCH 08/15] cleaned up print statements --- src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------ src/calibre/ebooks/txt/input.py | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e2c51846a4..32eee713fe 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -196,28 +196,28 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + #print 
"lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) try: searchresult = self.html.find(lookupword.lower()) except: return hyphenated if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.html.find(lookupword) != -1 or searchresult != -1: - print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) return dehyphenated elif self.html.find(hyphenated) != -1: - print "Cleanup:returned hyphenated word: " + str(hyphenated) + #print "Cleanup:returned hyphenated word: " + str(hyphenated) return hyphenated else: - print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) return firsthalf+u'\u2014'+wraptags+secondhalf else: if self.html.find(lookupword) != -1 or searchresult != -1: - print "returned dehyphenated word: " + str(dehyphenated) + #print "returned dehyphenated word: " + str(dehyphenated) return dehyphenated else: - print " returned hyphenated word: " + str(hyphenated) + #print " returned hyphenated word: " + str(hyphenated) return hyphenated def __call__(self, html, format, length=1): diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5fbdc7131a..3957391494 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -80,7 +80,6 @@ class TXTInput(InputFormatPlugin): # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) - print "length is "+str(length) if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -130,7 +129,6 @@ class TXTInput(InputFormatPlugin): html = convert_basic(txt, epub_split_size_kb=flow_size) # Dehyphenate in cleanup mode for missed txt and markdown conversion - print "going through final dehyphenation" dehyphenator = Dehyphenator() html = dehyphenator(html,'txt_cleanup', length) html = dehyphenator(html,'html_cleanup', length) From 7008e9b64cbe98ca43e77965a84a3f5af4e88f6d Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 9 Jan 2011 21:56:12 +0800 Subject: [PATCH 09/15] ... --- src/calibre/ebooks/conversion/preprocess.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 32eee713fe..08a46cb8d9 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -72,8 +72,6 @@ class DocAnalysis(object): def __init__(self, format='html', raw=''): raw = raw.replace(' ', ' ') - #raw = raw.replace('\r\n', '\n') - #raw = raw.replace('\r', '\n') if format == 'html': linere = re.compile('(?<=]*>\s*
</p>).*?(?=</p>
)', re.DOTALL) elif format == 'pdf': From 1670cd29bae7b41186141f902e0057676d985967 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 9 Jan 2011 10:32:19 -0700 Subject: [PATCH 10/15] Cicero by mad --- resources/recipes/cicero.recipe | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 resources/recipes/cicero.recipe diff --git a/resources/recipes/cicero.recipe b/resources/recipes/cicero.recipe new file mode 100644 index 0000000000..2df6b68000 --- /dev/null +++ b/resources/recipes/cicero.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Cicero(BasicNewsRecipe): + timefmt = ' [%Y-%m-%d]' + title = u'Cicero' + __author__ = 'mad@sharktooth.de' + description = u'Magazin f\xfcr politische Kultur' + oldest_article = 7 + language = 'de' + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + publisher = 'Ringier Publishing' + category = 'news, politics, Germany' + encoding = 'iso-8859-1' + publication_type = 'magazine' + masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif' + feeds = [ +(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='), +#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'), +#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'), +#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'), +#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'), +#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'), +#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'), +#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'), +#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'), +#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'), +(u'Kolumne - Alle Kolulmnen', u'http://www.cicero.de/rss/rss2.php?ress_id='), +#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'), +#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34') +] + + def print_version(self, url): + return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2] From e58ccd8c5e4f4a251c8bf738a621d1a29c6e91da Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 9 Jan 2011 10:55:15 -0700 Subject: [PATCH 11/15] Fix XSS vulnerability in content server. Fixes #7980 (Security vulnerability in Calibre 0.7.34) --- src/calibre/library/server/browse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 37799c4cbc..3e4687be95 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -756,7 +756,7 @@ class BrowseServer(object): sort = self.browse_sort_book_list(items, list_sort) ids = [x[0] for x in items] html = render_book_list(ids, self.opts.url_prefix, - suffix=_('in search')+': '+query) + suffix=_('in search')+': '+xml(query)) return self.browse_template(sort, category=False, initial_search=query).format( title=_('Matching books'), script='booklist();', main=html) From 31c354a164a8816576ce5194a6b0e1b5d64b6728 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 9 Jan 2011 11:15:34 -0700 Subject: [PATCH 12/15] ... 
--- setup/build_environment.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/setup/build_environment.py b/setup/build_environment.py index 10ab1b0735..bdfddd2205 100644 --- a/setup/build_environment.py +++ b/setup/build_environment.py @@ -117,7 +117,6 @@ if iswindows: poppler_inc_dirs = consolidate('POPPLER_INC_DIR', r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir)) - popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4'] poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir) popplerqt4_lib_dirs = poppler_lib_dirs poppler_libs = ['poppler'] @@ -131,7 +130,6 @@ elif isosx: fc_lib = '/sw/lib' poppler_inc_dirs = consolidate('POPPLER_INC_DIR', '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5') - popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4'] poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', '/sw/lib') poppler_libs = ['poppler'] @@ -150,9 +148,6 @@ else: # Include directories poppler_inc_dirs = pkgconfig_include_dirs('poppler', 'POPPLER_INC_DIR', '/usr/include/poppler') - popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '') - if not popplerqt4_inc_dirs: - popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4'] png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR', '/usr/include') magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick') @@ -187,13 +182,10 @@ if not poppler_inc_dirs or not os.path.exists( poppler_error = \ ('Poppler not found on your system. Various PDF related', ' functionality will not work. Use the POPPLER_INC_DIR and', - ' POPPLER_LIB_DIR environment variables.') - -popplerqt4_error = None -if not popplerqt4_inc_dirs or not os.path.exists( - os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')): - popplerqt4_error = \ - ('Poppler Qt4 bindings not found on your system.') + ' POPPLER_LIB_DIR environment variables. calibre requires ' + ' the poppler XPDF headers. If your distro does not ' + ' include them you will have to re-compile poppler ' + ' by hand with --enable-xpdf-headers') magick_error = None if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0], From d63bfeff1158fc9f8ef9f7ba78cd7b39f18c9a98 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 9 Jan 2011 11:18:35 -0700 Subject: [PATCH 13/15] ... --- setup/build_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/build_environment.py b/setup/build_environment.py index bdfddd2205..f0adaf9584 100644 --- a/setup/build_environment.py +++ b/setup/build_environment.py @@ -192,7 +192,7 @@ if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0], 'wand')): magick_error = ('ImageMagick not found on your system. 
' 'Try setting the environment variables MAGICK_INC ' - 'and MAGICK_LIB to help calibre locate the inclue and libbrary ' + 'and MAGICK_LIB to help calibre locate the include and library ' 'files.') podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) From be03e57f2cf8d25b87e888b781ab14cc4ff3b20f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 9 Jan 2011 11:44:43 -0700 Subject: [PATCH 14/15] El Correo by desUBIKado --- resources/recipes/el_correo.recipe | 122 +++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 resources/recipes/el_correo.recipe diff --git a/resources/recipes/el_correo.recipe b/resources/recipes/el_correo.recipe new file mode 100644 index 0000000000..9190560b02 --- /dev/null +++ b/resources/recipes/el_correo.recipe @@ -0,0 +1,122 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '08 Januery 2011, desUBIKado' +__author__ = 'desUBIKado' +__description__ = 'Daily newspaper from Biscay' +__version__ = 'v0.08' +__date__ = '08, Januery 2011' +''' +[url]http://www.elcorreo.com/[/url] +''' + +import time +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class heraldo(BasicNewsRecipe): + __author__ = 'desUBIKado' + description = 'Daily newspaper from Biscay' + title = u'El Correo' + publisher = 'Vocento' + category = 'News, politics, culture, economy, general interest' + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + language = 'es' + timefmt = '[%a, %d %b, %Y]' + encoding = 'iso-8859-1' + remove_empty_feeds = True + remove_javascript = False + + feeds = [ + (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'), + (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'), + (u'Internacional', u'hhttp://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'), + (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'), + (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'), + (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'), + (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'), + (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'), + (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'), + (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'), + (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml') + ] + + keep_only_tags = [ + dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}), + dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}), + dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}), + dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}), + dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}), + dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}), + dict(name='div', attrs={'id':['articulopina']}), + dict(name='br', attrs={'class':'clear'}), + dict(name='form', attrs={'name':'frm_conversor2'}) + ] + + remove_tags_before = dict(name='div' , attrs={'class':'articulo '}) + remove_tags_after 
= dict(name='div' , attrs={'class':'comentarios'}) + + def get_cover_url(self): + cover = None + st = time.localtime() + year = str(st.tm_year) + month = "%.2d" % st.tm_mon + day = "%.2d" % st.tm_mday + #[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url] + #[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url] + cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf' + + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + self.log("\nPortada no disponible") + cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png' + return cover + + extra_css = ''' + h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;} + h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;} + h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;} + h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;} + h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;} + h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;} + .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;} + img{margin-bottom: 0.4em} + ''' + + + + preprocess_regexps = [ + + # To present the image of the embedded video + (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: ''), + (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '