From 9bbff15c27c2be0b6101f17ddaa7f53a504824ea Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 02:12:09 +0800
Subject: [PATCH 01/15] text processing tweaks
---
src/calibre/ebooks/conversion/utils.py | 4 ++--
src/calibre/ebooks/txt/input.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 27dacdf5fb..52d1bcc619 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -190,7 +190,7 @@ class PreProcessor(object):
line_ending = "\s*(span|p|div)>\s*((p|span|div)>)?"
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
- txt_line_wrap = u"(\u0020|\u0009)*\n"
+ txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt':
@@ -357,6 +357,6 @@ class PreProcessor(object):
html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
# Center separator lines
- html = re.sub(u'\s*(?P([*#•]+\s*)+)\s*
', '' + '\g' + '
', html)
+ html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 98756c5fa1..eac46385a7 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -90,7 +90,7 @@ class TXTInput(InputFormatPlugin):
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
- if options.paragraph_type == 'single' or 'unformatted':
+ if options.paragraph_type in ('single', 'unformatted'):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
From e9130241603a99f7e8dddfb8ff7df6edf4faacb5 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 10:40:30 +0800
Subject: [PATCH 02/15] ...
---
src/calibre/ebooks/txt/input.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index e2405de617..34a702cc55 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -23,7 +23,7 @@ class TXTInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='paragraph_type', recommended_value='auto',
- choices=['auto', 'block', 'single', 'print'],
+ choices=['auto', 'block', 'single', 'print', 'unformatted'],
help=_('Paragraph structure.\n'
'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
'* auto: Try to auto detect paragraph type.\n'
From 289cdf33925dc4f80c08889e941becc9c3862471 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 10:43:24 +0800
Subject: [PATCH 03/15] changed unformatted description
---
src/calibre/ebooks/txt/input.py | 2 +-
src/calibre/ebooks/txt/processor.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 34a702cc55..9bc9323a4c 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -31,7 +31,7 @@ class TXTInput(InputFormatPlugin):
'* single: Assume every line is a paragraph.\n'
'* print: Assume every line starting with 2+ spaces or a tab '
'starts a paragraph.'
- '* unformatted: Most lines have hard line breaks, few/no spaces or indents.')),
+ '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=['auto', 'none', 'heuristic', 'markdown'],
help=_('Formatting used within the document.'
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 9dc29e45dd..e26f0a9d07 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -117,7 +117,7 @@ def detect_paragraph_type(txt):
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
- unformatted: most lines have hard line breaks, few/no spaces or indents
+ unformatted: most lines have hard line breaks, few/no blank lines or indents
returns block, single, print, unformatted
'''
From f3a9f3f83f7da4821bdc1fca2ba0df66aca714e1 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 17:27:24 +0800
Subject: [PATCH 04/15] added dehyphenation to txt input
---
src/calibre/ebooks/conversion/preprocess.py | 15 +++++++++++----
src/calibre/ebooks/txt/input.py | 18 ++++++++++++------
2 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ae111355e4..df9fd66407 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,6 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
+ raw = raw.replace('\r\n', '\n')
+ raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=
)', re.DOTALL)
elif format == 'pdf':
@@ -79,7 +81,7 @@ class DocAnalysis(object):
elif format == 'spanned_html':
linere = re.compile('(?<=)', re.DOTALL)
elif format == 'txt':
- linere = re.compile('.*?\n', re.DOTALL)
+ linere = re.compile('.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):
@@ -177,7 +179,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
@@ -194,7 +196,7 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
@@ -225,8 +227,13 @@ class Dehyphenator(object):
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P()?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
+ elif format == 'txt':
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
+ elif format == 'individual_words_txt':
+ intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b')
+
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 9bc9323a4c..f6adb617c3 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
@@ -91,8 +92,16 @@ class TXTInput(InputFormatPlugin):
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
- log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
-
+ log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+
+ # Dehyphenate
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(txt,'txt', length)
+
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
@@ -103,10 +112,8 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor
- from calibre.ebooks.conversion.preprocess import DocAnalysis
# get length
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
+
# unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
@@ -117,7 +124,6 @@ class TXTInput(InputFormatPlugin):
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
-
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
From 696d9252324a5fa31ae91f8a3c5d472b5d5d953c Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 18:14:49 +0800
Subject: [PATCH 05/15] normalized line endings to simplify line length and
dehyphenation, fixes print formatted output for certain line endings
---
src/calibre/ebooks/conversion/preprocess.py | 10 +++++-----
src/calibre/ebooks/txt/input.py | 8 ++++++--
src/calibre/ebooks/txt/processor.py | 5 ++++-
3 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index df9fd66407..d9d735e391 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,8 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
- raw = raw.replace('\r\n', '\n')
- raw = raw.replace('\r', '\n')
+ #raw = raw.replace('\r\n', '\n')
+ #raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=)', re.DOTALL)
elif format == 'pdf':
@@ -214,10 +214,10 @@ class Dehyphenator(object):
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
- #print "returned dehyphenated word: " + str(dehyphenated)
+ print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
- #print " returned hyphenated word: " + str(hyphenated)
+ print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
@@ -228,7 +228,7 @@ class Dehyphenator(object):
elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'txt':
- intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P((\n|\r|\r\n)(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'individual_words_txt':
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index f6adb617c3..2e35e8e345 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
- convert_heuristic
+ convert_heuristic, normalize_line_endings
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -94,13 +94,17 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
+ print "length is "+str(length)
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(txt,'txt', length)
+ txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e26f0a9d07..ebdadebda2 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -80,9 +80,12 @@ def convert_markdown(txt, title='', disable_toc=False):
safe_mode=False)
return HTML_TEMPLATE % (title, md.convert(txt))
-def separate_paragraphs_single_line(txt):
+def normalize_line_endings(txt):
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
+ return txt
+
+def separate_paragraphs_single_line(txt):
txt = re.sub(u'(?<=.)\n(?=.)', '\n\n', txt)
return txt
From 0f109d699f06967394370150a0a35bf671a283c6 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 18:38:52 +0800
Subject: [PATCH 06/15] tweaked the auto-detection to handle cases where the
vast majority of the lines are formatted as block or print
---
src/calibre/ebooks/txt/processor.py | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index ebdadebda2..6a1a106681 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -133,15 +133,21 @@ def detect_paragraph_type(txt):
hardbreaks = docanalysis.line_histogram(.55)
if hardbreaks:
- # Check for print
+ # Determine print percentage
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
- if tab_line_count / float(txt_line_count) >= .15:
- return 'print'
-
- # Check for block
+ print_percent = tab_line_count / float(txt_line_count)
+
+ # Determine block percentage
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
- if empty_line_count / float(txt_line_count) >= .15:
- return 'block'
+ block_percent = empty_line_count / float(txt_line_count)
+
+ # Compare the two types - the type with the larger number of instances wins
+ # in cases where only one or the other represents the vast majority of the document neither wins
+ if print_percent >= block_percent:
+ if .15 <= print_percent <= .75:
+ return 'print'
+ elif .15 <= block_percent <= .75:
+ return 'block'
# Assume unformatted text with hardbreaks if nothing else matches
return 'unformatted'
From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 19:34:02 +0800
Subject: [PATCH 07/15] added partial dehyphenation for markdown
---
src/calibre/ebooks/conversion/preprocess.py | 16 +++++++--------
src/calibre/ebooks/txt/input.py | 22 +++++++++++++--------
2 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d9d735e391..e2c51846a4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -201,15 +201,15 @@ class Dehyphenator(object):
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
- if self.format == 'html_cleanup':
+ if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
- #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
- #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
- #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
@@ -230,12 +230,12 @@ class Dehyphenator(object):
elif format == 'txt':
intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P(\n(\u0020|\u0009)*)+)(?P[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P\w+)\b[^<]*<') # for later, not called anywhere yet
- elif format == 'individual_words_txt':
- intextmatch = re.compile(u'\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b')
-
+ intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
+ elif format == 'txt_cleanup':
+ intextmatch = re.compile(u'(?P\w+)(-|‐)(?P\s+)(?P[\w\d]+)')
+
html = intextmatch.sub(self.dehyphenate, html)
return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 2e35e8e345..5fbdc7131a 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin):
# followed by the entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
+
+ # Normalize line endings
+ txt = normalize_line_endings(txt)
+
+ # Get length for hyphen removal and punctuation unwrap
+ docanalysis = DocAnalysis('txt', txt)
+ length = docanalysis.line_length(.5)
+ print "length is "+str(length)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
- # Normalize line endings
- txt = normalize_line_endings(txt)
-
- # Get length for hyphen removal and punctuation unwrap
- docanalysis = DocAnalysis('txt', txt)
- length = docanalysis.line_length(.5)
- print "length is "+str(length)
-
# Dehyphenate
dehyphenator = Dehyphenator()
txt = dehyphenator(txt,'txt', length)
@@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
else:
html = convert_basic(txt, epub_split_size_kb=flow_size)
+ # Dehyphenate in cleanup mode for missed txt and markdown conversion
+ print "going through final dehyphenation"
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'txt_cleanup', length)
+ html = dehyphenator(html,'html_cleanup', length)
+
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
From 9751f99db95185a9a6cdf66029f1d46e4a9d90d8 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 19:57:15 +0800
Subject: [PATCH 08/15] cleaned up print statements
---
src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
src/calibre/ebooks/txt/input.py | 2 --
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e2c51846a4..32eee713fe 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -196,28 +196,28 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
- print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
- print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ #print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
- print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else:
if self.html.find(lookupword) != -1 or searchresult != -1:
- print "returned dehyphenated word: " + str(dehyphenated)
+ #print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
- print " returned hyphenated word: " + str(hyphenated)
+ #print " returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5fbdc7131a..3957391494 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -80,7 +80,6 @@ class TXTInput(InputFormatPlugin):
# Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
- print "length is "+str(length)
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
@@ -130,7 +129,6 @@ class TXTInput(InputFormatPlugin):
html = convert_basic(txt, epub_split_size_kb=flow_size)
# Dehyphenate in cleanup mode for missed txt and markdown conversion
- print "going through final dehyphenation"
dehyphenator = Dehyphenator()
html = dehyphenator(html,'txt_cleanup', length)
html = dehyphenator(html,'html_cleanup', length)
From 7008e9b64cbe98ca43e77965a84a3f5af4e88f6d Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sun, 9 Jan 2011 21:56:12 +0800
Subject: [PATCH 09/15] ...
---
src/calibre/ebooks/conversion/preprocess.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 32eee713fe..08a46cb8d9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -72,8 +72,6 @@ class DocAnalysis(object):
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
- #raw = raw.replace('\r\n', '\n')
- #raw = raw.replace('\r', '\n')
if format == 'html':
linere = re.compile('(?<=]*>\s*
).*?(?=)', re.DOTALL)
elif format == 'pdf':
From 1670cd29bae7b41186141f902e0057676d985967 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 10:32:19 -0700
Subject: [PATCH 10/15] Cicero by mad
---
resources/recipes/cicero.recipe | 35 +++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
create mode 100644 resources/recipes/cicero.recipe
diff --git a/resources/recipes/cicero.recipe b/resources/recipes/cicero.recipe
new file mode 100644
index 0000000000..2df6b68000
--- /dev/null
+++ b/resources/recipes/cicero.recipe
@@ -0,0 +1,35 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Cicero(BasicNewsRecipe):
+ timefmt = ' [%Y-%m-%d]'
+ title = u'Cicero'
+ __author__ = 'mad@sharktooth.de'
+ description = u'Magazin f\xfcr politische Kultur'
+ oldest_article = 7
+ language = 'de'
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ publisher = 'Ringier Publishing'
+ category = 'news, politics, Germany'
+ encoding = 'iso-8859-1'
+ publication_type = 'magazine'
+ masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
+ feeds = [
+(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
+#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
+#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
+#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
+#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
+#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
+#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
+#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
+#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
+#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
+(u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
+#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
+#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
+]
+
+ def print_version(self, url):
+ return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
From e58ccd8c5e4f4a251c8bf738a621d1a29c6e91da Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 10:55:15 -0700
Subject: [PATCH 11/15] Fix XSS vulnerability in content server. Fixes #7980
(Security vulnerability in Calibre 0.7.34)
---
src/calibre/library/server/browse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 37799c4cbc..3e4687be95 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -756,7 +756,7 @@ class BrowseServer(object):
sort = self.browse_sort_book_list(items, list_sort)
ids = [x[0] for x in items]
html = render_book_list(ids, self.opts.url_prefix,
- suffix=_('in search')+': '+query)
+ suffix=_('in search')+': '+xml(query))
return self.browse_template(sort, category=False, initial_search=query).format(
title=_('Matching books'),
script='booklist();', main=html)
From 31c354a164a8816576ce5194a6b0e1b5d64b6728 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:15:34 -0700
Subject: [PATCH 12/15] ...
---
setup/build_environment.py | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/setup/build_environment.py b/setup/build_environment.py
index 10ab1b0735..bdfddd2205 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -117,7 +117,6 @@ if iswindows:
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
@@ -131,7 +130,6 @@ elif isosx:
fc_lib = '/sw/lib'
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']
@@ -150,9 +148,6 @@ else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
- popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
- if not popplerqt4_inc_dirs:
- popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@@ -187,13 +182,10 @@ if not poppler_inc_dirs or not os.path.exists(
poppler_error = \
('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and',
- ' POPPLER_LIB_DIR environment variables.')
-
-popplerqt4_error = None
-if not popplerqt4_inc_dirs or not os.path.exists(
- os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
- popplerqt4_error = \
- ('Poppler Qt4 bindings not found on your system.')
+ ' POPPLER_LIB_DIR environment variables. calibre requires '
+ ' the poppler XPDF headers. If your distro does not '
+ ' include them you will have to re-compile poppler '
+ ' by hand with --enable-xpdf-headers')
magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
From d63bfeff1158fc9f8ef9f7ba78cd7b39f18c9a98 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:18:35 -0700
Subject: [PATCH 13/15] ...
---
setup/build_environment.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup/build_environment.py b/setup/build_environment.py
index bdfddd2205..f0adaf9584 100644
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@@ -192,7 +192,7 @@ if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')):
magick_error = ('ImageMagick not found on your system. '
'Try setting the environment variables MAGICK_INC '
- 'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+ 'and MAGICK_LIB to help calibre locate the include and library '
'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
From be03e57f2cf8d25b87e888b781ab14cc4ff3b20f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jan 2011 11:44:43 -0700
Subject: [PATCH 14/15] El Correo by desUBIKado
---
resources/recipes/el_correo.recipe | 122 +++++++++++++++++++++++++++++
1 file changed, 122 insertions(+)
create mode 100644 resources/recipes/el_correo.recipe
diff --git a/resources/recipes/el_correo.recipe b/resources/recipes/el_correo.recipe
new file mode 100644
index 0000000000..9190560b02
--- /dev/null
+++ b/resources/recipes/el_correo.recipe
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '08 January 2011, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Daily newspaper from Biscay'
+__version__ = 'v0.08'
+__date__ = '08, January 2011'
+'''
+[url]http://www.elcorreo.com/[/url]
+'''
+
+import time
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class heraldo(BasicNewsRecipe):
+ __author__ = 'desUBIKado'
+ description = 'Daily newspaper from Biscay'
+ title = u'El Correo'
+ publisher = 'Vocento'
+ category = 'News, politics, culture, economy, general interest'
+ oldest_article = 2
+ delay = 1
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'es'
+ timefmt = '[%a, %d %b, %Y]'
+ encoding = 'iso-8859-1'
+ remove_empty_feeds = True
+ remove_javascript = False
+
+ feeds = [
+ (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
+ (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
+ (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
+ (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
+ (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
+ (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
+ (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
+ (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
+ (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
+ (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
+ (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
+ ]
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
+ dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
+ ]
+
+ remove_tags = [
+ dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
+ dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
+ dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
+ dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
+ dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
+ dict(name='div', attrs={'id':['articulopina']}),
+ dict(name='br', attrs={'class':'clear'}),
+ dict(name='form', attrs={'name':'frm_conversor2'})
+ ]
+
+ remove_tags_before = dict(name='div' , attrs={'class':'articulo '})
+ remove_tags_after = dict(name='div' , attrs={'class':'comentarios'})
+
+ def get_cover_url(self):
+ cover = None
+ st = time.localtime()
+ year = str(st.tm_year)
+ month = "%.2d" % st.tm_mon
+ day = "%.2d" % st.tm_mday
+ #[url]http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg[/url]
+ #[url]http://info.elcorreo.com/pdf/06012011-viz.pdf[/url]
+ cover='http://info.elcorreo.com/pdf/'+ day + month + year +'-viz.pdf'
+
+ br = BasicNewsRecipe.get_browser()
+ try:
+ br.open(cover)
+ except:
+ self.log("\nPortada no disponible")
+ cover ='http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
+ return cover
+
+ extra_css = '''
+ h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
+ h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
+ h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
+ h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
+ .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+ img{margin-bottom: 0.4em}
+ '''
+
+
+
+ preprocess_regexps = [
+
+ # To present the image of the embedded video
+ (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '
'),
+ (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '