From 0ad1f3c088f2ff0872de49171fd99a91a50a031a Mon Sep 17 00:00:00 2001
From: ldolse ]*>\s*]*>\s*(?P ]*>\s*]*>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index dcffbe68ca..eaba28e429 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -231,12 +231,12 @@ class RTFInput(InputFormatPlugin):
if self.options.preprocess_html:
print "********* Preprocessing HTML *********\n"
# Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r''+'\g
\n', res)
# Unwrap lines using punctation if the median length of all lines is less than 150
length = line_length('html', res, 0.4)
print "*** Median length is " + str(length) + " ***\n"
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*(
]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*
\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*]*>\s*" % length, re.UNICODE)
if length < 150:
res = unwrap.sub(' ', res)
f.write(res)
From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001
From: ldolse \n ' + match.group(1) + '
- (re.compile(r' '),
+ # (re.compile(r' '),
- # Remove hyphenation
- (re.compile(r'- '),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Connect paragraphs split by -
- (re.compile(u'(?<=[^\s][-–])[\s]*( )*\s*(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
+
]
# Fix Book Designer markup
@@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ if getattr(self.extra_opts, 'preprocess_html', None):
+ if is_pdftohtml:
+ end_rules.append(
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P )', re.IGNORECASE)
+ blankreg = re.compile(r'\s* ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P
'), lambda match: '
tags
(re.compile(r'
'),
# Replace
with
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
+ )
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..35a8a1a9bc 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
if not hasattr(self, 'log'):
from calibre.utils.logging import default_log
self.log = default_log
- self.log("********* Preprocessing HTML *********")
+ self.log("********* Preprocessing HTML - HTML Input plugin *********")
# Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P'+'\g
\n', html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..f7bb0fbfd9 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -11,12 +11,14 @@ import re
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+
class LITInput(InputFormatPlugin):
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = set(['lit'])
+ html_preprocess_sections = 0
def convert(self, stream, options, file_ext, log,
accelerators):
@@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
+
+ def chapter_head(match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ return ''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_link(match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return '
'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return '
\n'+chap+'
'
+
+
+ def no_markup(raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile(''+'\g
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
+ # remove non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty
tags or equivalent, check and + # mark up line endings if required before proceeding + if no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('
\n', html) + + # detect chapters/sections to match xpath or splitting logic # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"
", "\n", html) + # Mark split points based on embedded links + chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P'),
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(r'(?<=[-–])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
+ (re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r'
'),
@@ -303,15 +309,16 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P \n ' + match.group(1) + ' \n ' + match.group(1) + ' '),
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(u'(?<=[-–—])\s* '),
# Clean up spaces
@@ -322,21 +324,29 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+
+ # unwrap hyphenation - moved here so it's executed after header/footer removal
+ if is_pdftohtml:
+ # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
+ # hyphens are for compound words, formatting, etc
+ end_rules.append((re.compile(u'(?<=[-–—])\s* \s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens
+ end_rules.append((re.compile(u'[](\s* )+\s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens with formatting
+ end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s* )+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P
- # (re.compile(r' '),
-
- # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- #(re.compile(u'(?<=[-–—])\s* )', re.IGNORECASE)
+ blankreg = re.compile(r'\s* ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
+ print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P'+'
', html)
- #
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', html, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
-
- # Center separator lines
- (re.compile(u'
\s*(?P
'), lambda match: '
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P
'), lambda match: '
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
From 480eccb0b0c3921fd356d329e6d601b9207c2d26 Mon Sep 17 00:00:00 2001
From: ldolse
).*?(?=
)', re.DOTALL)
lines = linere.findall(raw)
- print "percent is " + str(percent)
lengths = []
for line in lines:
@@ -230,14 +229,17 @@ class HTMLPreProcessor(object):
# (re.compile(r'
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
+ #(re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
+ # unwrap/delete soft hyphens
+ #(re.compile(u'[]\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
- )
-
+ end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head))
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
- print "The pdf line length returned is " + str(length)
+ # print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
(re.compile(r'
'),
- # Replace
with
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
- # unwrap/delete soft hyphens
- #(re.compile(u'[]\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_link(self, match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return '
'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return '
\n'+chap+'
'
+
+ def no_markup(self, raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile('
\s*", "\n
", html) + + # some lit files don't have any
tags or equivalent, check and + # mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('
\n', html)
+
+ # detect chapters/sections to match xpath or splitting logic
+ #
+ # Start with most typical chapter headings
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P'+'
', html)
+
+ return html
\ No newline at end of file
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 35a8a1a9bc..e83216ae1f 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
- if not hasattr(self, 'log'):
- from calibre.utils.logging import default_log
- self.log = default_log
- self.log("********* Preprocessing HTML - HTML Input plugin *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P'+'\g
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
)', re.IGNORECASE) - blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE) - blanklines = blankreg.findall(html) - lines = linereg.findall(html) - if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: - self.log("deleting blank lines") - html = blankreg.sub('', html) - # Arrange line feeds and tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - - # some lit files don't have anytags or equivalent, check and - # mark up line endings if required before proceeding - if no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - add_markup = re.compile('(?)(\n)') - html = add_markup.sub('
\n', html)
-
- # detect chapters/sections to match xpath or splitting logic
- #
- # Mark split points based on embedded links
- chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P )', re.DOTALL)
- elif format == 'pdf':
- linere = re.compile('(?<= \n ' + match.group(1) + '
- # (re.compile(r' '),
+ if isChar(self.rtfData[i], '{'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorStart())
+ i = i + 1
+ continue
- # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(u'(?<=[-–—])\s* '),
- # Clean up spaces
- (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Add space before and after italics
- (re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
-
- ]
+ tokenStart = i
+ i = i + 1
- # Fix Book Designer markup
- BOOK_DESIGNER = [
- # HR
- (re.compile(' \s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s* )+\s*(?=[[a-z\d])'), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68cebb3a11..fb683bdb12 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,16 +29,12 @@ class PreProcessor(object):
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return ' )', re.IGNORECASE)
+ linereg = re.compile('(?<= )', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s* ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s* ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P )', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<= '),
@@ -238,8 +240,7 @@ class HTMLPreProcessor(object):
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
-
+ (re.compile(r'(?=\w)'), lambda match: ' '),
]
# Fix Book Designer markup
@@ -327,10 +328,11 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s* )+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
- # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P \s*(?P \s*(?P )?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index fb683bdb12..abfa43e7ed 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.conversion.preprocess import line_length
from calibre.utils.logging import default_log
-from lxml import etree
class PreProcessor(object):
html_preprocess_sections = 0
+ found_indents = 0
def __init__(self, args):
self.args = args
@@ -22,11 +22,11 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
return ' '
+ else:
+ return ' '+span
+ else:
+ if not span:
+ return ' '
+ else:
+ return ' '+span
+
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
@@ -48,7 +63,7 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
- self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
@@ -56,13 +71,18 @@ class PreProcessor(object):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur' [^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty \s*", "\n ", html)
- # some lit files don't have any tags or equivalent, check and
- # mark up line endings if required before proceeding
+ # some lit files don't have any tags or equivalent (generally just plain text between
+ # ', html)
# detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(' ]*>', re.IGNORECASE)
+ spans_reg = re.compile(']*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+ self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+ html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctation if the median length of all lines is less than 200
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P ', html)
# detect chapters/sections to match xpath or splitting logic
- heading = re.compile(' ]*>', re.IGNORECASE)
spans_reg = re.compile(']*>', re.IGNORECASE)
paras = len(paras_reg.findall(html))
From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001
From: ldolse ]*>\s*]*>\s*(?P ]*>\s*(]*>\s*\s*) ]*>\s*]*>\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(res)
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
'+'
', html)
-
+ preprocessor = PreProcessor(html)
+ html = preprocessor(html)
return html
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.45, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001
From: ldolse '+chap+'
\n'
- else:
- return ''+chap+'
\n'+title+'
\n'
+class tokenData():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return self.data
+ def __repr__(self):
+ return self.data
-def wrap_lines(match):
- ital = match.group('ital')
- if not ital:
- return ' '
- else:
- return ital+' '
+class tokenBinN():
+ def __init__(self, data, separator = ''):
+ self.data = data
+ self.separator = separator
+ def toRTF(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+ def __repr__(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return "\\'" + self.data
+ def __repr__(self):
+ return "\\'" + self.data
+
+class tokenUnicode():
+ def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+ self.data = data
+ self.separator = separator
+ self.current_ucn = current_ucn
+ self.eqList = eqList
+ def toRTF(self):
+ result = '\\u' + repr(self.data) + ' '
+ ucn = self.current_ucn
+ if len(self.eqList) < ucn:
+ ucn = len(self.eqList)
+ result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+ i = 0
+ for eq in self.eqList:
+ if i >= ucn:
+ break
+ result = result + eq.toRTF()
+ return result
+ def __repr__(self):
+ return '\\u' + repr(self.data)
-def line_length(format, raw, percent):
- '''
- raw is the raw text to find the line length to use for wrapping.
- percentage is a decimal number, 0 - 1 which is used to determine
- how far in the list of line lengths to use. The list of line lengths is
- ordered smallest to larged and does not include duplicates. 0.5 is the
- median value.
- '''
- raw = raw.replace(' ', ' ')
- if format == 'html':
- linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
- lines = linere.findall(raw)
- print "percent is " + str(percent)
+def isAsciiLetter(value):
+ return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+def isDigit(value):
+ return (value >= '0') and (value <= '9')
- if not lengths:
- return 0
+def isChar(value, char):
+ return value == char
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
-
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
-
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
-
- index = int(len(lengths) * percent) - 1
-
- return lengths[index]
+def isString(buffer, string):
+ return buffer == string
-class CSSPreProcessor(object):
+class RtfTokenParser():
+ def __init__(self, tokens):
+ self.tokens = tokens
+ self.process()
+ self.processUnicode()
- PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
+ def process(self):
+ i = 0
+ newTokens = []
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenControlSymbol):
+ if isString(self.tokens[i].name, "\\'"):
+ i = i + 1
+ if not isinstance(self.tokens[i], tokenData):
+ raise Exception('Error: token8bitChar without data.')
+ if len(self.tokens[i].data) < 2:
+ raise Exception('Error: token8bitChar without data.')
+ newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+ if len(self.tokens[i].data) > 2:
+ newTokens.append(tokenData(self.tokens[i].data[2:]))
+ i = i + 1
+ continue
- def __call__(self, data, add_namespace=False):
- from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
- data = self.PAGE_PAT.sub('', data)
- if not add_namespace:
- return data
- ans, namespaced = [], False
- for line in data.splitlines():
- ll = line.lstrip()
- if not (namespaced or ll.startswith('@import') or
- ll.startswith('@charset')):
- ans.append(XHTML_CSS_NAMESPACE.strip())
- namespaced = True
- ans.append(line)
+ newTokens.append(self.tokens[i])
+ i = i + 1
- return u'\n'.join(ans)
+ self.tokens = list(newTokens)
-class HTMLPreProcessor(object):
+ def processUnicode(self):
+ i = 0
+ newTokens = []
+ ucNbStack = [1]
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ ucNbStack.pop()
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+ if isString(self.tokens[i].name, '\\uc'):
+ ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isString(self.tokens[i].name, '\\u'):
+ x = i
+ j = 0
+ i = i + 1
+ replace = []
+ partialData = None
+ ucn = ucNbStack[len(ucNbStack) - 1]
+ while (i < len(self.tokens)) and (j < ucn):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ break
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ break
+ if isinstance(self.tokens[i], tokenData):
+ if len(self.tokens[i].data) >= ucn - j:
+ replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+ if len(self.tokens[i].data) > ucn - j:
+ partialData = tokenData(self.tokens[i].data[ucn - j:])
+ i = i + 1
+ break
+ else:
+ replace.append(self.tokens[i])
+ j = j + len(self.tokens[i].data)
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+ replace.append(self.tokens[i])
+ i = i + 1
+ j = j + 1
+ continue
+ raise Exception('Error: incorect utf replacement.')
- PREPROCESS = [
- # Some idiotic HTML generators (Frontpage I'm looking at you)
- # Put all sorts of crap into
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+class RtfTokenizer():
+ def __init__(self, rtfData):
+ self.rtfData = []
+ self.tokens = []
+ self.rtfData = rtfData
+ self.tokenize()
- # Center separator lines
- (re.compile(u'
\s*(?P
'), lambda match: '
tags
- (re.compile(r'
'),
- # Replace
with
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
+ if isChar(self.rtfData[i], '}'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorEnd())
+ i = i + 1
+ continue
- # Remove gray background
- (re.compile(r'
', re.IGNORECASE),
- lambda match : ' '),
- # Create header tags
- (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
- lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
- lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
- (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
- lambda match : '%s
'%(match.group(1),)),
- (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
- lambda match : '%s
'%(match.group(1),)),
- ]
- def __init__(self, input_plugin_preprocess, plugin_preprocess,
- extra_opts=None):
- self.input_plugin_preprocess = input_plugin_preprocess
- self.plugin_preprocess = plugin_preprocess
- self.extra_opts = extra_opts
+ #Control Words
+ if isAsciiLetter(self.rtfData[i]):
+ #consume
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
- )
- if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
- if length:
- print "The pdf line length returned is " + str(length)
- end_rules.append(
- # Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P'+chap+'
\n'+title+'
\n'
- def chapter_link(self, match):
- chap = match.group('sectionlink')
- if not chap:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
- return '
'
- else:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
- return '
\n'+chap+'
'
+ def chapter_break(self, match):
+ chap = match.group('section')
+ styles = match.group('styles')
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+ return '<'+styles+' style="page-break-before:always">'+chap
def no_markup(self, raw, percent):
'''
@@ -74,7 +70,7 @@ class PreProcessor(object):
html = re.sub(r"\s*]*>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -223,14 +224,15 @@ class HTMLPreProcessor(object):
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
- (re.compile(r'
'),
+ (re.compile(r'
'),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
\s*(?P
\s*){1,3}\s*(?P
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
# Have paragraphs show better
(re.compile(r'
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head))
+ end_rules.append((re.compile(r''+chap+'
\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return ''+chap+'
\n'+title+'
\n'
def chapter_break(self, match):
@@ -35,7 +35,22 @@ class PreProcessor(object):
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
return '<'+styles+' style="page-break-before:always">'+chap
-
+
+ def insert_indent(self, match):
+ pstyle = match.group('formatting')
+ span = match.group('span')
+ self.found_indents = self.found_indents + 1
+ if pstyle:
+ if not span:
+ return ' tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?)(\n)')
html = add_markup.sub('
'+'\g
\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*