mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix #6744 (Convert from PDF - Some dashes and single-quotes not being unwrapped properly.)
This commit is contained in:
commit
b34457722c
@ -6,7 +6,7 @@ nspm.rs
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
from calibre.ebooks.BeautifulSoup import NavigableString
|
||||||
|
|
||||||
class Nspm(BasicNewsRecipe):
|
class Nspm(BasicNewsRecipe):
|
||||||
title = 'Nova srpska politicka misao'
|
title = 'Nova srpska politicka misao'
|
||||||
|
@ -75,6 +75,8 @@ def line_length(format, raw, percent):
|
|||||||
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
|
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
|
||||||
elif format == 'pdf':
|
elif format == 'pdf':
|
||||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||||
|
elif format == 'spanned_html':
|
||||||
|
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||||
lines = linere.findall(raw)
|
lines = linere.findall(raw)
|
||||||
|
|
||||||
lengths = []
|
lengths = []
|
||||||
@ -224,30 +226,29 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
|
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
|
||||||
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
|
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
|
||||||
|
|
||||||
|
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||||
|
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||||
|
|
||||||
|
# Center separator lines
|
||||||
|
(re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||||
|
|
||||||
# Remove page links
|
# Remove page links
|
||||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||||
# Remove <hr> tags
|
# Remove <hr> tags
|
||||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
|
||||||
# Replace <br><br> with <p>
|
|
||||||
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
|
||||||
|
|
||||||
# Remove hyphenation
|
|
||||||
(re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
|
|
||||||
|
|
||||||
# Remove gray background
|
# Remove gray background
|
||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
# Cover the case where every letter in a chapter title is separated by a space
|
||||||
|
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
|
||||||
|
|
||||||
# Have paragraphs show better
|
# Have paragraphs show better
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
# Connect paragraphs split by -
|
|
||||||
(re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
|
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
||||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||||
@ -328,12 +329,29 @@ class HTMLPreProcessor(object):
|
|||||||
print 'Failed to parse remove_footer regexp'
|
print 'Failed to parse remove_footer regexp'
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# unwrap hyphenation - moved here so it's executed after header/footer removal
|
||||||
|
if is_pdftohtml:
|
||||||
|
# unwrap visible dashes and hyphens - don't delete them, as they are often hyphens
|
||||||
|
# for compound words, formatting, etc.
|
||||||
|
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
# unwrap/delete soft hyphens
|
||||||
|
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
# unwrap/delete soft hyphens with formatting
|
||||||
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
|
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
||||||
|
# reduce false positives and move after header/footer removal
|
||||||
|
if getattr(self.extra_opts, 'preprocess_html', None):
|
||||||
|
if is_pdftohtml:
|
||||||
|
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
if length:
|
if length:
|
||||||
|
# print "The pdf line length returned is " + str(length)
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Un wrap using punctuation
|
# Un wrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
|
173
src/calibre/ebooks/conversion/utils.py
Normal file
173
src/calibre/ebooks/conversion/utils.py
Normal file
@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
from calibre.ebooks.conversion.preprocess import line_length
|
||||||
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
|
class PreProcessor(object):
    """Heuristic clean-up pass over HTML markup.

    Calling an instance with an HTML string converts non-breaking-space
    indents to CSS, drops empty paragraphs, marks up chapter headings,
    unwraps hard-wrapped lines and returns the modified string.

    Attributes:
        log: callable taking one message string.
        html_preprocess_sections: number of headings found or created so far.
        found_indents: number of nbsp indents converted so far.
    """

    def __init__(self, log=None):
        """Store the logger (``default_log`` when *log* is None) and zero the counters."""
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0

    def chapter_head(self, match):
        """Substitution callback that wraps a detected chapter line in ``<h2>``.

        *match* must provide the named groups ``chap`` and ``title``. When
        ``title`` matched, it is emitted as an ``<h3>`` after the ``<h2>``.
        Increments ``html_preprocess_sections`` and returns the replacement
        markup.
        """
        chap = match.group('chap')
        title = match.group('title')
        self.html_preprocess_sections += 1
        # %-formatting instead of str(): str() on non-ASCII unicode text
        # raises UnicodeEncodeError under Python 2.
        if not title:
            self.log('found %s chapters. - %s'
                     % (self.html_preprocess_sections, chap))
            return '<h2>' + chap + '</h2>\n'
        self.log('found %s chapters & titles. - %s, %s'
                 % (self.html_preprocess_sections, chap, title))
        return '<h2>' + chap + '</h2>\n<h3>' + title + '</h3>\n'

    def chapter_break(self, match):
        """Substitution callback that adds a page break before a short line.

        *match* must provide the named groups ``styles`` (the opening tag's
        name and attributes) and ``section`` (everything after the opening
        tag). Increments ``html_preprocess_sections`` and returns the opening
        tag rebuilt with ``page-break-before:always`` followed by the section.
        """
        section = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections += 1
        # %-formatting instead of str(): see chapter_head.
        self.log('marked %s section markers based on punctuation. - %s'
                 % (self.html_preprocess_sections, section))
        return '<' + styles + ' style="page-break-before:always">' + section

    def insert_indent(self, match):
        """Substitution callback that replaces an nbsp indent with ``text-indent``.

        *match* must provide the named groups ``formatting`` (attributes of
        the ``<p>`` tag, possibly empty) and ``span`` (any opening ``<span>``
        tags that followed it, possibly unmatched). Increments
        ``found_indents`` and returns the new opening ``<p>`` tag, followed by
        the span tags when present.
        """
        pstyle = match.group('formatting')
        span = match.group('span')
        self.found_indents += 1
        if pstyle:
            opening = '<p ' + pstyle + ' style="text-indent:3%">'
        else:
            opening = '<p style="text-indent:3%">'
        return opening + span if span else opening

    def no_markup(self, raw, percent):
        """Report whether *raw* has too few ``</p>`` tags for its line count.

        *percent* is clamped to the range 0..1. Returns True when the number
        of line endings multiplied by *percent* exceeds the number of
        ``</p>`` tags in *raw*, otherwise False.
        """
        tot_htm_ends = len(re.findall('</p>', raw))
        # '\r\n' must be tried first; with '\n|\r|\r\n' every CRLF was
        # counted as two line endings.
        tot_ln_fds = len(re.findall('\r\n|\n|\r', raw))
        self.log("There are " + str(tot_ln_fds) + " total Line feeds, and "
                 + str(tot_htm_ends) + " marked up endings")

        if percent > 1:
            percent = 1
        if percent < 0:
            percent = 0

        min_lns = tot_ln_fds * percent
        self.log("There must be fewer than " + str(min_lns)
                 + " unmarked lines to add markup")
        # Explicit bool; the original fell off the end and returned None.
        return min_lns > tot_htm_ends

    def __call__(self, html):
        """Run the whole clean-up pipeline on *html* and return the result.

        The steps are order dependent: indent and blank-line clean-up,
        paragraph mark-up of plain text, chapter detection (three
        progressively looser patterns, each tried only while fewer than 10
        headings are known), hyphen and line unwrapping, punctuation-based
        split points, and finally demotion of a heading that directly follows
        another heading.
        """
        self.log("********* Preprocessing HTML *********")
        nbsp = u'\u00a0'
        soft_hyphen = u'\xad'

        # Replace series of non-breaking spaces with text-indent
        txtindent = re.compile(r'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(' + nbsp + r'){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
            self.log("replaced " + str(self.found_indents) + " nbsp indents with inline styles")
        # remove remaining non-breaking spaces
        html = re.sub(nbsp, ' ', html)
        # Get rid of empty <o:p> tags to simplify other processing
        html = re.sub(r'\s*<o:p>\s*</o:p>', ' ', html)
        # Get rid of empty span tags
        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)

        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE | re.DOTALL)
        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
        blanklines = blankreg.findall(html)
        lines = linereg.findall(html)
        if len(lines) > 1:
            blank_ratio = float(len(blanklines)) / float(len(lines))
            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(blank_ratio) + " percent blank")
            if blank_ratio > 0.40:
                self.log("deleting blank lines")
                html = blankreg.sub('', html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = re.sub(r"\s*</p>", "</p>\n", html)
        html = re.sub(r"\s*<p>\s*", "\n<p>", html)

        # some lit files don't have any <p> tags or equivalent (generally just plain text between
        # <pre> tags), check and mark up line endings if required before proceeding
        if self.no_markup(html, 0.1):
            self.log("not enough paragraph markers, adding now")
            add_markup = re.compile('(?<!>)(\n)')
            html = add_markup.sub('</p>\n<p>', html)

        # detect chapters/sections to match xpath or splitting logic
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")

        # Start with most typical chapter headings, get more aggressive until one works
        if self.html_preprocess_sections < 10:
            # Two bare 's*' in the original pattern were typos for '\s*'.
            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
            html = chapdetect.sub(self.chapter_head, html)
        if self.html_preprocess_sections < 10:
            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)

        if self.html_preprocess_sections < 10:
            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
            html = chapdetect2.sub(self.chapter_head, html)

        # Unwrap lines
        self.log("Unwrapping Lines")
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
        # that lines can be un-wrapped across page boundaries
        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
        paras = len(paras_reg.findall(html))
        spans = len(spans_reg.findall(html))
        if spans > 1 and float(paras) / float(spans) < 0.75:
            fmt = 'spanned_html'
        else:
            fmt = 'html'

        # Calculate Length
        length = line_length(fmt, html, 0.4)
        self.log("*** Median line length is " + str(length) + ",calculated with " + fmt + " format ***")

        # Unwrap and/or delete soft-hyphens, hyphens
        # The leading soft hyphen (U+00AD) is spelled as an escape so the
        # invisible character cannot get lost; without it this substitution
        # would delete every run of whitespace in the document.
        html = re.sub(soft_hyphen + r'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
        # Hyphen, en dash (U+2013) or em dash (U+2014) at a line end; '[' is
        # escaped inside the final class to avoid the nested-set warning.
        html = re.sub(u'(?<=[-\u2013\u2014])' + r'\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[\[a-z\d])', '', html)

        # Unwrap lines using punctuation; applied whatever the median length is.
        # The character class originally read '[a-z,;:\IA]'; '\I' only ever
        # meant a literal 'I' and is rejected as a bad escape by Python 3.7+.
        unwrap = re.compile(r"(?<=.{%i}[a-z,;:IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
        html = unwrap.sub(' ', html)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < 10:
            self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
            # NOTE(review): '#-*' inside the class below is parsed as the range
            # '#' through '*', which is presumably unintended - confirm before
            # changing, since it alters what gets matched.
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

        # search for places where a first or second level heading is immediately followed by another
        # top level heading. demote the second heading to h3 to prevent splitting between chapter
        # headings and titles, images, etc
        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub(r'\g<firsthead><h3\g<secondhead></h3>', html)

        return html
|
@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
|
|||||||
from calibre import unicode_path
|
from calibre import unicode_path
|
||||||
from calibre.utils.localization import get_lang
|
from calibre.utils.localization import get_lang
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
from calibre.ebooks.conversion.preprocess import line_length
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
|
||||||
class Link(object):
|
class Link(object):
|
||||||
'''
|
'''
|
||||||
@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
return (None, raw)
|
return (None, raw)
|
||||||
|
|
||||||
def preprocess_html(self, html):
|
def preprocess_html(self, html):
|
||||||
if not hasattr(self, 'log'):
|
preprocessor = PreProcessor(log=getattr(self, 'log', None))
|
||||||
from calibre.utils.logging import default_log
|
return preprocessor(html)
|
||||||
self.log = default_log
|
|
||||||
self.log("********* Preprocessing HTML *********")
|
|
||||||
# Detect Chapters to match the xpath in the GUI
|
|
||||||
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
|
||||||
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
|
||||||
# Unwrap lines using punctation if the median length of all lines is less than 150
|
|
||||||
#
|
|
||||||
# Insert extra line feeds so the line length regex functions properly
|
|
||||||
html = re.sub(r"</p>", "</p>\n", html)
|
|
||||||
length = line_length('html', html, 0.4)
|
|
||||||
self.log.debug("*** Median length is " + str(length) + " ***")
|
|
||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
|
||||||
if length < 150:
|
|
||||||
html = unwrap.sub(' ', html)
|
|
||||||
return html
|
|
||||||
|
@ -6,10 +6,9 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.preprocess import line_length
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
|
||||||
|
|
||||||
class LITInput(InputFormatPlugin):
|
class LITInput(InputFormatPlugin):
|
||||||
|
|
||||||
@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, html):
|
def preprocess_html(self, html):
|
||||||
self.log("********* Preprocessing HTML *********")
|
preprocessor = PreProcessor(log=getattr(self, 'log', None))
|
||||||
# Detect Chapters to match the xpath in the GUI
|
return preprocessor(html)
|
||||||
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
|
||||||
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
|
||||||
# Unwrap lines using punctation if the median length of all lines is less than 150
|
|
||||||
#
|
|
||||||
# Insert extra line feeds so the line length regex functions properly
|
|
||||||
html = re.sub(r"</p>", "</p>\n", html)
|
|
||||||
length = line_length('html', html, 0.4)
|
|
||||||
self.log("*** Median length is " + str(length) + " ***")
|
|
||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
|
||||||
if length < 150:
|
|
||||||
html = unwrap.sub(' ', html)
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
|
||||||
class MOBIInput(InputFormatPlugin):
|
class MOBIInput(InputFormatPlugin):
|
||||||
@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
|
|||||||
include_meta_content_type=False))
|
include_meta_content_type=False))
|
||||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||||
return mr.created_opf_path
|
return mr.created_opf_path
|
||||||
|
|
||||||
|
def preprocess_html(self, html):
    """Demote a heading that directly follows another top-level heading.

    Wherever an ``<h1>``/``<h2>`` element is followed (with only whitespace
    and non-heading tags in between) by another ``<h1>``/``<h2>``, the second
    one is rewritten as ``<h3>``. This prevents splitting between chapter
    headings and titles, images, etc.

    Returns the modified markup; input without such a pair is returned
    unchanged.
    """
    doubleheading = re.compile(
        r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)'
        r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>',
        re.IGNORECASE)
    # Raw template: '\g' in a plain string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return doubleheading.sub(r'\g<firsthead><h3\g<secondhead></h3>', html)
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ class Reader(FormatReader):
|
|||||||
self.options = options
|
self.options = options
|
||||||
setattr(self.options, 'new_pdf_engine', False)
|
setattr(self.options, 'new_pdf_engine', False)
|
||||||
setattr(self.options, 'no_images', False)
|
setattr(self.options, 'no_images', False)
|
||||||
setattr(self.options, 'unwrap_factor', 0.5)
|
setattr(self.options, 'unwrap_factor', 0.45)
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
self.log.info('Extracting PDF...')
|
self.log.info('Extracting PDF...')
|
||||||
|
@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
|
|||||||
options = set([
|
options = set([
|
||||||
OptionRecommendation(name='no_images', recommended_value=False,
|
OptionRecommendation(name='no_images', recommended_value=False,
|
||||||
help=_('Do not extract images from the document')),
|
help=_('Do not extract images from the document')),
|
||||||
OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
|
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
|
||||||
help=_('Scale used to determine the length at which a line should '
|
help=_('Scale used to determine the length at which a line should '
|
||||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||||
'default is 0.5, this is the median line length.')),
|
'default is 0.45, just below the median line length.')),
|
||||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
||||||
help=_('Use the new PDF conversion engine.'))
|
help=_('Use the new PDF conversion engine.'))
|
||||||
])
|
])
|
||||||
|
@ -7,7 +7,7 @@ import os, glob, re, textwrap
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.preprocess import line_length
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
|
|
||||||
class InlineClass(etree.XSLTExtension):
|
class InlineClass(etree.XSLTExtension):
|
||||||
|
|
||||||
@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin):
|
|||||||
res = transform.tostring(result)
|
res = transform.tostring(result)
|
||||||
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||||
if self.options.preprocess_html:
|
if self.options.preprocess_html:
|
||||||
self.log("********* Preprocessing HTML *********")
|
preprocessor = PreProcessor(log=getattr(self, 'log', None))
|
||||||
# Detect Chapters to match the xpath in the GUI
|
res = preprocessor(res)
|
||||||
chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
|
|
||||||
res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
|
|
||||||
# Unwrap lines using punctation if the median length of all lines is less than 150
|
|
||||||
length = line_length('html', res, 0.4)
|
|
||||||
self.log("*** Median length is " + str(length) + " ***")
|
|
||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
|
|
||||||
if length < 150:
|
|
||||||
res = unwrap.sub(' ', res)
|
|
||||||
f.write(res)
|
f.write(res)
|
||||||
self.write_inline_css(inline_class)
|
self.write_inline_css(inline_class)
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
@ -46,7 +46,7 @@
|
|||||||
<double>0.010000000000000</double>
|
<double>0.010000000000000</double>
|
||||||
</property>
|
</property>
|
||||||
<property name="value">
|
<property name="value">
|
||||||
<double>0.500000000000000</double>
|
<double>0.450000000000000</double>
|
||||||
</property>
|
</property>
|
||||||
</widget>
|
</widget>
|
||||||
</item>
|
</item>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user