mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Preprocessing Updates
This commit is contained in:
parent
132df9b6c8
commit
5c951fb962
@ -62,6 +62,7 @@ def wrap_lines(match):
|
|||||||
else:
|
else:
|
||||||
return ital+' '
|
return ital+' '
|
||||||
|
|
||||||
|
|
||||||
def line_length(format, raw, percent):
|
def line_length(format, raw, percent):
|
||||||
'''
|
'''
|
||||||
raw is the raw text to find the line length to use for wrapping.
|
raw is the raw text to find the line length to use for wrapping.
|
||||||
@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
|
(re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
|
||||||
(re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
|
(re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
|
||||||
|
|
||||||
|
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||||
|
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||||
|
|
||||||
|
# Center separator lines
|
||||||
|
(re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||||
|
|
||||||
# Remove page links
|
# Remove page links
|
||||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||||
# Remove <hr> tags
|
# Remove <hr> tags
|
||||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
||||||
# Replace <br><br> with <p>
|
# Replace <br><br> with <p>
|
||||||
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
# (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
|
||||||
|
|
||||||
# Remove hyphenation
|
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
|
||||||
(re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
|
(re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
|
||||||
|
|
||||||
# Remove gray background
|
# Remove gray background
|
||||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||||
|
|
||||||
# Detect Chapters to match default XPATH in GUI
|
# Detect Chapters to match default XPATH in GUI
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
|
||||||
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
|
||||||
|
|
||||||
# Have paragraphs show better
|
# Have paragraphs show better
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
# Connect paragraphs split by -
|
|
||||||
(re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
|
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
||||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Fix Book Designer markup
|
# Fix Book Designer markup
|
||||||
@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
|
|||||||
import traceback
|
import traceback
|
||||||
print 'Failed to parse remove_footer regexp'
|
print 'Failed to parse remove_footer regexp'
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
|
||||||
|
if getattr(self.extra_opts, 'preprocess_html', None):
|
||||||
|
if is_pdftohtml:
|
||||||
|
end_rules.append(
|
||||||
|
(re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
||||||
|
)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
|
@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
|
|||||||
if not hasattr(self, 'log'):
|
if not hasattr(self, 'log'):
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
self.log = default_log
|
self.log = default_log
|
||||||
self.log("********* Preprocessing HTML *********")
|
self.log("********* Preprocessing HTML - HTML Input plugin *********")
|
||||||
# Detect Chapters to match the xpath in the GUI
|
# Detect Chapters to match the xpath in the GUI
|
||||||
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
||||||
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
||||||
|
@ -11,12 +11,14 @@ import re
|
|||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.conversion.preprocess import line_length
|
from calibre.ebooks.conversion.preprocess import line_length
|
||||||
|
|
||||||
|
|
||||||
class LITInput(InputFormatPlugin):
|
class LITInput(InputFormatPlugin):
|
||||||
|
|
||||||
name = 'LIT Input'
|
name = 'LIT Input'
|
||||||
author = 'Marshall T. Vandegrift'
|
author = 'Marshall T. Vandegrift'
|
||||||
description = 'Convert LIT files to HTML'
|
description = 'Convert LIT files to HTML'
|
||||||
file_types = set(['lit'])
|
file_types = set(['lit'])
|
||||||
|
html_preprocess_sections = 0
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_html(self, html):
|
def preprocess_html(self, html):
|
||||||
|
|
||||||
|
def chapter_head(match):
|
||||||
|
chap = match.group('chap')
|
||||||
|
title = match.group('title')
|
||||||
|
if not title:
|
||||||
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
|
self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
|
||||||
|
return '<h2>'+chap+'</h2>\n'
|
||||||
|
else:
|
||||||
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
|
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
|
||||||
|
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
||||||
|
|
||||||
|
def chapter_link(match):
|
||||||
|
chap = match.group('sectionlink')
|
||||||
|
if not chap:
|
||||||
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
|
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
|
||||||
|
return '<br style="page-break-before:always">'
|
||||||
|
else:
|
||||||
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
|
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
|
||||||
|
return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
|
||||||
|
|
||||||
|
|
||||||
|
def no_markup(raw, percent):
|
||||||
|
'''
|
||||||
|
Detects total marked up line endings in the file. raw is the text to
|
||||||
|
inspect. Percent is the minimum percent of line endings which should
|
||||||
|
be marked up to return true.
|
||||||
|
'''
|
||||||
|
htm_end_ere = re.compile('</p>', re.DOTALL)
|
||||||
|
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
|
||||||
|
htm_end = htm_end_ere.findall(raw)
|
||||||
|
line_end = line_end_ere.findall(raw)
|
||||||
|
tot_htm_ends = len(htm_end)
|
||||||
|
tot_ln_fds = len(line_end)
|
||||||
|
self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
|
||||||
|
|
||||||
|
if percent > 1:
|
||||||
|
percent = 1
|
||||||
|
if percent < 0:
|
||||||
|
percent = 0
|
||||||
|
|
||||||
|
min_lns = tot_ln_fds * percent
|
||||||
|
self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
|
||||||
|
if min_lns > tot_htm_ends:
|
||||||
|
return True
|
||||||
|
|
||||||
self.log("********* Preprocessing HTML *********")
|
self.log("********* Preprocessing HTML *********")
|
||||||
# Detect Chapters to match the xpath in the GUI
|
# remove non-breaking spaces
|
||||||
chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
|
html = re.sub(ur'\u00a0', ' ', html)
|
||||||
html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
|
# Get rid of empty <o:p> tags to simplify other processing
|
||||||
# Unwrap lines using punctuation if the median length of all lines is less than 150
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
|
# Get rid of empty span tags
|
||||||
|
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
|
||||||
|
|
||||||
|
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
||||||
|
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
|
||||||
|
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
||||||
|
blanklines = blankreg.findall(html)
|
||||||
|
lines = linereg.findall(html)
|
||||||
|
if len(lines) > 1:
|
||||||
|
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
|
||||||
|
if float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
|
self.log("deleting blank lines")
|
||||||
|
html = blankreg.sub('', html)
|
||||||
|
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||||
|
html = re.sub(r"\s*</p>", "</p>\n", html)
|
||||||
|
|
||||||
|
# some lit files don't have any <p> tags or equivalent, check and
|
||||||
|
# mark up line endings if required before proceeding
|
||||||
|
if no_markup(html, 0.1):
|
||||||
|
self.log("not enough paragraph markers, adding now")
|
||||||
|
add_markup = re.compile('(?<!>)(\n)')
|
||||||
|
html = add_markup.sub('</p>\n<p>', html)
|
||||||
|
|
||||||
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
#
|
#
|
||||||
# Insert extra line feeds so the line length regex functions properly
|
# Mark split points based on embedded links
|
||||||
html = re.sub(r"</p>", "</p>\n", html)
|
chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
|
||||||
|
html = chaplink.sub(chapter_link, html)
|
||||||
|
# Continue with alternate patterns, start with most typical chapter headings
|
||||||
|
if self.html_preprocess_sections < 10:
|
||||||
|
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
|
||||||
|
html = chapdetect.sub(chapter_head, html)
|
||||||
|
if self.html_preprocess_sections < 10:
|
||||||
|
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
|
||||||
|
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||||
|
html = chapdetect2.sub(chapter_head, html)
|
||||||
|
|
||||||
|
# search for places where a first or second level heading is immediately followed by another
|
||||||
|
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||||
|
# headings and titles, images, etc
|
||||||
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
|
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
#
|
||||||
|
# Unwrap lines using punctuation if the median length of all lines is less than 150
|
||||||
length = line_length('html', html, 0.4)
|
length = line_length('html', html, 0.4)
|
||||||
self.log("*** Median length is " + str(length) + " ***")
|
self.log("*** Median length is " + str(length) + " ***")
|
||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||||
|
@ -3,6 +3,7 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
from calibre.customize.conversion import InputFormatPlugin
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
|
|
||||||
class MOBIInput(InputFormatPlugin):
|
class MOBIInput(InputFormatPlugin):
|
||||||
@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
|
|||||||
include_meta_content_type=False))
|
include_meta_content_type=False))
|
||||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||||
return mr.created_opf_path
|
return mr.created_opf_path
|
||||||
|
|
||||||
|
def preprocess_html(self, html):
|
||||||
|
# search for places where a first or second level heading is immediately followed by another
|
||||||
|
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||||
|
# headings and titles, images, etc
|
||||||
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
|
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
return html
|
||||||
|
|
||||||
|
@ -408,6 +408,10 @@ class Page(object):
|
|||||||
# Fraction of text height that two strings' bottoms can differ by
|
# Fraction of text height that two strings' bottoms can differ by
|
||||||
# for them to be considered to be part of the same text fragment
|
# for them to be considered to be part of the same text fragment
|
||||||
LINE_FACTOR = 0.4
|
LINE_FACTOR = 0.4
|
||||||
|
|
||||||
|
# Percentage of the page height which should be considered header
|
||||||
|
# or footer to be discarded from reflow considerations
|
||||||
|
HEAD_FOOTER_MARGIN
|
||||||
|
|
||||||
# Multiplies the average line height when determining row height
|
# Multiplies the average line height when determining row height
|
||||||
# of a particular element to detect columns.
|
# of a particular element to detect columns.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user