mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Better pdftohtml processing rules, based on work by ldolse from MobileRead.
This commit is contained in:
parent
fe3d1f5bc7
commit
3fe2c7a2ed
@ -23,6 +23,14 @@ def sanitize_head(match):
|
|||||||
x = _span_pat.sub('', x)
|
x = _span_pat.sub('', x)
|
||||||
return '<head>\n'+x+'\n</head>'
|
return '<head>\n'+x+'\n</head>'
|
||||||
|
|
||||||
|
def chap_head(match):
    """Build the heading markup for a detected chapter.

    Intended as a regular-expression substitution callback.

    match: a match object exposing the named groups 'chap' (the chapter
        marker, e.g. 'Chapter 1') and 'title' (the optional title line;
        None or empty when no title was captured).

    Returns an '<h1>' element holding the chapter marker -- followed by
    '<br/>' and the title when one is present -- and a trailing '<br/>'.
    """
    # Both groups may capture surrounding whitespace; trim it so stray
    # spaces and newlines do not end up inside the heading text.
    chap = match.group('chap').strip()
    title = (match.group('title') or '').strip()
    if not title:
        return '<h1>' + chap + '</h1><br/>'
    return '<h1>' + chap + '<br/>' + title + '</h1><br/>'
|
||||||
|
|
||||||
|
|
||||||
class CSSPreProcessor(object):
|
class CSSPreProcessor(object):
|
||||||
|
|
||||||
@ -54,8 +62,9 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
|
||||||
# Remove page numbers
|
# Remove page numbers
|
||||||
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
|
||||||
# Remove <br> and replace <br><br> with <p>
|
# Replace <br><br> with <p>
|
||||||
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
|
||||||
|
# Remove <br>
|
||||||
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
|
(re.compile(r'(.*)<br.*?>', re.IGNORECASE),
|
||||||
lambda match: match.group() if \
|
lambda match: match.group() if \
|
||||||
re.match('<', match.group(1).lstrip()) or \
|
re.match('<', match.group(1).lstrip()) or \
|
||||||
@ -69,15 +78,22 @@ class HTMLPreProcessor(object):
|
|||||||
# Remove non breaking spaces
|
# Remove non breaking spaces
|
||||||
(re.compile(ur'\u00a0'), lambda match : ' '),
|
(re.compile(ur'\u00a0'), lambda match : ' '),
|
||||||
|
|
||||||
|
# Detect Chapters to match default XPATH in GUI
|
||||||
|
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
|
||||||
|
(re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
|
||||||
|
|
||||||
# Have paragraphs show better
|
# Have paragraphs show better
|
||||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||||
|
|
||||||
# Un wrap lines
|
# Un wrap lines
|
||||||
(re.compile(r'(?<=\w)\s*</(i|b|u)>\s*<p.*?>\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '),
|
(re.compile(r'(?<=[^\.^\^?^!^"^”])\s*(</(i|b|u)>)*\s*<p.*?>\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
|
||||||
(re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '),
|
|
||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
]
|
# Add space before and after italics
|
||||||
|
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
|
||||||
|
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||||
|
]
|
||||||
|
|
||||||
# Fix Book Designer markup
|
# Fix Book Designer markup
|
||||||
BOOK_DESIGNER = [
|
BOOK_DESIGNER = [
|
||||||
|
@ -98,7 +98,7 @@ def get_cover(stream):
|
|||||||
data = cStringIO.StringIO()
|
data = cStringIO.StringIO()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
StreamReadWrapper(stream) as stream:
|
with StreamReadWrapper(stream) as stream:
|
||||||
pdf = PdfFileReader(stream)
|
pdf = PdfFileReader(stream)
|
||||||
output = PdfFileWriter()
|
output = PdfFileWriter()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user