mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ereader output work
This commit is contained in:
parent
19c8343d6a
commit
6ee829ff79
@ -27,9 +27,9 @@ def chap_head(match):
|
|||||||
chap = match.group('chap')
|
chap = match.group('chap')
|
||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
if not title:
|
if not title:
|
||||||
return '<h1>'+chap+'</h1><br/>'
|
return '<h1>'+chap+'</h1><br/>\n'
|
||||||
else:
|
else:
|
||||||
return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
|
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
|
||||||
|
|
||||||
def wrap_lines(match):
|
def wrap_lines(match):
|
||||||
ital = match.group('ital')
|
ital = match.group('ital')
|
||||||
@ -121,7 +121,7 @@ class HTMLPreProcessor(object):
|
|||||||
# Clean up spaces
|
# Clean up spaces
|
||||||
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||||
# Add space before and after italics
|
# Add space before and after italics
|
||||||
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
|
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
|
||||||
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -161,7 +161,7 @@ class HTMLPreProcessor(object):
|
|||||||
elif self.is_pdftohtml(html):
|
elif self.is_pdftohtml(html):
|
||||||
line_length_rules = [
|
line_length_rules = [
|
||||||
# Unwrap using punctuation
|
# Unwrap using punctuation
|
||||||
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines),
|
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
|
||||||
]
|
]
|
||||||
|
|
||||||
rules = self.PDFTOHTML + line_length_rules
|
rules = self.PDFTOHTML + line_length_rules
|
||||||
|
@ -50,6 +50,7 @@ PML_HTML_RULES = [
|
|||||||
# eReader files are one paragraph per line.
|
# eReader files are one paragraph per line.
|
||||||
# This forces the lines to wrap properly.
|
# This forces the lines to wrap properly.
|
||||||
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
|
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
|
||||||
|
(re.compile('<p>[ ]*</p>'), lambda match: ''),
|
||||||
|
|
||||||
# Remove unmatched pml codes.
|
# Remove unmatched pml codes.
|
||||||
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
|
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
|
||||||
@ -82,7 +83,7 @@ HTML_PML_RULES = [
|
|||||||
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
|
||||||
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
|
||||||
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
|
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
|
||||||
(re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
|
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
|
||||||
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
|
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
|
||||||
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
|
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
|
||||||
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
|
||||||
@ -93,6 +94,8 @@ HTML_PML_RULES = [
|
|||||||
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
|
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
|
||||||
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||||
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||||
|
(re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||||
|
(re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
|
||||||
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
||||||
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
|
||||||
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
|
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
|
||||||
@ -108,8 +111,8 @@ HTML_PML_RULES = [
|
|||||||
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
|
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
|
||||||
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
|
||||||
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
|
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
|
||||||
(re.compile('<br .*?>'), lambda match: '\\p'),
|
(re.compile('<br .*?>'), lambda match: '\n'),
|
||||||
(re.compile('<br/*>'), lambda match: '\\p'),
|
(re.compile('<br/*>'), lambda match: '\n'),
|
||||||
|
|
||||||
# Remove remaining HTML tags
|
# Remove remaining HTML tags
|
||||||
(re.compile('<.*?>'), lambda match: ''),
|
(re.compile('<.*?>'), lambda match: ''),
|
||||||
@ -119,6 +122,8 @@ HTML_PML_RULES = [
|
|||||||
|
|
||||||
# Remove whitespace on empty lines
|
# Remove whitespace on empty lines
|
||||||
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
|
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
|
||||||
|
# Remove excess whitespace in lines
|
||||||
|
(re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
|
||||||
|
|
||||||
# Remove excess newlines at the beginning and end
|
# Remove excess newlines at the beginning and end
|
||||||
(re.compile('^(\r\n){1,}'), lambda match: ''),
|
(re.compile('^(\r\n){1,}'), lambda match: ''),
|
||||||
|
@ -154,7 +154,7 @@ class Reader(FormatReader):
|
|||||||
|
|
||||||
for i in images:
|
for i in images:
|
||||||
manifest.append((os.path.join('images/', i), None))
|
manifest.append((os.path.join('images/', i), None))
|
||||||
|
|
||||||
opf.create_manifest(manifest)
|
opf.create_manifest(manifest)
|
||||||
opf.create_spine(['index.html'])
|
opf.create_spine(['index.html'])
|
||||||
with open('metadata.opf', 'wb') as opffile:
|
with open('metadata.opf', 'wb') as opffile:
|
||||||
|
@ -39,7 +39,7 @@ class Writer(object):
|
|||||||
pml_pages = []
|
pml_pages = []
|
||||||
|
|
||||||
for page in pages:
|
for page in pages:
|
||||||
pml_pages.append(zlib.compress(html_to_pml(unicode(page))))
|
pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
|
||||||
|
|
||||||
return pml_pages
|
return pml_pages
|
||||||
|
|
||||||
@ -67,7 +67,7 @@ class Writer(object):
|
|||||||
image_items = the number of images
|
image_items = the number of images
|
||||||
'''
|
'''
|
||||||
version = 10
|
version = 10
|
||||||
non_text_offset = text_items
|
non_text_offset = text_items + 1
|
||||||
|
|
||||||
if image_items > 0:
|
if image_items > 0:
|
||||||
image_data_offset = text_items + 1
|
image_data_offset = text_items + 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user