ereader output work

This commit is contained in:
John Schember 2009-04-25 15:50:52 -04:00
parent 19c8343d6a
commit 6ee829ff79
4 changed files with 15 additions and 10 deletions

View File

@ -27,9 +27,9 @@ def chap_head(match):
chap = match.group('chap')
title = match.group('title')
if not title:
return '<h1>'+chap+'</h1><br/>'
return '<h1>'+chap+'</h1><br/>\n'
else:
return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
def wrap_lines(match):
ital = match.group('ital')
@ -121,7 +121,7 @@ class HTMLPreProcessor(object):
# Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
]
@ -161,7 +161,7 @@ class HTMLPreProcessor(object):
elif self.is_pdftohtml(html):
line_length_rules = [
# Unwrap lines using punctuation
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines),
(re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
]
rules = self.PDFTOHTML + line_length_rules

View File

@ -50,6 +50,7 @@ PML_HTML_RULES = [
# eReader files are one paragraph per line.
# This forces the lines to wrap properly.
(re.compile('^(?P<text>.+)$', re.MULTILINE), lambda match: '<p>%s</p>' % match.group('text')),
(re.compile('<p>[ ]*</p>'), lambda match: ''),
# Remove unmatched PML codes.
(re.compile(r'(?<=[^\\])\\[pxcriouvtblBk]'), lambda match: ''),
@ -82,7 +83,7 @@ HTML_PML_RULES = [
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
(re.compile('<img.*?src="(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))),
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),
@ -93,6 +94,8 @@ HTML_PML_RULES = [
(re.compile('<sup>(?P<text>.+?)</sup>', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')),
(re.compile('<b .*?>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<b>(?P<text>.+?)</b>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong .*?>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<strong>(?P<text>.+?)</strong>', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')),
(re.compile('<big .*?>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<big>(?P<text>.+?)</big>', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')),
(re.compile('<hr.*?width="(?P<val>\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')),
@ -108,8 +111,8 @@ HTML_PML_RULES = [
(re.compile('<div.*?style.*?text-align: center;.*?".*?>(?P<text>.+?)</div>', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')),
(re.compile('<h(?P<val>[0-4]).*?>(?P<text>.+?)</h[0-4]>', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)),
(re.compile('<h1.*?>(?P<text>.+?)</h1>', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')),
(re.compile('<br .*?>'), lambda match: '\\p'),
(re.compile('<br/*>'), lambda match: '\\p'),
(re.compile('<br .*?>'), lambda match: '\n'),
(re.compile('<br/*>'), lambda match: '\n'),
# Remove remaining HTML tags
(re.compile('<.*?>'), lambda match: ''),
@ -119,6 +122,8 @@ HTML_PML_RULES = [
# Remove whitespace on empty lines
(re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''),
# Remove excess whitespace in lines
(re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '),
# Remove excess newlines at the beginning and end
(re.compile('^(\r\n){1,}'), lambda match: ''),

View File

@ -154,7 +154,7 @@ class Reader(FormatReader):
for i in images:
manifest.append((os.path.join('images/', i), None))
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
with open('metadata.opf', 'wb') as opffile:

View File

@ -39,7 +39,7 @@ class Writer(object):
pml_pages = []
for page in pages:
pml_pages.append(zlib.compress(html_to_pml(unicode(page))))
pml_pages.append(zlib.compress(html_to_pml(unicode(page)).encode('utf-8')))
return pml_pages
@ -67,7 +67,7 @@ class Writer(object):
image_items = the number of images
'''
version = 10
non_text_offset = text_items
non_text_offset = text_items + 1
if image_items > 0:
image_data_offset = text_items + 1