eReader writer working

This commit is contained in:
John Schember 2009-04-25 16:57:29 -04:00
parent 6ee829ff79
commit e7ec12575d
2 changed files with 7 additions and 7 deletions

View File

@ -39,7 +39,7 @@ PML_HTML_RULES = [
(re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')), (re.compile(r'\\k(?P<text>.+?)\\k', re.DOTALL), lambda match: '<small>%s</small>' % match.group('text')),
(re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')), (re.compile(r'\\a(?P<num>\d\d\d)'), lambda match: '&#%s;' % match.group('num')),
(re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
(re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % match.group('name')), (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
(re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))), (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
(re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')), (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
(re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\-'), lambda match: ''),
@ -83,7 +83,7 @@ HTML_PML_RULES = [
(re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')), (re.compile('<div.*?id="(?P<target>.+?).*?"></div>'), lambda match: '\\\\Q="%s"' % match.group('target')),
(re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), (re.compile('<a.*?href="(?P<target>#.+?).*?">(?P<text>)</a>', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))),
#(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')), #(re.compile('<img.*?src="images/(?P<name>.+?)".*?>'), lambda match: '\\m="%s"' % match.group('name')),
(re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name'))), (re.compile('<img.*?src="(?P<name>.+?)".*?>(.*?</img>)*'), lambda match: '\\m="%s"' % image_name(match.group('name').strip('\x00'))),
#(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))), #(re.compile('&#(?P<num>\d\d\d\d);'), lambda match: '\\U%s' % int(match.group('num'))),
(re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')), (re.compile('&#(?P<num>\d\d\d);'), lambda match: '\\a%s' % match.group('num')),
(re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), (re.compile('<small .*?>(?P<text>.+?)</small>', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')),

View File

@ -76,7 +76,7 @@ class Reader(FormatReader):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1: if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', '' return 'empty', ''
data = self.section_data(number) data = self.section_data(number)
name = data[4:4+32].strip('\0') name = data[4:4+32].strip('\x00')
img = data[62:] img = data[62:]
return name, img return name, img
@ -97,7 +97,7 @@ class Reader(FormatReader):
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
html = '<html><head><title></title></head><body>' html = u'<html><head><title></title></head><body>'
for i in range(1, self.header_record.num_text_pages + 1): for i in range(1, self.header_record.num_text_pages + 1):
self.log.debug('Extracting text page %i' % i) self.log.debug('Extracting text page %i' % i)
@ -110,8 +110,7 @@ class Reader(FormatReader):
self.log.debug('Extracting footnote page %i' % i) self.log.debug('Extracting footnote page %i' % i)
html += '<dl>' html += '<dl>'
html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i)) html += footnote_sidebar_to_html(footnoteids[fid], self.decompress_text(i))
html += '</dl>' html += '</dl>'
if self.header_record.sidebar_rec > 0: if self.header_record.sidebar_rec > 0:
html += '<br /><h1>%s</h1>' % _('Sidebar') html += '<br /><h1>%s</h1>' % _('Sidebar')
@ -127,7 +126,8 @@ class Reader(FormatReader):
with CurrentDir(output_dir): with CurrentDir(output_dir):
with open('index.html', 'wb') as index: with open('index.html', 'wb') as index:
self.log.debug('Writing text to index.html') self.log.debug('Writing text to index.html')
index.write(html.encode('utf-8')) index.write(html)
# print html
if not os.path.exists(os.path.join(output_dir, 'images/')): if not os.path.exists(os.path.join(output_dir, 'images/')):
os.makedirs(os.path.join(output_dir, 'images/')) os.makedirs(os.path.join(output_dir, 'images/'))