Handle ancient text-only PRC ebooks.

This commit is contained in:
Kovid Goyal 2008-08-03 16:36:48 -07:00
parent 1c5ecad88f
commit 82d021b460

View File

@ -72,13 +72,21 @@ class BookHeader(object):
self.compression_type = raw[:2]
self.records, self.records_size = struct.unpack('>HH', raw[8:12])
self.encryption_type, = struct.unpack('>H', raw[12:14])
if ident == 'TEXTREAD':
self.codepage = 1252
if len(raw) <= 16:
self.codec = 'cp1251'
self.extra_flags = 0
self.language = 'ENGLISH'
self.sublanguage = 'NEUTRAL'
self.exth_flag, self.exth = 0, None
self.ancient = True
else:
self.ancient = False
self.doctype = raw[16:20]
self.length, self.type, self.codepage, self.unique_id, self.version = \
struct.unpack('>LLLLL', raw[20:40])
if ident == 'TEXTREAD':
self.codepage = 1252
try:
self.codec = {
@ -145,7 +153,6 @@ class MobiReader(object):
else:
end_off = self.section_headers[section_number + 1][0]
off = self.section_headers[section_number][0]
return raw[off:end_off]
for i in range(self.num_sections):
@ -201,6 +208,8 @@ class MobiReader(object):
def cleanup_html(self):
# Tidy self.processed_html in place; nothing is returned.
# Remove empty <div> elements whose height attribute is exactly 0,
# optionally followed by one of the units pt, px, ex, em or %.
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
# When the book header is flagged as ancient and the first 300 characters of
# the raw text (self.mobi_html) contain no '<html' (case-insensitive), wrap
# the text in <html>...</html> and replace every pair of consecutive
# newlines with a <p> tag.
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
def cleanup_soup(self, soup):
for tag in soup.recursiveChildGenerator():
@ -313,7 +322,8 @@ class MobiReader(object):
self.mobi_html = ''.join(text_sections)
else:
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
return processed_records