Handle single line paragraphs in PDB files

This commit is contained in:
Kovid Goyal 2009-08-25 11:11:56 -06:00
parent a04024d455
commit cb232d395e
9 changed files with 31 additions and 23 deletions

View File

@ -15,13 +15,13 @@ from calibre.ebooks.pdb.ereader.reader202 import Reader202
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
record0_size = len(header.section_data(0)) record0_size = len(header.section_data(0))
if record0_size == 132: if record0_size == 132:
self.reader = Reader132(header, stream, log, encoding) self.reader = Reader132(header, stream, log, options)
elif record0_size == 202: elif record0_size == 202:
self.reader = Reader202(header, stream, log, encoding) self.reader = Reader202(header, stream, log, options)
else: else:
raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size) raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size)

View File

@ -47,9 +47,9 @@ class HeaderRecord(object):
class Reader132(FormatReader): class Reader132(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
self.log = log self.log = log
self.encoding = encoding self.encoding = options.input_encoding
self.log.debug('132 byte header version found.') self.log.debug('132 byte header version found.')

View File

@ -33,9 +33,9 @@ class HeaderRecord(object):
class Reader202(FormatReader): class Reader202(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
self.log = log self.log = log
self.encoding = encoding self.encoding = options.input_encoding
self.log.debug('202 byte header version found.') self.log.debug('202 byte header version found.')

View File

@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
class FormatReader(object): class FormatReader(object):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
raise NotImplementedError() raise NotImplementedError()
def extract_content(self, output_dir): def extract_content(self, output_dir):

View File

@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
@ -17,6 +17,13 @@ class PDBInput(InputFormatPlugin):
description = 'Convert PDB to HTML' description = 'Convert PDB to HTML'
file_types = set(['pdb']) file_types = set(['pdb'])
options = set([
OptionRecommendation(name='single_line_paras', recommended_value=False,
help=_('Normally calibre treats blank lines as paragraph markers. '
'With this option it will assume that every line represents '
'a paragraph instead.')),
])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
header = PdbHeaderReader(stream) header = PdbHeaderReader(stream)
@ -27,7 +34,7 @@ class PDBInput(InputFormatPlugin):
log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident)) log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
reader = Reader(header, stream, log, options.input_encoding) reader = Reader(header, stream, log, options)
opf = reader.extract_content(os.getcwd()) opf = reader.extract_content(os.getcwd())
return opf return opf

View File

@ -31,10 +31,11 @@ class HeaderRecord(object):
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = encoding self.encoding = options.input_encoding
self.single_line_paras = options.single_line_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -61,7 +62,7 @@ class Reader(FormatReader):
txt += self.decompress_text(i) txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
html = txt_to_markdown(txt) html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))

View File

@ -34,10 +34,11 @@ class HeaderRecord(object):
class Reader(FormatReader): class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None): def __init__(self, header, stream, log, options):
self.stream = stream self.stream = stream
self.log = log self.log = log
self.encoding = encoding self.encoding = options.input_encoding
self.single_line_paras = options.single_line_paras
self.sections = [] self.sections = []
for i in range(header.num_sections): for i in range(header.num_sections):
@ -76,7 +77,7 @@ class Reader(FormatReader):
txt += self.decompress_text(i) txt += self.decompress_text(i)
self.log.info('Converting text to OEB...') self.log.info('Converting text to OEB...')
html = txt_to_markdown(txt) html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index: with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8')) index.write(html.encode('utf-8'))

View File

@ -31,14 +31,9 @@ class TXTInput(InputFormatPlugin):
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read().decode(ienc, 'replace') txt = stream.read().decode(ienc, 'replace')
if options.single_line_paras:
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt = txt.replace('\n', '\n\n')
log.debug('Running text though markdown conversion...') log.debug('Running text though markdown conversion...')
try: try:
html = txt_to_markdown(txt) html = txt_to_markdown(txt, single_line_paras=options.single_line_paras)
except RuntimeError: except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be' raise ValueError('This txt file has malformed markup, it cannot be'
'converted by calibre. See http://daringfireball.net/projects/markdown/syntax') 'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')

View File

@ -13,7 +13,11 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
def txt_to_markdown(txt, title=''): def txt_to_markdown(txt, title='', single_line_paras=False):
if single_line_paras:
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt = txt.replace('\n', '\n\n')
md = markdown.Markdown( md = markdown.Markdown(
extensions=['footnotes', 'tables', 'toc'], extensions=['footnotes', 'tables', 'toc'],
safe_mode=False,) safe_mode=False,)