Handle single line paragraphs in PDB files

2025-11-21 14:03:03 -05:00 · 2009-08-25 11:11:56 -06:00 · 2009-08-25 11:11:56 -06:00 · cb232d395e
commit cb232d395e
parent a04024d455
9 changed files with 31 additions and 23 deletions
--- a/src/calibre/ebooks/pdb/ereader/reader.py
+++ b/src/calibre/ebooks/pdb/ereader/reader.py
@ -15,13 +15,13 @@ from calibre.ebooks.pdb.ereader.reader202 import Reader202

 class Reader(FormatReader):

-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        record0_size = len(header.section_data(0))

        if record0_size == 132:
-            self.reader = Reader132(header, stream, log, encoding)
+            self.reader = Reader132(header, stream, log, options)
        elif record0_size == 202:
-            self.reader = Reader202(header, stream, log, encoding)
+            self.reader = Reader202(header, stream, log, options)
        else:
            raise EreaderError('Size mismatch. eReader header record size %s KB is not supported.' % record0_size)

--- a/src/calibre/ebooks/pdb/ereader/reader132.py
+++ b/src/calibre/ebooks/pdb/ereader/reader132.py
@ -47,9 +47,9 @@ class HeaderRecord(object):

 class Reader132(FormatReader):

-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        self.log = log
-        self.encoding = encoding
+        self.encoding = options.input_encoding
        
        self.log.debug('132 byte header version found.')

--- a/src/calibre/ebooks/pdb/ereader/reader202.py
+++ b/src/calibre/ebooks/pdb/ereader/reader202.py
@ -33,9 +33,9 @@ class HeaderRecord(object):

 class Reader202(FormatReader):

-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        self.log = log
-        self.encoding = encoding
+        self.encoding = options.input_encoding

        self.log.debug('202 byte header version found.')

--- a/src/calibre/ebooks/pdb/formatreader.py
+++ b/src/calibre/ebooks/pdb/formatreader.py
@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'

 class FormatReader(object):

-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        raise NotImplementedError()
        
    def extract_content(self, output_dir):
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'

 import os

-from calibre.customize.conversion import InputFormatPlugin
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

@ -17,6 +17,13 @@ class PDBInput(InputFormatPlugin):
    description = 'Convert PDB to HTML'
    file_types  = set(['pdb'])

+    options = set([
+        OptionRecommendation(name='single_line_paras', recommended_value=False,
+            help=_('Normally calibre treats blank lines as paragraph markers. '
+                'With this option it will assume that every line represents '
+                'a paragraph instead.')),
+    ])
+
    def convert(self, stream, options, file_ext, log,
                accelerators):
        header = PdbHeaderReader(stream)
@ -27,7 +34,7 @@ class PDBInput(InputFormatPlugin):

        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))

-        reader = Reader(header, stream, log, options.input_encoding)
+        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(os.getcwd())

        return opf
--- a/src/calibre/ebooks/pdb/palmdoc/reader.py
+++ b/src/calibre/ebooks/pdb/palmdoc/reader.py
@ -31,10 +31,11 @@ class HeaderRecord(object):

 class Reader(FormatReader):

-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
-        self.encoding = encoding
+        self.encoding = options.input_encoding
+        self.single_line_paras = options.single_line_paras

        self.sections = []
        for i in range(header.num_sections):
@ -61,7 +62,7 @@ class Reader(FormatReader):
            txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
-        html = txt_to_markdown(txt)
+        html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

--- a/src/calibre/ebooks/pdb/ztxt/reader.py
+++ b/src/calibre/ebooks/pdb/ztxt/reader.py
@ -34,10 +34,11 @@ class HeaderRecord(object):
    
 class Reader(FormatReader):
    
-    def __init__(self, header, stream, log, encoding=None):
+    def __init__(self, header, stream, log, options):
        self.stream = stream
        self.log = log
-        self.encoding = encoding
+        self.encoding = options.input_encoding
+        self.single_line_paras = options.single_line_paras
    
        self.sections = []
        for i in range(header.num_sections):
@ -76,7 +77,7 @@ class Reader(FormatReader):
            txt += self.decompress_text(i)

        self.log.info('Converting text to OEB...')
-        html = txt_to_markdown(txt)
+        html = txt_to_markdown(txt, single_line_paras=self.single_line_paras)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))
                        
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -31,14 +31,9 @@ class TXTInput(InputFormatPlugin):
        log.debug('Reading text from file...')
        txt = stream.read().decode(ienc, 'replace')

-        if options.single_line_paras:
-            txt = txt.replace('\r\n', '\n')
-            txt = txt.replace('\r', '\n')
-            txt = txt.replace('\n', '\n\n')
-
        log.debug('Running text though markdown conversion...')
        try:
-            html = txt_to_markdown(txt)
+            html = txt_to_markdown(txt, single_line_paras=options.single_line_paras)
        except RuntimeError:
            raise ValueError('This txt file has malformed markup, it cannot be'
                'converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -13,7 +13,11 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-def txt_to_markdown(txt, title=''):
+def txt_to_markdown(txt, title='', single_line_paras=False):
+    if single_line_paras:
+        txt = txt.replace('\r\n', '\n')
+        txt = txt.replace('\r', '\n')
+        txt = txt.replace('\n', '\n\n')
    md = markdown.Markdown(
        extensions=['footnotes', 'tables', 'toc'],
        safe_mode=False,)