Initial support for reading metadata from and converting Plucker format PDB files

2025-07-09 03:04:10 -04:00 · 2011-04-24 08:55:41 -06:00 · 2011-04-24 08:55:41 -06:00 · 0a0ba36ee0
commit 0a0ba36ee0
parent 046f6bc2e4 e690e7196e
6 changed files with 846 additions and 3 deletions
--- a/src/calibre/ebooks/metadata/pdb.py
+++ b/src/calibre/ebooks/metadata/pdb.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 '''
-Read meta information from eReader pdb files.
+Read meta information from pdb files.
 '''
 __license__   = 'GPL v3'
@ -13,10 +13,12 @@ import re
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.metadata.ereader import get_metadata as get_eReader
 from calibre.ebooks.metadata.plucker import get_metadata as get_plucker
 # Maps a PDB identity string to the get_metadata function that understands
 # that flavour of PDB file. Both eReader identities share one reader.
 # NOTE(review): presumably looked up with the identity read from the PDB
 # header - the lookup itself is not visible in this hunk.
 MREADER = {
    'PNPdPPrs' : get_eReader,
    'PNRdPPrs' : get_eReader,
    'DataPlkr' : get_plucker,
 }
 from calibre.ebooks.metadata.ereader import set_metadata as set_eReader
--- a/src/calibre/ebooks/metadata/plucker.py
+++ b/src/calibre/ebooks/metadata/plucker.py
@ -0,0 +1,73 @@
 # -*- coding: utf-8 -*-
 from __future__ import (unicode_literals, division, absolute_import, print_function)
 '''
 Read meta information from Plucker pdb files.
 '''
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import struct
 from datetime import datetime
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb.plucker.reader import SectionHeader, DATATYPE_METADATA, \
    MIBNUM_TO_NAME
 def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The title, the author list and the publication date are read from the
    Plucker metadata record. When the file has no such record (or the record
    is empty) the returned object only carries the "Unknown" defaults.

    :param stream: Seekable file object containing the PDB file.
    :param extract_cover: Not used by this reader.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    # Record 0 is skipped; the metadata record is looked for in the rest.
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            # Drop the 8 byte section header, keep the payload.
            section_data = raw_data[8:]
            break

    if not section_data:
        return mi

    default_encoding = 'latin-1'
    title = None
    author = None
    pubdate = None
    # Byte string formats and separators are used on purpose: this module
    # enables unicode_literals and the record data are bytes.
    record_count, = struct.unpack(b'>H', section_data[0:2])
    # Every sub-record is laid out as: type (2 bytes), length of the data
    # in 2 byte words (2 bytes), data. The first sub-record follows the
    # 2 byte record count.
    pos = 2
    for i in range(record_count):
        if pos + 4 > len(section_data):
            # Truncated record; keep whatever was read so far.
            break
        rec_type, length = struct.unpack(b'>HH', section_data[pos:pos+4])
        data = section_data[pos+4:pos+4+(2*length)]
        # CharSet
        if rec_type == 1:
            if len(data) >= 2:
                val, = struct.unpack(b'>H', data[0:2])
                default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
        # Author
        elif rec_type == 4:
            author = data
        # Title
        elif rec_type == 5:
            title = data
        # Publication Date
        elif rec_type == 6:
            if len(data) >= 4:
                pubdate, = struct.unpack(b'>I', data[0:4])
        # Step over the 4 byte sub-record header as well as the data.
        pos += 4 + (2*length)

    if title:
        title = title.replace(b'\0', b'').decode(default_encoding, 'replace')
        if title:
            mi.title = title
    if author:
        author = author.replace(b'\0', b'').decode(default_encoding, 'replace')
        authors = [a.strip() for a in author.split(',') if a.strip()]
        if authors:
            mi.authors = authors
            # Kept because earlier versions of this reader set this name.
            mi.author = authors
    # Only report a date when the file actually contains one.
    if pubdate is not None:
        try:
            mi.pubdate = datetime.fromtimestamp(pubdate)
        except (ValueError, OverflowError, OSError):
            # Timestamp not representable on this platform; leave unset.
            pass

    return mi
--- a/src/calibre/ebooks/pdb/init.py
+++ b/src/calibre/ebooks/pdb/init.py
@ -12,6 +12,7 @@ from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
 from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
 from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
 from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader
 from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader
 FORMAT_READERS = {
    'PNPdPPrs': ereader_reader,
@ -19,6 +20,7 @@ FORMAT_READERS = {
    'zTXTGPlm': ztxt_reader,
    'TEXtREAd': palmdoc_reader,
    '.pdfADBE': pdf_reader,
    'DataPlkr': plucker_reader,
 }
 from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@ -37,6 +39,7 @@ IDENTITY_TO_NAME = {
    'zTXTGPlm': 'zTXT',
    'TEXtREAd': 'PalmDOC',
    '.pdfADBE': 'Adobe Reader',
    'DataPlkr': 'Plucker',
    'BVokBDIC': 'BDicty',
    'DB99DBOS': 'DB (Database program)',
@ -50,7 +53,6 @@ IDENTITY_TO_NAME = {
    'DATALSdb': 'LIST',
    'Mdb1Mdb1': 'MobileDB',
    'BOOKMOBI': 'MobiPocket',
    'DataPlkr': 'Plucker',
    'DataSprd': 'QuickSheet',
    'SM01SMem': 'SuperMemo',
    'TEXtTlDc': 'TealDoc',
--- a/src/calibre/ebooks/pdb/plucker/init.py
+++ b/src/calibre/ebooks/pdb/plucker/init.py
--- a/src/calibre/ebooks/pdb/plucker/reader.py
+++ b/src/calibre/ebooks/pdb/plucker/reader.py
@ -0,0 +1,764 @@
 # -*- coding: utf-8 -*-
 #from __future__ import (unicode_literals, division, absolute_import, print_function)
 __license__   = 'GPL v3'
 __copyright__ = '20011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 import os
 import struct
 import zlib
 from collections import OrderedDict
 from calibre import CurrentDir
 from calibre.ebooks.pdb.formatreader import FormatReader
 from calibre.ptempfile import TemporaryFile
 from calibre.utils.magick import Image, create_canvas
 # Record (section) type codes. These are the values the ``type`` byte of a
 # SectionHeader is compared against to decide how a record is handled.
 DATATYPE_PHTML = 0
 DATATYPE_PHTML_COMPRESSED = 1
 DATATYPE_TBMP = 2
 DATATYPE_TBMP_COMPRESSED = 3
 DATATYPE_MAILTO = 4
 DATATYPE_LINK_INDEX = 5
 DATATYPE_LINKS = 6
 DATATYPE_LINKS_COMPRESSED = 7
 DATATYPE_BOOKMARKS = 8
 DATATYPE_CATEGORY = 9
 DATATYPE_METADATA = 10
 DATATYPE_STYLE_SHEET = 11
 DATATYPE_FONT_PAGE = 12
 DATATYPE_TABLE = 13
 DATATYPE_TABLE_COMPRESSED = 14
 DATATYPE_COMPOSITE_IMAGE = 15
 DATATYPE_PAGELIST_METADATA = 16
 DATATYPE_SORTED_URL_INDEX = 17
 DATATYPE_SORTED_URL = 18
 DATATYPE_SORTED_URL_COMPRESSED = 19
 DATATYPE_EXT_ANCHOR_INDEX = 20
 DATATYPE_EXT_ANCHOR = 21
 DATATYPE_EXT_ANCHOR_COMPRESSED = 22
 # IETF IANA MIBenum value for the character set.
 # See http://www.iana.org/assignments/character-sets for valid values.
 # Not all character sets are handled by Python. This is a small subset that
 # maps the MIBenum to the Python standard encodings
 # from http://docs.python.org/library/codecs.html#standard-encodings
 #
 # Lookups in this file fall back to 'latin-1' for values that are missing.
 #
 # NOTE(review): two entries look inconsistent with the IANA registry and
 # should be verified: MIBenum 27 appears to be ISO-10646-UTF-1 (UTF-7 is
 # listed as 1012) and Big5-HKSCS appears to be 2101 rather than 2102.
 MIBNUM_TO_NAME = {
    3: 'ascii',
    4: 'latin_1',
    5: 'iso8859_2',
    6: 'iso8859_3',
    7: 'iso8859_4',
    8: 'iso8859_5',
    9: 'iso8859_6',
    10: 'iso8859_7',
    11: 'iso8859_8',
    12: 'iso8859_9',
    13: 'iso8859_10',
    17: 'shift_jis',
    18: 'euc_jp',
    27: 'utf_7',
    36: 'euc_kr',
    37: 'iso2022_kr',
    38: 'euc_kr',
    39: 'iso2022_jp',
    40: 'iso2022_jp_2',
    106: 'utf-8',
    109: 'iso8859_13',
    110: 'iso8859_14',
    111: 'iso8859_15',
    112: 'iso8859_16',
    1013: 'utf_16_be',
    1014: 'utf_16_le',
    1015: 'utf_16',
    2009: 'cp850',
    2010: 'cp852',
    2011: 'cp437',
    2013: 'cp862',
    2025: 'gb2312',
    2026: 'big5',
    2028: 'cp037',
    2043: 'cp424',
    2044: 'cp500',
    2046: 'cp855',
    2047: 'cp857',
    2048: 'cp860',
    2049: 'cp861',
    2050: 'cp863',
    2051: 'cp864',
    2052: 'cp865',
    2054: 'cp869',
    2063: 'cp1026',
    2085: 'hz',
    2086: 'cp866',
    2087: 'cp775',
    2089: 'cp858',
    2091: 'cp1140',
    2102: 'big5hkscs',
    2250: 'cp1250',
    2251: 'cp1251',
    2252: 'cp1252',
    2253: 'cp1253',
    2254: 'cp1254',
    2255: 'cp1255',
    2256: 'cp1256',
    2257: 'cp1257',
    2258: 'cp1258',
 }
 def decompress_doc(data):
    '''
    Decompress DOC compressed data.

    The input is processed byte by byte:

        * 0x01-0x08: copy that many of the following bytes unchanged.
        * 0x00, 0x09-0x7f: the byte stands for itself.
        * 0xc0-0xff: a space followed by the byte with its top bit cleared.
        * 0x80-0xbf: together with the next byte forms a 16 bit value. The
          lower 14 bits hold an 11 bit distance followed by a 3 bit length;
          length + 3 bytes are copied from that far back in the output.

    :param data: The compressed byte string.
    :return: The decompressed byte string.
    '''
    # bytearray yields integers on both Python 2 and 3, so no ord()/chr()
    # round trips are needed and the result stays a byte string.
    src = bytearray(data)
    out = bytearray()
    end = len(src)
    pos = 0
    while pos < end:
        c = src[pos]
        pos += 1
        if 1 <= c <= 8:
            out += src[pos:pos+c]
            pos += c
        elif c <= 0x7f:
            out.append(c)
        elif c >= 0xc0:
            out.append(0x20)
            out.append(c ^ 0x80)
        else:
            c = (c << 8) + src[pos]
            pos += 1
            distance = (c & 0x3fff) >> 3
            count = (c & ((1 << 3) - 1)) + 3
            # The start is fixed before copying; the copied run may overlap
            # the bytes that are being appended.
            start = len(out) - distance
            for k in range(count):
                out.append(out[start + k])
    return bytes(out)
 class HeaderRecord(object):
    '''
    Plucker header. PDB record 0.

    Attributes set from the raw record:

        * uid: uid of the record.
        * compression: 2 is ZLIB compressed, 1 is DOC compressed.
        * records: number of reserved entries that follow.
        * reserved: maps the uid of each reserved entry to its name code.
        * home_html: uid of the entry whose name code is 0, or None.
    '''

    def __init__(self, raw):
        # The second field is labeled version in the spec.
        # 2 is ZLIB compressed,
        # 1 is DOC compressed
        self.uid, self.compression, self.records = struct.unpack('>HHH', raw[0:6])
        # uid of the first html file. This should link
        # to other files which in turn may link to others.
        self.home_html = None
        self.reserved = {}
        for index in range(self.records):
            # Entries are 4 bytes each (name code, uid) and start after
            # the 6 byte fixed part.
            start = 6 + 4*index
            name, record_uid = struct.unpack('>HH', raw[start:start+4])
            self.reserved[record_uid] = name
            if name == 0:
                self.home_html = record_uid
 class SectionHeader(object):
    '''
    Every section (record) has this header. It gives
    details about the section such as its uid.

    The header is 8 bytes long: uid, paragraphs and size are 16 bit
    big-endian values, type and flags are single bytes. ``type`` is one
    of the DATATYPE_* values.

    Raises struct.error if ``raw`` is shorter than 8 bytes.
    '''

    def __init__(self, raw):
        # Unpack from a slice: indexing a single byte does not give
        # struct.unpack a byte string on every Python version.
        (self.uid, self.paragraphs, self.size,
                self.type, self.flags) = struct.unpack('>HHHBB', raw[0:8])
 class SectionHeaderText(object):
    '''
    Sub header for text records.

    Holds one 4 byte entry (size, attributes) per paragraph; the number
    of paragraphs is taken from the section header.

    :param section_header: Object with a ``paragraphs`` attribute.
    :param raw: The record data following the section header.
    '''

    def __init__(self, section_header, raw):
        # The uncompressed size of each paragraph.
        self.sizes = []
        # Uncompressed offset, counted from the beginning of the PHTML,
        # at which each paragraph ends (running total of the sizes).
        self.paragraph_offsets = []
        # Paragraph attributes.
        self.attributes = []

        running_offset = 0
        for index in range(section_header.paragraphs):
            start = 4*index
            size, attributes = struct.unpack('>HH', raw[start:start+4])
            running_offset += size
            self.sizes.append(size)
            self.attributes.append(attributes)
            self.paragraph_offsets.append(running_offset)
 class SectionMetadata(object):
    '''
    Metadata.

    This does not store metadata such as title, or author.
    That metadata would be best retrieved with the PDB (plucker)
    metadata reader.

    This stores document specific information such as the
    text encoding.

    Note: There is a default encoding but each text section
    can be assigned a different encoding.

    Attributes:

        * default_encoding: Python codec name, 'latin-1' unless a CharSet
          sub-record says otherwise.
        * exceptional_uid_encodings: maps a record uid to the codec name
          to use for that record.
        * owner_id: the 32 bit owner id as an integer, or None.
    '''

    def __init__(self, raw):
        self.default_encoding = 'latin-1'
        self.exceptional_uid_encodings = {}
        self.owner_id = None

        record_count, = struct.unpack('>H', raw[0:2])
        # Every sub-record is laid out as: type (2 bytes), length of the
        # data in 2 byte words (2 bytes), data. The first sub-record
        # follows the 2 byte record count.
        pos = 2
        for index in range(record_count):
            rec_type, length = struct.unpack('>HH', raw[pos:pos+4])
            data_start = pos + 4
            # CharSet
            if rec_type == 1:
                val, = struct.unpack('>H', raw[data_start:data_start+2])
                self.default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
            # ExceptionalCharSets
            elif rec_type == 2:
                # Each entry is two words (uid, MIBenum), so there are
                # length // 2 of them.
                for entry in range(length // 2):
                    entry_start = data_start + 4*entry
                    uid, mib = struct.unpack('>HH', raw[entry_start:entry_start+4])
                    self.exceptional_uid_encodings[uid] = MIBNUM_TO_NAME.get(mib, 'latin-1')
            # OwnerID
            elif rec_type == 3:
                self.owner_id, = struct.unpack('>I', raw[data_start:data_start+4])
            # Author, Title, PubDate (4, 5, 6)
            # Ignored here. The metadata reader plugin
            # will get this info because if it's missing
            # the metadata reader plugin will use fall
            # back data from elsewhere in the file.
            #
            # Linked Documents (7) are ignored as well.

            # Step over the 4 byte sub-record header as well as the data.
            pos = data_start + 2*length
 class SectionText(object):
    '''
    A text record: the parsed text sub header (``header``) together
    with the PHTML that follows it (``data``).
    '''

    def __init__(self, section_header, raw):
        # The sub header holds one 4 byte entry per paragraph; everything
        # after it is the PHTML itself.
        sub_header_length = 4 * section_header.paragraphs
        self.data = raw[sub_header_length:]
        self.header = SectionHeaderText(section_header, raw)
 class SectionCompositeImage(object):
    '''
    A composite image consists of a 2D array
    of rows and columns. The entries in the array
    are uid's.

    Attributes: ``columns`` and ``rows`` (16 bit counts read from the
    start of the record) and ``layout``, a list of ``rows`` lists that
    each hold ``columns`` uids.
    '''

    def __init__(self, raw):
        self.columns, self.rows = struct.unpack('>HH', raw[0:4])

        # [
        #  [uid, uid, uid, ...],
        #  [uid, uid, uid, ...],
        #  ...
        # ]
        #
        # Each item in the layout is in its
        # correct position in the final
        # composite.
        #
        # Each item in the layout is a uid
        # to an image record.
        self.layout = []
        row_format = '>%dH' % self.columns
        row_size = 2 * self.columns
        # The uids are stored row by row after the two counts.
        offset = 4
        for row_number in range(self.rows):
            row = struct.unpack(row_format, raw[offset:offset+row_size])
            self.layout.append(list(row))
            offset += row_size
 class Reader(FormatReader):
    '''
    Convert a plucker archive into HTML.

    TODO:
          * UTF 16 and 32 characters.
          * Margins.
          * Alignment.
          * Font color.
          * DATATYPE_MAILTO
          * DATATYPE_TABLE(_COMPRESSED)
          * DATATYPE_EXT_ANCHOR_INDEX
          * DATATYPE_EXT_ANCHOR(_COMPRESSED)
    '''

    # PHTML functions that are recognised but not converted. Maps the
    # function code to the number of argument bytes that are skipped.
    _SKIPPED_PHTML_FUNCTIONS = {
        0x0b: 3,  # Targeted page link begins: record ID, target
        0x0d: 5,  # Targeted paragraph link begins: record ID, paragraph number, target
        0x22: 2,  # Set margin: left margin, right margin
        0x29: 1,  # Alignment of text: alignment
        0x53: 3,  # Set text color: 8-bit red, 8-bit green, 8-bit blue
        0x83: 3,  # 16-bit Unicode character: alternate text length, character
        0x85: 5,  # 32-bit Unicode character: alternate text length, character
        0x8e: 6,  # Begin custom font span: font page record ID, X, Y page position
        0x8c: 4,  # Adjust custom font glyph position: X, Y page position
        0x8a: 2,  # Change font page: font record ID
        0x88: 0,  # End custom font span
        0x90: 0,  # Begin new table row
        0x92: 2,  # Insert table (or table link): table record ID
        0x97: 7,  # Table cell data: alignment, image record ID, columns, rows, text length
        0x9a: 2,  # Exact link modifier: paragraph offset
    }

    # PHTML functions without arguments that map directly to one tag.
    _INLINE_PHTML_TAGS = {
        0x40: u'<i>',   # Italic text begins
        0x48: u'</i>',  # Italic text ends
        0x60: u'<u>',   # Underline text begins
        0x68: u'</u>',  # Underline text ends
        0x70: u'<s>',   # Strike-through text begins
        0x78: u'</s>',  # Strike-through text ends
    }

    # "Set font" specifier -> element name. Specifier 0 is regular text.
    _FONT_SPECIFIER_TAGS = {
        1: 'h1',
        2: 'h2',
        3: 'h3',
        4: 'h4',
        5: 'h5',
        6: 'h6',
        7: 'b',       # Bold
        8: 'tt',      # Fixed-width
        9: 'small',   # Small
        10: 'sub',    # Subscript
        11: 'sup',    # Superscript
    }

    def __init__(self, header, stream, log, options):
        '''
        Read every record and keep the ones that can be converted.

        :param header: PDB header object giving access to ``num_sections``
                       and ``section_data(i)``.
        :param stream: The file object of the PDB file; handed to the
                       metadata reader.
        :param log: Logger used for debug and error messages.
        :param options: Conversion options; ``input_encoding`` and
                        ``debug_pipeline`` are read and modified.
        '''
        self.stream = stream
        self.log = log
        self.options = options

        # Mapping of section uid to our internal
        # list of sections.
        self.uid_section_number = OrderedDict()
        self.uid_text_secion_number = OrderedDict()
        self.uid_text_secion_encoding = {}
        self.uid_image_section_number = {}
        self.uid_composite_image_section_number = {}
        self.metadata_section_number = None
        self.default_encoding = 'latin-1'
        self.owner_id = None
        self.sections = []

        # The Plucker record0 header
        self.header_record = HeaderRecord(header.section_data(0))

        # The length of the section header.
        # Where the actual data in the section starts.
        start = 8
        for i in range(1, header.num_sections):
            section_number = len(self.sections)
            section = None

            raw_data = header.section_data(i)
            # Every section has a section header.
            section_header = SectionHeader(raw_data)

            # Store the sections we care about.
            if section_header.type in (DATATYPE_PHTML, DATATYPE_PHTML_COMPRESSED):
                self.uid_text_secion_number[section_header.uid] = section_number
                section = SectionText(section_header, raw_data[start:])
            elif section_header.type in (DATATYPE_TBMP, DATATYPE_TBMP_COMPRESSED):
                self.uid_image_section_number[section_header.uid] = section_number
                section = raw_data[start:]
            elif section_header.type == DATATYPE_METADATA:
                self.metadata_section_number = section_number
                section = SectionMetadata(raw_data[start:])
            elif section_header.type == DATATYPE_COMPOSITE_IMAGE:
                self.uid_composite_image_section_number[section_header.uid] = section_number
                section = SectionCompositeImage(raw_data[start:])

            # Store the section. The test is against None because an image
            # record without data is an empty (false) string; it still has
            # to be stored or the number registered for its uid above would
            # point at the wrong section.
            if section is not None:
                self.uid_section_number[section_header.uid] = section_number
                self.sections.append((section_header, section))

        # Store useful information from the metadata section locally
        # to make access easier. Section number 0 is a valid number.
        if self.metadata_section_number is not None:
            mdata_section = self.sections[self.metadata_section_number][1]
            for k, v in mdata_section.exceptional_uid_encodings.items():
                self.uid_text_secion_encoding[k] = v
            self.default_encoding = mdata_section.default_encoding
            self.owner_id = mdata_section.owner_id

        # Get the metadata (title, author, ...) with the metadata reader.
        from calibre.ebooks.metadata.pdb import get_metadata
        self.mi = get_metadata(stream, False)

    def extract_content(self, output_dir):
        '''
        Write the text records as HTML files and the image records as JPEG
        files below ``output_dir``, then run the HTML input plugin on the
        home page.

        :param output_dir: Directory that receives ``<uid>.html`` files and
                           an ``images`` sub directory.
        :return: Whatever the HTML input plugin's ``convert`` returns.
        :raises Exception: If no home page can be determined.
        '''
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text record into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug(_('Writing record with uid: %s as %s.html') % (uid, uid))
                section_header, section_data = self.sections[num]
                phtml = section_data.data
                if section_header.type == DATATYPE_PHTML_COMPRESSED:
                    phtml = self.decompress_phtml(phtml)
                # The text bytes are decoded inside process_phtml, for
                # compressed and uncompressed records alike.
                html = u'<html><body>'
                html += self.process_phtml(phtml, section_data.header.paragraph_offsets,
                        self.get_text_uid_encoding(section_header.uid))
                html += u'</body></html>'
                with open('%s.html' % uid, 'wb') as htmlf:
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        image_sizes = {}
        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)
        with CurrentDir(images_dir):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if not section_data:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
                    continue
                # Stays None for an unknown compression; writing it below
                # then fails and is reported like any other image error.
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    with TemporaryFile(suffix='.palm') as itn:
                        with open(itn, 'wb') as itf:
                            itf.write(idata)
                        im = Image()
                        im.read(itn)
                        image_sizes[uid] = im.size
                        im.set_compression_quality(70)
                        im.save('%s.jpg' % uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write image with uid %s: %s' % (uid, e))

            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height: the widest row and
                    # the sum of the tallest image of every row.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        row_height = 0
                        for part_uid in row:
                            if part_uid not in image_sizes:
                                raise Exception('Image with uid: %s missing.' % part_uid)
                            im = Image()
                            im.read('%s.jpg' % part_uid)
                            w, h = im.size
                            row_width += w
                            row_height = max(row_height, h)
                        width = max(width, row_width)
                        height += row_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    canvas = create_canvas(width, height)
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for part_uid in row:
                            im = Image()
                            im.read('%s.jpg' % part_uid)
                            canvas.compose(im, x_off, y_off)
                            w, h = im.size
                            x_off += w
                            largest_height = max(largest_height, h)
                        y_off += largest_height
                    canvas.set_compression_quality(70)
                    canvas.save('%s.jpg' % uid)
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'

        # Determine the home.html record uid. This should be set in the
        # reserved values in the header record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        home_html = self.header_record.home_html
        if not home_html:
            try:
                # Fall back to the first text record that was read.
                home_html = next(iter(self.uid_text_secion_number))
            except StopIteration:
                raise Exception(_('Could not determine home.html'))

        # Generate oeb from html conversion.
        # NOTE(review): the path is relative to the current directory, not
        # to output_dir - this assumes the caller runs with output_dir as
        # the working directory; confirm against the caller.
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        try:
            oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
        finally:
            # Restore the option even when the conversion fails.
            self.options.debug_pipeline = odi
        return oeb

    def decompress_phtml(self, data):
        '''
        Decompress the PHTML of a compressed text record with the method
        named in the header record (1 is DOC, 2 is ZLIB).

        :raises NotImplementedError: For ZLIB data when the metadata
            record contains an owner id.
        :raises ValueError: For any other compression value.
        '''
        compression = self.header_record.compression
        if compression == 2:
            # Tested against None so that the check does not depend on
            # how the owner id value itself is represented.
            if self.owner_id is not None:
                raise NotImplementedError('ZLIB compressed records with an owner id are not supported')
            return zlib.decompress(data)
        if compression == 1:
            #from calibre.ebooks.compression.palmdoc import decompress_doc
            return decompress_doc(data)
        raise ValueError('Unknown Plucker compression type: %s' % compression)

    @staticmethod
    def _read_uint16(data, offset):
        # Big-endian 16 bit value at offset; struct.error if data is too short.
        return struct.unpack('>H', bytes(data[offset:offset+2]))[0]

    @staticmethod
    def _text_to_html(raw, encoding):
        # Decode a run of text bytes and make it safe to embed in HTML.
        text = raw.decode(encoding or 'latin-1', 'replace')
        text = text.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
        return text.replace(u'\xa0', u'&nbsp;')

    def process_phtml(self, d, paragraph_offsets=(), encoding=None):
        '''
        Convert the PHTML of one text record into an HTML fragment.

        A 0x00 byte introduces a PHTML "function": a function code followed
        by a fixed number of argument bytes. All other bytes are text.

        :param d: The uncompressed PHTML as a byte string.
        :param paragraph_offsets: Offsets into ``d`` at which a paragraph
            ends. The paragraph that follows gets an id (p1, p2, ...) so
            that paragraph links can point at it.
        :param encoding: Codec used to decode the text bytes. Without it
            every byte is taken as the character with the same number.
        :return: The HTML as a unicode string. Text is HTML escaped.
        '''
        data = bytearray(d)
        size = len(data)
        # A set makes the per byte boundary test cheap.
        boundaries = frozenset(paragraph_offsets)
        read_uint16 = self._read_uint16

        out = [u'<p id="p0">']
        # Text bytes that have not been decoded yet. Runs are decoded as a
        # whole so that multi byte characters stay together.
        pending = bytearray()

        def emit(markup):
            # Text has to be written before any markup that follows it.
            if pending:
                out.append(self._text_to_html(bytes(pending), encoding))
                del pending[:]
            out.append(markup)

        offset = 0
        paragraph_open = True
        link_open = False
        need_set_p_id = False
        p_num = 1
        font_specifier_close = u''

        while offset < size:
            if not paragraph_open:
                if need_set_p_id:
                    emit(u'<p id="p%s">' % p_num)
                    p_num += 1
                    need_set_p_id = False
                else:
                    emit(u'<p>')
                paragraph_open = True

            c = data[offset]
            # PHTML "functions"
            # Every branch leaves offset on the last byte it consumed; the
            # increment after the branches moves past it.
            if c == 0x0:
                offset += 1
                if offset >= size:
                    # Function marker without a function code.
                    break
                c = data[offset]
                # Page link begins
                # 2 Bytes
                # record ID
                if c == 0x0a:
                    offset += 1
                    record_id = read_uint16(data, offset)
                    if record_id in self.uid_text_secion_number:
                        emit(u'<a href="%s.html">' % record_id)
                        link_open = True
                    offset += 1
                # Paragraph link begins
                # 4 Bytes
                # record ID, paragraph number
                elif c == 0x0c:
                    offset += 1
                    record_id = read_uint16(data, offset)
                    offset += 2
                    pid = read_uint16(data, offset)
                    if record_id in self.uid_text_secion_number:
                        emit(u'<a href="%s.html#p%s">' % (record_id, pid))
                        link_open = True
                    offset += 1
                # Link ends
                # 0 Bytes
                elif c == 0x08:
                    if link_open:
                        emit(u'</a>')
                        link_open = False
                # Set font
                # 1 Bytes
                # font specifier
                elif c == 0x11:
                    offset += 1
                    # An integer, so it can be compared with the numbers.
                    specifier = data[offset]
                    emit(font_specifier_close)
                    tag = self._FONT_SPECIFIER_TAGS.get(specifier)
                    if tag is None:
                        # Regular text (0) or a specifier that is not known.
                        font_specifier_close = u''
                    else:
                        emit(u'<%s>' % tag)
                        font_specifier_close = u'</%s>' % tag
                # Embedded image
                # 2 Bytes
                # image record ID
                elif c == 0x1a:
                    offset += 1
                    uid = read_uint16(data, offset)
                    emit(u'<img src="images/%s.jpg" />' % uid)
                    offset += 1
                # Horizontal rule
                # 3 Bytes
                # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
                elif c == 0x33:
                    offset += 3
                    if paragraph_open:
                        emit(u'</p>')
                        paragraph_open = False
                    emit(u'<hr />')
                # New line
                # 0 Bytes
                elif c == 0x38:
                    if paragraph_open:
                        emit(u'</p>\n')
                        paragraph_open = False
                # Multiple embedded image
                # 4 Bytes
                # alternate image record ID, image record ID
                elif c == 0x5c:
                    offset += 3
                    uid = read_uint16(data, offset)
                    emit(u'<img src="images/%s.jpg" />' % uid)
                    offset += 1
                # Italic, underline and strike-through begin/end
                # 0 Bytes
                elif c in self._INLINE_PHTML_TAGS:
                    emit(self._INLINE_PHTML_TAGS[c])
                # Functions that are not converted: skip their arguments.
                # For a code that is not known at all nothing is skipped.
                else:
                    offset += self._SKIPPED_PHTML_FUNCTIONS.get(c, 0)
            else:
                pending.append(c)
            offset += 1
            if offset in boundaries:
                need_set_p_id = True
                if paragraph_open:
                    emit(u'</p>\n')
                    paragraph_open = False

        # Write out the remaining text and close the last paragraph.
        emit(u'</p>' if paragraph_open else u'')
        return u''.join(out)

    def get_text_uid_encoding(self, uid):
        '''
        Return the encoding to use for the text record with the given uid.
        '''
        # Return the user specified input encoding,
        # otherwise return the alternate encoding specified for the uid,
        # otherwise return the default encoding for the document.
        if self.options.input_encoding:
            return self.options.input_encoding
        return self.uid_text_secion_encoding.get(uid, self.default_encoding)
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -20,12 +20,14 @@ What formats does |app| support conversion to/from?
 |app| supports the conversion of many input formats to many output formats.
 It can convert every input format in the following list, to every output format.
-*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB, PML, RB, RTF, SNB, TCR, TXT, TXTZ
+*Input Formats:* CBZ, CBR, CBC, CHM, EPUB, FB2, HTML, HTMLZ, LIT, LRF, MOBI, ODT, PDF, PRC**, PDB***, PML, RB, RTF, SNB, TCR, TXT, TXTZ
 *Output Formats:* EPUB, FB2, OEB, LIT, LRF, MOBI, HTMLZ, PDB, PML, RB, PDF, SNB, TCR, TXT, TXTZ
 ** PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers
 *** PDB is also a generic format. |app| supports eReader, Plucker, PML and zTxt PDB files.
 .. _best-source-formats:
 What are the best source formats to convert?