From b9be0e215cb0e95b2b2155eabadf04acbf85c8d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Apr 2012 16:31:09 +0530 Subject: [PATCH] Read support for the Haodoo PDB ebook format by Kan-Ru Chen. Fixes #976478 (Add Haodoo.net PDB format support) --- src/calibre/customize/builtins.py | 2 +- src/calibre/ebooks/__init__.py | 2 +- .../ebooks/conversion/plugins/pdb_input.py | 2 +- src/calibre/ebooks/metadata/haodoo.py | 23 +++ src/calibre/ebooks/metadata/pdb.py | 3 + src/calibre/ebooks/pdb/__init__.py | 5 + src/calibre/ebooks/pdb/haodoo/reader.py | 151 ++++++++++++++++++ 7 files changed, 185 insertions(+), 3 deletions(-) create mode 100644 src/calibre/ebooks/metadata/haodoo.py create mode 100644 src/calibre/ebooks/pdb/haodoo/reader.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 14e0a564db..d91fc97a1d 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -289,7 +289,7 @@ class OPFMetadataReader(MetadataReaderPlugin): class PDBMetadataReader(MetadataReaderPlugin): name = 'Read PDB metadata' - file_types = set(['pdb']) + file_types = set(['pdb', 'updb']) description = _('Read metadata from %s files') % 'PDB' author = 'John Schember' diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py index 42ec38701a..0adfab4e11 100644 --- a/src/calibre/ebooks/__init__.py +++ b/src/calibre/ebooks/__init__.py @@ -27,7 +27,7 @@ class ParserError(ValueError): pass BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm', - 'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc', + 'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'epub', 'fb2', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb', 'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'md', diff --git a/src/calibre/ebooks/conversion/plugins/pdb_input.py 
b/src/calibre/ebooks/conversion/plugins/pdb_input.py index 641122fb3d..ade8fa7cd7 100644 --- a/src/calibre/ebooks/conversion/plugins/pdb_input.py +++ b/src/calibre/ebooks/conversion/plugins/pdb_input.py @@ -13,7 +13,7 @@ class PDBInput(InputFormatPlugin): name = 'PDB Input' author = 'John Schember' description = 'Convert PDB to HTML' - file_types = set(['pdb']) + file_types = set(['pdb', 'updb']) def convert(self, stream, options, file_ext, log, accelerators): diff --git a/src/calibre/ebooks/metadata/haodoo.py b/src/calibre/ebooks/metadata/haodoo.py new file mode 100644 index 0000000000..a32f7a2268 --- /dev/null +++ b/src/calibre/ebooks/metadata/haodoo.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +''' +Read meta information from Haodoo.net pdb files. +''' + +__license__ = 'GPL v3' +__copyright__ = '2012, Kan-Ru Chen ' +__docformat__ = 'restructuredtext en' + +from calibre.ebooks.pdb.header import PdbHeaderReader +from calibre.ebooks.pdb.haodoo.reader import Reader + +def get_metadata(stream, extract_cover=True): + ''' + Return metadata as a L{MetaInfo} object + ''' + stream.seek(0) + + pheader = PdbHeaderReader(stream) + reader = Reader(pheader, stream, None, None) + + return reader.get_metadata() diff --git a/src/calibre/ebooks/metadata/pdb.py b/src/calibre/ebooks/metadata/pdb.py index d01bb0ecdb..70bcca132e 100644 --- a/src/calibre/ebooks/metadata/pdb.py +++ b/src/calibre/ebooks/metadata/pdb.py @@ -14,11 +14,14 @@ from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.metadata.ereader import get_metadata as get_eReader from calibre.ebooks.metadata.plucker import get_metadata as get_plucker +from calibre.ebooks.metadata.haodoo import get_metadata as get_Haodoo MREADER = { 'PNPdPPrs' : get_eReader, 'PNRdPPrs' : get_eReader, 'DataPlkr' : get_plucker, + 'BOOKMTIT' : get_Haodoo, + 'BOOKMTIU' : get_Haodoo, } from calibre.ebooks.metadata.ereader import set_metadata as set_eReader diff --git 
a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 428cbe82ab..020f64f613 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -16,6 +16,7 @@ def _import_readers(): from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader from calibre.ebooks.pdb.plucker.reader import Reader as plucker_reader + from calibre.ebooks.pdb.haodoo.reader import Reader as haodoo_reader FORMAT_READERS = { 'PNPdPPrs': ereader_reader, @@ -24,6 +25,8 @@ def _import_readers(): 'TEXtREAd': palmdoc_reader, '.pdfADBE': pdf_reader, 'DataPlkr': plucker_reader, + 'BOOKMTIT': haodoo_reader, + 'BOOKMTIU': haodoo_reader, } ALL_FORMAT_WRITERS = {'doc', 'ztxt', 'ereader'} @@ -47,6 +50,8 @@ IDENTITY_TO_NAME = { 'TEXtREAd': 'PalmDOC', '.pdfADBE': 'Adobe Reader', 'DataPlkr': 'Plucker', + 'BOOKMTIT': 'Haodoo.net', + 'BOOKMTIU': 'Haodoo.net', 'BVokBDIC': 'BDicty', 'DB99DBOS': 'DB (Database program)', diff --git a/src/calibre/ebooks/pdb/haodoo/reader.py b/src/calibre/ebooks/pdb/haodoo/reader.py new file mode 100644 index 0000000000..f97ee0d90f --- /dev/null +++ b/src/calibre/ebooks/pdb/haodoo/reader.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- + +''' +Read content from Haodoo.net pdb file. 
__license__ = 'GPL v3'
__copyright__ = '2012, Kan-Ru Chen '
__docformat__ = 'restructuredtext en'


import struct
import os

from calibre import prepare_string_for_xml
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.txt.processor import opf_writer, HTML_TEMPLATE

# PDB identity strings. Reader uses the cp950 header parser for BPDB_IDENT
# and the UTF-16-LE header parser for every other identity.
BPDB_IDENT = b'BOOKMTIT'
UPDB_IDENT = b'BOOKMTIU'

# Replacement table used by fix_punct(): every occurrence of a key character
# is rewritten to the corresponding value.
# NOTE(review): in the reviewed copy the last entry maps a space to a space
# (a no-op) and several values are plain ASCII; the originals were presumably
# full-width characters normalised during extraction -- confirm against
# upstream before relying on this table.
punct_table = {
    u"︵": u"(",
    u"︶": u")",
    u"︷": u"{",
    u"︸": u"}",
    u"︹": u"〔",
    u"︺": u"〕",
    u"︻": u"【",
    u"︼": u"】",
    u"︗": u"〖",
    u"︘": u"〗",
    u"﹇": u"[]",
    u"﹈": u"[]",
    u"︽": u"《",
    u"︾": u"》",
    u"︿": u"〈",
    u"﹀": u"〉",
    u"﹁": u"「",
    u"﹂": u"」",
    u"﹃": u"『",
    u"﹄": u"』",
    u"|": u"—",
    u"︙": u"…",
    u"ⸯ": u"~",
    u"│": u"…",
    u"¦": u"…",
    u" ": u" ",
    }

# Code-point keyed view of punct_table, so fix_punct() can do the whole
# substitution in a single translate() pass. Every key is a single character
# and no replacement value contains a key character, so this is equivalent to
# applying the replacements one after another.
_PUNCT_TRANSLATION = dict(
    (ord(key), value) for key, value in punct_table.items())


def fix_punct(line):
    '''
    Return the text `line` with every character listed in `punct_table`
    replaced by its mapped value.
    '''
    return line.translate(_PUNCT_TRANSLATION)


class LegacyHeaderRecord(object):
    '''
    Parsed form of record 0 of a cp950 encoded book.

    The raw record is stripped of leading whitespace, every run of three ESC
    bytes is collapsed to one, and the result is split on ESC:

    * field 0 -> `title` (decoded text, punctuation fixed)
    * field 1 -> `num_records` (int)
    * remaining fields -> `chapter_titles` (list of text)
    '''

    def __init__(self, raw):
        fields = raw.lstrip().replace(b'\x1b\x1b\x1b', b'\x1b').split(b'\x1b')
        self.title = fix_punct(fields[0].decode('cp950', 'replace'))
        self.num_records = int(fields[1])
        # A real list (the titles are indexed later) and a *text* NUL passed
        # to rstrip(): stripping bytes from decoded text raises TypeError on
        # Python 3.
        self.chapter_titles = [
            fix_punct(field.decode('cp950', 'replace').rstrip(u'\x00'))
            for field in fields[2:]]


class UnicodeHeaderRecord(object):
    '''
    Parsed form of record 0 of a UTF-16-LE encoded book.

    Same layout as `LegacyHeaderRecord`, except that the separator is the two
    byte sequence ESC NUL and the chapter titles all live in field 2,
    separated by UTF-16-LE encoded CR LF.
    '''

    def __init__(self, raw):
        fields = raw.lstrip().replace(b'\x1b\x00\x1b\x00\x1b\x00',
                b'\x1b\x00').split(b'\x1b\x00')
        self.title = fix_punct(fields[0].decode('utf_16_le', 'ignore'))
        self.num_records = int(fields[1])
        # See LegacyHeaderRecord for why this is a list and strips u'\x00'.
        self.chapter_titles = [
            fix_punct(field.decode('utf_16_le', 'replace').rstrip(u'\x00'))
            for field in fields[2].split(b'\r\x00\n\x00')]


class Reader(FormatReader):
    '''
    Reader for the two Haodoo.net PDB flavours.

    All sections are loaded eagerly in the constructor; section 0 is parsed
    as the header record and sections 1..num_records hold the chapter text.
    '''

    def __init__(self, header, stream, log, options):
        '''
        :param header: PDB header object providing `num_sections`,
            `section_data(i)` and `ident`.
        :param stream: seekable file object of the PDB file; read again by
            `author()`.
        :param log: logger used by `extract_content()`.
        :param options: accepted for interface compatibility; not used here.
        '''
        self.stream = stream
        self.log = log

        self.sections = [header.section_data(i)
                         for i in range(header.num_sections)]

        if header.ident == BPDB_IDENT:
            self.header_record = LegacyHeaderRecord(self.section_data(0))
            self.encoding = 'cp950'
        else:
            self.header_record = UnicodeHeaderRecord(self.section_data(0))
            self.encoding = 'utf_16_le'

    def author(self):
        '''
        Return the author name, or u'Unknown'.

        The byte at offset 35 of the stream is read as a signed integer;
        only when it equals 2 are the first 35 bytes of the stream taken as
        the author name (trailing NUL bytes removed, then decoded with the
        book's encoding).
        '''
        # NOTE(review): for utf_16_le books, stripping NUL *bytes* before
        # decoding can cut the final code unit in half -- confirm how the
        # author field is encoded in UPDB files.
        self.stream.seek(35)
        version = struct.unpack(b'>b', self.stream.read(1))[0]
        if version != 2:
            return u'Unknown'
        self.stream.seek(0)
        raw_author = self.stream.read(35).rstrip(b'\x00')
        return raw_author.decode(self.encoding, 'replace')

    def get_metadata(self):
        '''
        Return a MetaInformation built from the header title and `author()`,
        with the language set to u'zh-tw'.
        '''
        mi = MetaInformation(self.header_record.title,
                [self.author()])
        mi.language = u'zh-tw'

        return mi

    def section_data(self, number):
        '''Return the raw bytes of section `number`.'''
        return self.sections[number]

    def decompress_text(self, number):
        '''
        Return section `number` decoded with the book's encoding, trailing
        NUL characters removed. No decompression is performed here; the name
        mirrors the one used at the call site.
        '''
        text = self.section_data(number).decode(self.encoding, 'replace')
        # Strip with a text NUL: rstrip(b'\x00') on decoded text raises
        # TypeError on Python 3.
        return text.rstrip(u'\x00')

    def _chapter_markup(self, title, text):
        '''
        Return the markup for one chapter.

        Every line of `text` is punctuation-fixed, stripped, XML-escaped and
        wrapped in a paragraph. The first line containing `title` is also
        wrapped in a chapter heading; if no line contains it, a heading
        built from `title` is placed before the first paragraph. An empty
        `title` produces no heading at all.
        '''
        # NOTE(review): the heading/paragraph tags were stripped from the
        # reviewed copy of this code and have been reconstructed here --
        # confirm against upstream.
        lines = []
        title_added = False
        for line in text.splitlines():
            line = fix_punct(line).strip()
            # Match against the unescaped text, then escape unconditionally
            # so the heading line cannot inject markup either. The `title`
            # guard matters because u'' is contained in every line.
            is_heading = bool(title) and not title_added and title in line
            line = prepare_string_for_xml(line)
            if is_heading:
                line = u'<h1 class="chapter">' + line + u'</h1>\n'
                title_added = True
            lines.append(u'<p>%s</p>' % line)
        if title and not title_added:
            lines.insert(0, u'<h1 class="chapter">' +
                    prepare_string_for_xml(title) + u'</h1>\n')
        return u'\n'.join(lines)

    def extract_content(self, output_dir):
        '''
        Write `index.html` and `metadata.opf` into `output_dir` and return
        the path of the OPF file.

        Sections 1..num_records are converted in order with
        `_chapter_markup()` and concatenated without a separator.
        '''
        titles = self.header_record.chapter_titles
        chapters = []

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            # The header may list fewer titles than records; such a chapter
            # is emitted without a heading instead of raising IndexError.
            title = titles[i - 1] if i <= len(titles) else u''
            chapters.append(
                self._chapter_markup(title, self.decompress_text(i)))
        txt = u''.join(chapters)

        self.log.info(u'Converting text to OEB...')
        # NOTE(review): the book title is interpolated as-is; HTML_TEMPLATE
        # is not visible here -- confirm whether it expects escaped text.
        html = HTML_TEMPLATE % (self.header_record.title, txt)
        with open(os.path.join(output_dir, u'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [(u'index.html', None)]
        spine = [u'index.html']
        opf_writer(output_dir, u'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, u'metadata.opf')