From 9c74cd945fb3548d97b673116aa2ecbbfa0943f6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 21 Feb 2010 10:48:24 -0500 Subject: [PATCH 1/5] Implement bug #4971: Support reading of PDF PDB files. --- src/calibre/ebooks/oeb/iterator.py | 2 +- src/calibre/ebooks/pdb/__init__.py | 4 ++- src/calibre/ebooks/pdb/pdf/__init__.py | 0 src/calibre/ebooks/pdb/pdf/reader.py | 38 ++++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/pdb/pdf/__init__.py create mode 100644 src/calibre/ebooks/pdb/pdf/reader.py diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 8959d62fac..d09c49ebeb 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -177,7 +177,7 @@ class EbookIterator(object): plumber.opts, plumber.input_fmt, self.log, {}, self.base) - if processed or plumber.input_fmt.lower() in ('pdf', 'rb') and \ + if processed or plumber.input_fmt.lower() in ('pdb', 'pdf', 'rb') and \ not hasattr(self.pathtoopf, 'manifest'): self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts, plumber.input_plugin) diff --git a/src/calibre/ebooks/pdb/__init__.py b/src/calibre/ebooks/pdb/__init__.py index 54f3826470..092c8a21bd 100644 --- a/src/calibre/ebooks/pdb/__init__.py +++ b/src/calibre/ebooks/pdb/__init__.py @@ -11,12 +11,14 @@ class PDBError(Exception): from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader +from calibre.ebooks.pdb.pdf.reader import Reader as pdf_reader FORMAT_READERS = { 'PNPdPPrs': ereader_reader, 'PNRdPPrs': ereader_reader, 'zTXTGPlm': ztxt_reader, 'TEXtREAd': palmdoc_reader, + '.pdfADBE': pdf_reader, } from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer @@ -34,8 +36,8 @@ IDENTITY_TO_NAME = { 'PNRdPPrs': 'eReader', 'zTXTGPlm': 'zTXT', 'TEXtREAd': 
'PalmDOC', - '.pdfADBE': 'Adobe Reader', + 'BVokBDIC': 'BDicty', 'DB99DBOS': 'DB (Database program)', 'vIMGView': 'FireViewer (ImageViewer)', diff --git a/src/calibre/ebooks/pdb/pdf/__init__.py b/src/calibre/ebooks/pdb/pdf/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py new file mode 100644 index 0000000000..913d06f634 --- /dev/null +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +''' +Read content from PDF pdb file. +''' + +__license__ = 'GPL v3' +__copyright__ = '2010, John Schember ' +__docformat__ = 'restructuredtext en' + +import cStringIO + +from calibre.ebooks.pdb.formatreader import FormatReader +from calibre.ptempfile import TemporaryFile + +class Reader(FormatReader): + + def __init__(self, header, stream, log, options): + self.header = header + self.stream = stream + self.log = log + self.options = options + setattr(self.options, 'new_pdf_engine', False) + setattr(self.options, 'no_images', False) + setattr(self.options, 'unwrap_factor', 0.5) + + def extract_content(self, output_dir): + self.log.info('Extracting PDF...') + + with TemporaryFile() as pdf_n: + pdf = open(pdf_n, 'rw+b') + for x in xrange(self.header.section_count()): + pdf.write(self.header.section_data(x)) + + from calibre.customize.ui import plugin_for_input_format + pdf.seek(0) + return plugin_for_input_format('pdf').convert(pdf, self.options, + 'pdf', self.log, []) From 0ef1d8f5f764979189bf66e69edc3dd892671771 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 21 Feb 2010 11:03:32 -0500 Subject: [PATCH 2/5] Remove debugging print. 
--- src/calibre/gui2/ui.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 756ac113dc..b741e18f89 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -991,7 +991,6 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.library_view.model().current_changed(current_idx, current_idx) def __add_filesystem_book(self, paths, allow_device=True): - print 222, paths if isinstance(paths, basestring): paths = [paths] books = [path for path in map(os.path.abspath, paths) if os.access(path, From 5fd1703e30e2fd1681f2be219884879e29d34034 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Feb 2010 11:36:59 -0700 Subject: [PATCH 3/5] ... --- src/calibre/manual/faq.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index bafc13f388..1d44c004fd 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -180,7 +180,7 @@ Why is my device not detected in linux? grep SYSFS_DEPRECATED /boot/config-`uname -r` -You should see something like ``CONFIG_SYSFS_DEPRECATED_V2 is not set``. +You should see something like ``CONFIG_SYSFS_DEPRECATED_V2 is not set``. If you don't, you have to either recompile your kernel with the correct setting, or upgrade your linux distro to a more modern version, where this will not be set. 
Library Management ------------------ From 6b04e572752326ccf840a8263dcfa07236f3f137 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Feb 2010 12:52:41 -0700 Subject: [PATCH 4/5] Fix #4973 (Improvement for chm conversion - remove br tag at top of page) --- src/calibre/ebooks/chm/input.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index ecb54dffdb..3b08854532 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -4,11 +4,11 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, shutil, uuid +import os, shutil, uuid, re from tempfile import mkdtemp from mimetypes import guess_type as guess_mimetype -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString from lxml import html from pychm.chm import CHMFile from pychm.chmlib import ( @@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered): return True return False +def check_all_prev_empty(tag): + if tag is None: + return True + if tag.__class__ == NavigableString and not check_empty(tag): + return False + return check_all_prev_empty(tag.previousSibling) + +def check_empty(s, rex = re.compile(r'\S')): + return rex.search(s) is None + + def option_parser(): parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') @@ -155,6 +166,12 @@ class CHMReader(CHMFile): # for some very odd reason each page's content appears to be in a table # too. and this table has sub-tables for random asides... grr. 
+ # remove br at top of page if present after nav bars removed + br = soup('br') + if br: + if check_all_prev_empty(br[0].previousSibling): + br[0].extract() + # some images seem to be broken in some chm's :/ for img in soup('img'): try: From eae6bf3608f7fc9b1eaae2fb0f6009f3313d9a4d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 21 Feb 2010 13:33:49 -0700 Subject: [PATCH 5/5] MOBI metadata: Do not try to extract embedded metadata from MOBI files larger than 4MB --- src/calibre/ebooks/mobi/reader.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index ae175f1493..ed61dbd719 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -10,6 +10,7 @@ import re import struct import textwrap import cStringIO +import sys try: from PIL import Image as PILImage @@ -806,13 +807,20 @@ def get_metadata(stream): if mh.exth.mi is not None: mi = mh.exth.mi else: - with TemporaryDirectory('_mobi_meta_reader') as tdir: - with CurrentDir(tdir): - mr = MobiReader(stream, log) - parse_cache = {} - mr.extract_content(tdir, parse_cache) - if mr.embedded_mi is not None: - mi = mr.embedded_mi + size = sys.maxint + if hasattr(stream, 'seek') and hasattr(stream, 'tell'): + pos = stream.tell() + stream.seek(0, 2) + size = stream.tell() + stream.seek(pos) + if size < 4*1024*1024: + with TemporaryDirectory('_mobi_meta_reader') as tdir: + with CurrentDir(tdir): + mr = MobiReader(stream, log) + parse_cache = {} + mr.extract_content(tdir, parse_cache) + if mr.embedded_mi is not None: + mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset data = mh.section_data(int(cover_index))