From aafc6d97649de2a72e303507953872205f8fbc5b Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 15 Apr 2009 19:57:42 -0400 Subject: [PATCH 1/7] Fix text output regex --- src/calibre/ebooks/txt/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py index 0f84c32804..ea613010ef 100644 --- a/src/calibre/ebooks/txt/writer.py +++ b/src/calibre/ebooks/txt/writer.py @@ -76,7 +76,7 @@ class TxtWriter(object): text = re.sub('(?imu)' % tag, '\n\n', text) for tag in ['hr', 'br']: - text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text) + text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text) # Remove any tags that do not need special processing. text = re.sub('<.*?>', '', text) From 575b021f48ea9cab351648999bc69737ea2aafa0 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 15 Apr 2009 20:11:00 -0400 Subject: [PATCH 2/7] pdftohtml preprocess rules work --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 6b58d2d18d..632a7a3291 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -73,7 +73,7 @@ class HTMLPreProcessor(object): (re.compile(r''), lambda match : '

'), # Un wrap lines - (re.compile(r'(?<=\w)\s*\s*\s*\s*(?=\w)'), lambda match: ' '), + (re.compile(r'(?<=\w)\s*\s*\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '), (re.compile(r'(?<=\w)\s*\s*(?=\w)', re.UNICODE), lambda match: ' '), # Clean up spaces (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '), From 7814dda6d8a531dd37fa7ce56c63aaa948a364a5 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 16 Apr 2009 19:01:25 -0400 Subject: [PATCH 3/7] Fix splitting of authors --- src/calibre/devices/cybookg3/driver.py | 5 ++--- src/calibre/devices/usbms/driver.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index c3a4fa94b0..5458fbbffb 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -7,7 +7,6 @@ Device driver for Bookeen's Cybook Gen 3 import os, shutil from itertools import cycle -from calibre.ebooks.metadata import authors_to_string from calibre.devices.errors import DeviceError, FreeSpaceError from calibre.devices.usbms.driver import USBMS import calibre.devices.cybookg3.t2b as t2b @@ -92,8 +91,8 @@ class CYBOOKG3(USBMS): break if newpath == path: - newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', ''))) - newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index bb7a104fa4..aa40f90c25 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -124,8 +124,8 @@ class USBMS(CLI, Device): break if newpath == path: - newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', ''))) - newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) From 4c6599fd45b2b3f188d6cc09bdec9b2c209ec5c3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 16 Apr 2009 19:10:38 -0400 Subject: [PATCH 4/7] PRS505/700: Put books in author/title dir structure and use USBMS style / tag paths. --- src/calibre/devices/cybookg3/driver.py | 29 +++++++++++----------- src/calibre/devices/prs505/driver.py | 33 ++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py index 5458fbbffb..1cdf9863b4 100644 --- a/src/calibre/devices/cybookg3/driver.py +++ b/src/calibre/devices/cybookg3/driver.py @@ -77,22 +77,21 @@ class CYBOOKG3(USBMS): newpath = path mdata = metadata.next() - if self.SUPPORTS_SUB_DIRS: - if 'tags' in mdata.keys(): - for tag in mdata['tags']: - if tag.startswith(_('News')): - newpath = os.path.join(newpath, 'news') - newpath = os.path.join(newpath, mdata.get('title', '')) - newpath = os.path.join(newpath, mdata.get('timestamp', '')) - elif tag.startswith('/'): - newpath = path - newpath += tag - newpath = os.path.normpath(newpath) - break + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path + newpath += tag + newpath = os.path.normpath(newpath) + break - if newpath == path: - newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) - newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) if not os.path.exists(newpath): os.makedirs(newpath) diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py index efc48a2dff..a704eb1ec3 100644 --- a/src/calibre/devices/prs505/driver.py +++ b/src/calibre/devices/prs505/driver.py @@ -119,19 +119,44 @@ class PRS505(CLI, Device): paths, ctimes = [], [] names = iter(names) + metadata = iter(metadata) for infile in files: close = False if not hasattr(infile, 'read'): infile, close = open(infile, 'rb'), True infile.seek(0) - name = names.next() - paths.append(os.path.join(path, name)) - if not os.path.exists(os.path.dirname(paths[-1])): - os.makedirs(os.path.dirname(paths[-1])) + + newpath = path + mdata = metadata.next() + + if 'tags' in mdata.keys(): + for tag in mdata['tags']: + if tag.startswith(_('News')): + newpath = os.path.join(newpath, 'news') + newpath = os.path.join(newpath, mdata.get('title', '')) + newpath = os.path.join(newpath, mdata.get('timestamp', '')) + elif tag.startswith('/'): + newpath = path + newpath += tag + newpath = os.path.normpath(newpath) + break + + if newpath == path: + newpath = os.path.join(newpath, mdata.get('authors', _('Unknown'))) + newpath = os.path.join(newpath, mdata.get('title', _('Unknown'))) + + if not os.path.exists(newpath): + os.makedirs(newpath) + + filepath = os.path.join(newpath, names.next()) + paths.append(filepath) + self.put_file(infile, paths[-1], replace_file=True) + if close: infile.close() ctimes.append(os.path.getctime(paths[-1])) + return zip(paths, sizes, ctimes, cycle([on_card])) @classmethod From a66fb31027465d3b79196abc9ac95811c5f1f82f Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 08:06:35 -0400 Subject: [PATCH 5/7] Clean up command line options display. Use opf2 instead of opf. --- src/calibre/ebooks/pdf/input.py | 2 +- src/calibre/ebooks/txt/input.py | 2 +- src/calibre/ebooks/txt/output.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 6733d3aadc..e8c3889e41 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.pdf.pdftohtml import pdftohtml -from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator class PDFInput(InputFormatPlugin): diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index aafc36989e..34fafc91fc 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -9,7 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.markdown import markdown -from calibre.ebooks.metadata.opf import OPFCreator +from calibre.ebooks.metadata.opf2 import OPFCreator class TXTInput(InputFormatPlugin): diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 423e668a56..dd87394507 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -18,14 +18,14 @@ class TXTOutput(OutputFormatPlugin): options = set([ OptionRecommendation(name='newline', recommended_value='system', - level=OptionRecommendation.LOW, long_switch='newline', + level=OptionRecommendation.LOW, short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(), help=_('Type of newline to use. Options are %s. Default is \'system\'. ' 'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. ' 'For Mac OS X use \'unix\'. \'system\' will default to the newline ' 'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))), OptionRecommendation(name='prepend_metadata', recommended_value='false', - level=OptionRecommendation.LOW, long_switch='prepend_metadata', + level=OptionRecommendation.LOW, choices=['true', 'false'], help=_('Write the title and author to the beginning of the file. ' 'Default is \'true\'. Use \'false\' to disable.')), From a2064499e815b37c0dbef55e2b3f251cb6a1366e Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 21:42:03 -0400 Subject: [PATCH 6/7] Fix bug 2112: Stop metadata reader from holding pdf files open after reading. --- src/calibre/ebooks/metadata/pdf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 4476eb0847..9946d831af 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -21,6 +21,7 @@ def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) + stream = StringIO.StringIO(stream.read()) if extract_cover and _imagemagick_loaded: try: @@ -70,6 +71,10 @@ def set_metadata(stream, mi): stream.seek(0) def get_cover(stream): + stream.seek(0) + if not isinstance(stream, StringIO.StringIO): + stream = StringIO.StringIO(stream.read()) + data = StringIO.StringIO() try: From 70e1336a90b4bdf7f7a388734d6a58710f1f8b62 Mon Sep 17 00:00:00 2001 From: John Schember Date: Fri, 17 Apr 2009 22:29:00 -0400 Subject: [PATCH 7/7] Use FileWrapper instead of StringIO for bug 2112 fix. --- src/calibre/__init__.py | 17 +++++++ src/calibre/ebooks/metadata/pdf.py | 82 +++++++++++++++--------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 807ce1def5..6299bb8782 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -246,6 +246,23 @@ class CurrentDir(object): os.chdir(self.cwd) +class FileWrapper(object): + ''' + Used primarily with pyPdf to ensure the stream is properly closed. + ''' + + def __init__(self, stream): + for x in ('read', 'seek', 'tell'): + setattr(self, x, getattr(stream, x)) + + def __exit__(self, *args): + for x in ('read', 'seek', 'tell'): + setattr(self, x, None) + + def __enter__(self): + return self + + def detect_ncpus(): """Detects the number of effective CPUs in the system""" try: diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 9946d831af..4dc98509e2 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -4,8 +4,9 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' -import sys, os, StringIO +import sys, os, cStringIO +from calibre import FileWrapper from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ptempfile import TemporaryDirectory from pyPdf import PdfFileReader, PdfFileWriter @@ -21,7 +22,6 @@ def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) - stream = StringIO.StringIO(stream.read()) if extract_cover and _imagemagick_loaded: try: @@ -33,18 +33,19 @@ def get_metadata(stream, extract_cover=True): traceback.print_exc() try: - info = PdfFileReader(stream).getDocumentInfo() - if info.title: - mi.title = info.title - if info.author: - src = info.author.split('&') - authors = [] - for au in src: - authors += au.split(',') - mi.authors = authors - mi.author = info.author - if info.subject: - mi.category = info.subject + with FileWrapper(stream) as stream: + info = PdfFileReader(stream).getDocumentInfo() + if info.title: + mi.title = info.title + if info.author: + src = info.author.split('&') + authors = [] + for au in src: + authors += au.split(',') + mi.authors = authors + mi.author = info.author + if info.subject: + mi.category = info.subject except Exception, err: msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err)) print >>sys.stderr, msg.encode('utf8') @@ -52,17 +53,17 @@ def get_metadata(stream, extract_cover=True): def set_metadata(stream, mi): stream.seek(0) - # Use a StringIO object for the pdf because we will want to over + # Use a cStringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. - raw = StringIO.StringIO(stream.read()) + raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) title = mi.title if mi.title else orig_pdf.documentInfo.title author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author out_pdf = PdfFileWriter(title=title, author=author) for page in orig_pdf.pages: out_pdf.addPage(page) - out_str = StringIO.StringIO() + out_str = cStringIO.StringIO() out_pdf.write(out_str) stream.seek(0) stream.truncate() @@ -72,33 +73,32 @@ def set_metadata(stream, mi): def get_cover(stream): stream.seek(0) - if not isinstance(stream, StringIO.StringIO): - stream = StringIO.StringIO(stream.read()) - data = StringIO.StringIO() + data = cStringIO.StringIO() try: - pdf = PdfFileReader(stream) - output = PdfFileWriter() - - if len(pdf.pages) >= 1: - output.addPage(pdf.getPage(0)) - - with TemporaryDirectory('_pdfmeta') as tdir: - cover_path = os.path.join(tdir, 'cover.pdf') - - outputStream = file(cover_path, "wb") - output.write(outputStream) - outputStream.close() - - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - - img = Image.open('%s.jpg' % cover_path) - - img.save(data, 'JPEG') + with FileWrapper(stream) as stream: + pdf = PdfFileReader(stream) + output = PdfFileWriter() + + if len(pdf.pages) >= 1: + output.addPage(pdf.getPage(0)) + + with TemporaryDirectory('_pdfmeta') as tdir: + cover_path = os.path.join(tdir, 'cover.pdf') + + outputStream = file(cover_path, "wb") + output.write(outputStream) + outputStream.close() + + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + + img = Image.open('%s.jpg' % cover_path) + + img.save(data, 'JPEG') except: import traceback traceback.print_exc()