From aafc6d97649de2a72e303507953872205f8fbc5b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Wed, 15 Apr 2009 19:57:42 -0400
Subject: [PATCH 1/7] Fix text output regex

---
 src/calibre/ebooks/txt/writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/txt/writer.py b/src/calibre/ebooks/txt/writer.py
index 0f84c32804..ea613010ef 100644
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -76,7 +76,7 @@ class TxtWriter(object):
                 text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
             
             for tag in ['hr', 'br']:
-                text = re.sub('(?imu)<[ ]*%s[ ]*/*?>' % tag, '\n\n', text)
+                text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
             
             # Remove any tags that do not need special processing.
             text = re.sub('<.*?>', '', text)

From 575b021f48ea9cab351648999bc69737ea2aafa0 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Wed, 15 Apr 2009 20:11:00 -0400
Subject: [PATCH 2/7] pdftohtml preprocess rules work

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 6b58d2d18d..632a7a3291 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -73,7 +73,7 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   
                   # Un wrap lines
-                  (re.compile(r'(?<=\w)\s*</i>\s*<p.*?>\s*<i>\s*(?=\w)'), lambda match: ' '),
+                  (re.compile(r'(?<=\w)\s*</(i|b|u)>\s*<p.*?>\s*<(i|b|u)>\s*(?=\w)'), lambda match: ' '),
                   (re.compile(r'(?<=\w)\s*<p.*?>\s*(?=\w)', re.UNICODE), lambda match: ' '),
                   # Clean up spaces
                   (re.compile(u'(?<=\.|,|:|;|\?|!|”|"|\')[\s^ ]*(?=<)'), lambda match: ' '),

From 7814dda6d8a531dd37fa7ce56c63aaa948a364a5 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 16 Apr 2009 19:01:25 -0400
Subject: [PATCH 3/7] Fix splitting of authors

---
 src/calibre/devices/cybookg3/driver.py | 5 ++---
 src/calibre/devices/usbms/driver.py    | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py
index c3a4fa94b0..5458fbbffb 100644
--- a/src/calibre/devices/cybookg3/driver.py
+++ b/src/calibre/devices/cybookg3/driver.py
@@ -7,7 +7,6 @@ Device driver for Bookeen's Cybook Gen 3
 import os, shutil
 from itertools import cycle
 
-from calibre.ebooks.metadata import authors_to_string
 from calibre.devices.errors import DeviceError, FreeSpaceError
 from calibre.devices.usbms.driver import USBMS
 import calibre.devices.cybookg3.t2b as t2b
@@ -92,8 +91,8 @@ class CYBOOKG3(USBMS):
                             break
 
                 if newpath == path:
-                    newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', '')))
-                    newpath = os.path.join(newpath, mdata.get('title', ''))
+                    newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
+                    newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
 
             if not os.path.exists(newpath):
                 os.makedirs(newpath)
diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py
index bb7a104fa4..aa40f90c25 100644
--- a/src/calibre/devices/usbms/driver.py
+++ b/src/calibre/devices/usbms/driver.py
@@ -124,8 +124,8 @@ class USBMS(CLI, Device):
                             break
                             
                 if newpath == path:
-                    newpath = os.path.join(newpath, authors_to_string(mdata.get('authors', '')))
-                    newpath = os.path.join(newpath, mdata.get('title', ''))
+                    newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
+                    newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
 
             if not os.path.exists(newpath):
                 os.makedirs(newpath)

From 4c6599fd45b2b3f188d6cc09bdec9b2c209ec5c3 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 16 Apr 2009 19:10:38 -0400
Subject: [PATCH 4/7] PRS505/700: Put books in author/title dir structure and
 use USBMS style / tag paths.

---
 src/calibre/devices/cybookg3/driver.py | 29 +++++++++++-----------
 src/calibre/devices/prs505/driver.py   | 33 ++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/src/calibre/devices/cybookg3/driver.py b/src/calibre/devices/cybookg3/driver.py
index 5458fbbffb..1cdf9863b4 100644
--- a/src/calibre/devices/cybookg3/driver.py
+++ b/src/calibre/devices/cybookg3/driver.py
@@ -77,22 +77,21 @@ class CYBOOKG3(USBMS):
             newpath = path
             mdata = metadata.next()
 
-            if self.SUPPORTS_SUB_DIRS:
-                if 'tags' in mdata.keys():
-                    for tag in mdata['tags']:
-                        if tag.startswith(_('News')):
-                            newpath = os.path.join(newpath, 'news')
-                            newpath = os.path.join(newpath, mdata.get('title', ''))
-                            newpath = os.path.join(newpath, mdata.get('timestamp', ''))
-                        elif tag.startswith('/'):
-                            newpath = path
-                            newpath += tag
-                            newpath = os.path.normpath(newpath)
-                            break
+            if 'tags' in mdata.keys():
+                for tag in mdata['tags']:
+                    if tag.startswith(_('News')):
+                        newpath = os.path.join(newpath, 'news')
+                        newpath = os.path.join(newpath, mdata.get('title', ''))
+                        newpath = os.path.join(newpath, mdata.get('timestamp', ''))
+                    elif tag.startswith('/'):
+                        newpath = path
+                        newpath += tag
+                        newpath = os.path.normpath(newpath)
+                        break
 
-                if newpath == path:
-                    newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
-                    newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
+            if newpath == path:
+                newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
+                newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
 
             if not os.path.exists(newpath):
                 os.makedirs(newpath)
diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index efc48a2dff..a704eb1ec3 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -119,19 +119,44 @@ class PRS505(CLI, Device):
         paths, ctimes = [], []
 
         names = iter(names)
+        metadata = iter(metadata)
         for infile in files:
             close = False
             if not hasattr(infile, 'read'):
                 infile, close = open(infile, 'rb'), True
             infile.seek(0)
-            name = names.next()
-            paths.append(os.path.join(path, name))
-            if not os.path.exists(os.path.dirname(paths[-1])):
-                os.makedirs(os.path.dirname(paths[-1]))
+            
+            newpath = path
+            mdata = metadata.next()
+
+            if 'tags' in mdata.keys():
+                for tag in mdata['tags']:
+                    if tag.startswith(_('News')):
+                        newpath = os.path.join(newpath, 'news')
+                        newpath = os.path.join(newpath, mdata.get('title', ''))
+                        newpath = os.path.join(newpath, mdata.get('timestamp', ''))
+                    elif tag.startswith('/'):
+                        newpath = path
+                        newpath += tag
+                        newpath = os.path.normpath(newpath)
+                        break
+
+            if newpath == path:
+                newpath = os.path.join(newpath, mdata.get('authors', _('Unknown')))
+                newpath = os.path.join(newpath, mdata.get('title', _('Unknown')))
+
+            if not os.path.exists(newpath):
+                os.makedirs(newpath)
+
+            filepath = os.path.join(newpath, names.next())
+            paths.append(filepath)
+
             self.put_file(infile, paths[-1], replace_file=True)
+            
             if close:
                 infile.close()
             ctimes.append(os.path.getctime(paths[-1]))
+            
         return zip(paths, sizes, ctimes, cycle([on_card]))
 
     @classmethod

From a66fb31027465d3b79196abc9ac95811c5f1f82f Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 17 Apr 2009 08:06:35 -0400
Subject: [PATCH 5/7] Clean up command line options display. Use opf2 instead
 of opf.

---
 src/calibre/ebooks/pdf/input.py  | 2 +-
 src/calibre/ebooks/txt/input.py  | 2 +-
 src/calibre/ebooks/txt/output.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 6733d3aadc..e8c3889e41 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -9,7 +9,7 @@ import os
 
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdf.pdftohtml import pdftohtml
-from calibre.ebooks.metadata.opf import OPFCreator
+from calibre.ebooks.metadata.opf2 import OPFCreator
 
 class PDFInput(InputFormatPlugin):
     
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index aafc36989e..34fafc91fc 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -9,7 +9,7 @@ import os
 
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.markdown import markdown
-from calibre.ebooks.metadata.opf import OPFCreator
+from calibre.ebooks.metadata.opf2 import OPFCreator
 
 class TXTInput(InputFormatPlugin):
     
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index 423e668a56..dd87394507 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -18,14 +18,14 @@ class TXTOutput(OutputFormatPlugin):
 
     options = set([
                     OptionRecommendation(name='newline', recommended_value='system',
-                        level=OptionRecommendation.LOW, long_switch='newline',
+                        level=OptionRecommendation.LOW,
                         short_switch='n', choices=TxtNewlines.NEWLINE_TYPES.keys(),
                         help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                             'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                             'For Mac OS X use \'unix\'. \'system\' will default to the newline '
                             'type used by this OS.' % sorted(TxtNewlines.NEWLINE_TYPES.keys()))),
                     OptionRecommendation(name='prepend_metadata', recommended_value='false',
-                        level=OptionRecommendation.LOW, long_switch='prepend_metadata',
+                        level=OptionRecommendation.LOW,
                         choices=['true', 'false'],
                         help=_('Write the title and author to the beginning of the file. '
                             'Default is \'true\'. Use \'false\' to disable.')),

From a2064499e815b37c0dbef55e2b3f251cb6a1366e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 17 Apr 2009 21:42:03 -0400
Subject: [PATCH 6/7] Fix bug 2112: Stop metadata reader from holding pdf files
 open after reading.

---
 src/calibre/ebooks/metadata/pdf.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 4476eb0847..9946d831af 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -21,6 +21,7 @@ def get_metadata(stream, extract_cover=True):
     """ Return metadata as a L{MetaInfo} object """
     mi = MetaInformation(_('Unknown'), [_('Unknown')])
     stream.seek(0)
+    stream = StringIO.StringIO(stream.read())
 
     if extract_cover and _imagemagick_loaded:
         try:
@@ -70,6 +71,10 @@ def set_metadata(stream, mi):
     stream.seek(0)
 
 def get_cover(stream):
+    stream.seek(0)
+    if not isinstance(stream, StringIO.StringIO):
+        stream = StringIO.StringIO(stream.read())
+    
     data = StringIO.StringIO()
 
     try:

From 70e1336a90b4bdf7f7a388734d6a58710f1f8b62 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 17 Apr 2009 22:29:00 -0400
Subject: [PATCH 7/7] Use FileWrapper instead of StringIO for bug 2112 fix.

---
 src/calibre/__init__.py            | 17 +++++++
 src/calibre/ebooks/metadata/pdf.py | 82 +++++++++++++++---------------
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 807ce1def5..6299bb8782 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -246,6 +246,23 @@ class CurrentDir(object):
         os.chdir(self.cwd)
 
 
+class FileWrapper(object):
+    '''Context manager that borrows a stream's read, seek and tell methods.
+    Used primarily with pyPdf. Leaving the with block sets those three names
+    to None, dropping the wrapper's references; close() is never called here.'''
+
+    def __init__(self, stream):
+        for x in ('read', 'seek', 'tell'):
+            setattr(self, x, getattr(stream, x))  # AttributeError if stream lacks one
+
+    def __exit__(self, *args):  # implicit None return: exceptions propagate
+        for x in ('read', 'seek', 'tell'):
+            setattr(self, x, None)  # wrapper can no longer perform I/O
+
+    def __enter__(self):
+        return self
+
+
 def detect_ncpus():
     """Detects the number of effective CPUs in the system"""
     try:
diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 9946d831af..4dc98509e2 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -4,8 +4,9 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''Read meta information from PDF files'''
 
-import sys, os, StringIO
+import sys, os, cStringIO
 
+from calibre import FileWrapper
 from calibre.ebooks.metadata import MetaInformation, authors_to_string
 from calibre.ptempfile import TemporaryDirectory
 from pyPdf import PdfFileReader, PdfFileWriter
@@ -21,7 +22,6 @@ def get_metadata(stream, extract_cover=True):
     """ Return metadata as a L{MetaInfo} object """
     mi = MetaInformation(_('Unknown'), [_('Unknown')])
     stream.seek(0)
-    stream = StringIO.StringIO(stream.read())
 
     if extract_cover and _imagemagick_loaded:
         try:
@@ -33,18 +33,19 @@ def get_metadata(stream, extract_cover=True):
             traceback.print_exc()
 
     try:
-        info = PdfFileReader(stream).getDocumentInfo()
-        if info.title:
-            mi.title = info.title
-        if info.author:
-            src = info.author.split('&')
-            authors = []
-            for au in src:
-                authors += au.split(',')
-            mi.authors = authors
-            mi.author = info.author
-        if info.subject:
-            mi.category = info.subject
+        with FileWrapper(stream) as stream:
+            info = PdfFileReader(stream).getDocumentInfo()
+            if info.title:
+                mi.title = info.title
+            if info.author:
+                src = info.author.split('&')
+                authors = []
+                for au in src:
+                    authors += au.split(',')
+                mi.authors = authors
+                mi.author = info.author
+            if info.subject:
+                mi.category = info.subject
     except Exception, err:
         msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
         print >>sys.stderr, msg.encode('utf8')
@@ -52,17 +53,17 @@ def get_metadata(stream, extract_cover=True):
 
 def set_metadata(stream, mi):
     stream.seek(0)
-    # Use a StringIO object for the pdf because we will want to over
+    # Use a cStringIO object for the pdf because we will want to over
     # write it later and if we are working on the stream directly it
     # could cause some issues.
-    raw = StringIO.StringIO(stream.read())
+    raw = cStringIO.StringIO(stream.read())
     orig_pdf = PdfFileReader(raw)
     title = mi.title if mi.title else orig_pdf.documentInfo.title
     author = authors_to_string(mi.authors) if mi.authors else orig_pdf.documentInfo.author
     out_pdf = PdfFileWriter(title=title, author=author)
     for page in orig_pdf.pages:
         out_pdf.addPage(page)
-    out_str = StringIO.StringIO()
+    out_str = cStringIO.StringIO()
     out_pdf.write(out_str)
     stream.seek(0)
     stream.truncate()
@@ -72,33 +73,32 @@ def set_metadata(stream, mi):
 
 def get_cover(stream):
     stream.seek(0)
-    if not isinstance(stream, StringIO.StringIO):
-        stream = StringIO.StringIO(stream.read())
     
-    data = StringIO.StringIO()
+    data = cStringIO.StringIO()
 
     try:
-        pdf = PdfFileReader(stream)
-        output = PdfFileWriter()
-
-        if len(pdf.pages) >= 1:
-            output.addPage(pdf.getPage(0))
-
-        with TemporaryDirectory('_pdfmeta') as tdir:
-            cover_path = os.path.join(tdir, 'cover.pdf')
-
-            outputStream = file(cover_path, "wb")
-            output.write(outputStream)
-            outputStream.close()
-
-            wand = NewMagickWand()
-            MagickReadImage(wand, cover_path)
-            MagickSetImageFormat(wand, 'JPEG')
-            MagickWriteImage(wand, '%s.jpg' % cover_path)
-
-            img = Image.open('%s.jpg' % cover_path)
-
-            img.save(data, 'JPEG')
+        with FileWrapper(stream) as stream:
+            pdf = PdfFileReader(stream)
+            output = PdfFileWriter()
+    
+            if len(pdf.pages) >= 1:
+                output.addPage(pdf.getPage(0))
+    
+            with TemporaryDirectory('_pdfmeta') as tdir:
+                cover_path = os.path.join(tdir, 'cover.pdf')
+    
+                outputStream = file(cover_path, "wb")
+                output.write(outputStream)
+                outputStream.close()
+    
+                wand = NewMagickWand()
+                MagickReadImage(wand, cover_path)
+                MagickSetImageFormat(wand, 'JPEG')
+                MagickWriteImage(wand, '%s.jpg' % cover_path)
+    
+                img = Image.open('%s.jpg' % cover_path)
+    
+                img.save(data, 'JPEG')
     except:
         import traceback
         traceback.print_exc()