Fix #816616 (Too many open files)

2025-07-09 03:04:10 -04:00 · 2011-09-05 10:06:30 -06:00 · 2011-09-05 10:06:30 -06:00 · a7254ffc1d
commit a7254ffc1d
parent 9e0d5aef2f 123991aea5
2 changed files with 76 additions and 22 deletions
--- a/src/calibre/ebooks/pdf/writer.py
+++ b/src/calibre/ebooks/pdf/writer.py
@ -198,7 +198,8 @@ class PDFWriter(QObject): # {{{
        try:
            outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author)
            for item in self.combine_queue:
-                inputPDF = PdfFileReader(open(item, 'rb'))
+                with open(item, 'rb') as item_stream:
+                    inputPDF = PdfFileReader(item_stream)
                    for page in inputPDF.pages:
                        outPDF.addPage(page)
            outPDF.write(self.out_stream)
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@ -22,6 +22,7 @@ class MarkdownMLizer(OEB2HTML):
    def extract_content(self, oeb_book, opts):
        self.log.info('Converting XHTML to Markdown formatted TXT...')
        self.opts = opts
+        self.in_code = False
        self.in_pre = False
        self.list = []
        self.blockquotes = 0
@ -55,10 +56,27 @@ class MarkdownMLizer(OEB2HTML):
    def tidy_up(self, text):
        # Remove blank space form beginning of paragraph.
        text = re.sub('(?msu)^[ ]{1,3}', '', text)
+        # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
+        text = re.sub('(?msu)^[ ]', '    ', text)
+        
+        # Remove tabs that aren't at the beinning of a line
+        new_text = []
+        for l in text.splitlines():
+            start = re.match('\t+', l)
+            if start:
+                start = start.group()
+            else:
+                start = ''
+            l = re.sub('\t', '', l)
+            new_text.append(start + l)
+        text = '\n'.join(new_text)
+        
        # Remove spaces from blank lines.
        text = re.sub('(?msu)^[ ]+$', '', text)
+        
        # Reduce blank lines
        text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
+        
        # Remove blank lines at beginning and end of document.
        text = re.sub('^\s*', '', text)
        text = re.sub('\s*$', '\n\n', text)
@ -81,6 +99,12 @@ class MarkdownMLizer(OEB2HTML):
        txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt)
        return txt
    
+    def prepare_string_for_pre(self, txt):
+        new_text = []
+        for l in txt.splitlines():
+            new_text.append('    ' + l)
+        return '\n'.join(new_text)
+
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
@ -97,7 +121,7 @@ class MarkdownMLizer(OEB2HTML):
            return ['']

        # Setup our variables.
-        text = ['']
+        text = []
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
@ -143,46 +167,74 @@ class MarkdownMLizer(OEB2HTML):
            self.blockquotes += 1
            tags.append('>')
            text.append('> ' * self.blockquotes)
-        elif tag in ('code', 'pre'):
+        elif tag == 'code':
+            if not self.in_pre and not self.in_code:
+                text.append('`')
+                tags.append('`')
+                self.in_code = True
+        elif tag == 'pre':
+            if not self.in_pre:
+                text.append('\n')
+                tags.append('pre')
                self.in_pre = True
-            text.append('    ')
        elif tag == 'hr':
            text.append('\n* * *')
            tags.append('\n')
        elif tag == 'a':
            # Only write links with absolute (external) urls.
-            if attribs.has_key('href') and '://' in attribs['href']:
+            if self.opts.keep_links and attribs.has_key('href') and '://' in attribs['href']:
                title = ''
                if attribs.has_key('title'):
-                    title = ' "' + attribs['title'] + '" '
+                    title = ' "' + attribs['title'] + '"'
+                    remove_space = self.remove_space_after_newline
+                    title = self.remove_newlines(title)
+                    self.remove_space_after_newline = remove_space
                text.append('[')
                tags.append('](' + attribs['href'] + title + ')')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!'
                if attribs.has_key('alt'):
-                    txt += '[' + attribs['alt'] + ']'
+                    remove_space = self.remove_space_after_newline
+                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
+                    self.remove_space_after_newline = remove_space
                txt += '(' + attribs['src'] + ')'
                text.append(txt)
        elif tag in ('ol', 'ul'):
+            tags.append(tag)
+            # Add the list to our lists of lists so we can track
+            # nested lists.
            self.list.append({'name': tag, 'num': 0})
        elif tag == 'li':
+            # Get the last list from our list of lists
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
+            # Add a new line to start the item
            text.append('\n')
+            # Add indent if we have nested lists.
+            list_count = len(self.list)
+            # We only care about indenting nested lists.
+            if (list_count - 1) > 0:
+                text.append('\t' * (list_count - 1))
+            # Add blockquote if we have a blockquote in a list item.
            text.append(bq)
+            # Write the proper sign for ordered and unorded lists.
            if li['name'] == 'ul':
                text.append('+ ')
            elif li['name'] == 'ol':
-                text.append(unicode(len(self.list)) + '. ')
-            tags.append('')
+                li['num'] += 1
+                text.append(unicode(li['num']) + '. ')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
-            if not self.in_pre:
+            if self.in_pre:
+                txt = self.prepare_string_for_pre(txt)
+            elif self.in_code:
+                txt = self.remove_newlines(txt)
+            else:
                txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
            text.append(txt)

@ -193,26 +245,23 @@ class MarkdownMLizer(OEB2HTML):
        # Close all open tags.
        tags.reverse()
        for t in tags:
-            if t in ('pre', 'ul', 'ol', 'li', '>', 'block'):
+            if t in ('pre', 'ul', 'ol', '>'):
                if t == 'pre':
                    self.in_pre = False
+                    text.append('\n')
                elif t == '>':
                    self.blockquotes -= 1
-                elif t == 'block':
-                    if self.style_bold:
-                        text.append('**')
-                    if self.style_italic:
-                        text.append('*')
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
-                    if not self.list:
                    text.append('\n')
            else:
                if t == '**':
                    self.style_bold = False
                elif t == '*':
                    self.style_italic = False
+                elif t == '`':
+                    self.in_code = False
                text.append('%s' % t)

        # Soft scene breaks.
@ -224,7 +273,11 @@ class MarkdownMLizer(OEB2HTML):
        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
-            if not self.in_pre:
+            if self.in_pre:
+                tail = self.prepare_string_for_pre(tail)
+            elif self.in_code:
+                tail = self.remove_newlines(tail)
+            else:
                tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
            text.append(tail)