From adcf2a0cb66287623572df6a3109a4e6f7cb39a4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 12:56:01 -0400 Subject: [PATCH 1/7] Markdown Output: Fix issues with pre tags. --- src/calibre/ebooks/txt/markdownml.py | 53 ++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 087877e78f..8a111670c5 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -55,10 +55,15 @@ class MarkdownMLizer(OEB2HTML): def tidy_up(self, text): # Remove blank space form beginning of paragraph. text = re.sub('(?msu)^[ ]{1,3}', '', text) + # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre. + text = re.sub('(?msu)^[ ]', ' ', text) + # Remove spaces from blank lines. text = re.sub('(?msu)^[ ]+$', '', text) + # Reduce blank lines text = re.sub('(?msu)\n{7,}', '\n' * 6, text) + # Remove blank lines at beginning and end of document. text = re.sub('^\s*', '', text) text = re.sub('\s*$', '\n\n', text) @@ -80,6 +85,12 @@ class MarkdownMLizer(OEB2HTML): def prepare_string_for_markdown(self, txt): txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt) return txt + + def prepare_string_for_pre(self, txt): + new_text = [] + for l in txt.splitlines(): + new_text.append(' ' + l) + return '\n'.join(new_text) def dump_text(self, elem, stylizer): ''' @@ -97,7 +108,7 @@ class MarkdownMLizer(OEB2HTML): return [''] # Setup our variables. 
- text = [''] + text = [] style = stylizer.style(elem) tags = [] tag = barename(elem.tag) @@ -143,29 +154,41 @@ class MarkdownMLizer(OEB2HTML): self.blockquotes += 1 tags.append('>') text.append('> ' * self.blockquotes) - elif tag in ('code', 'pre'): - self.in_pre = True - text.append(' ') + elif tag == 'code': + if not self.in_pre: + text.append('`') + tags.append('`') + elif tag == 'pre': + if not self.in_pre: + text.append('\n') + tags.append('pre') + self.in_pre = True elif tag == 'hr': text.append('\n* * *') tags.append('\n') elif tag == 'a': # Only write links with absolute (external) urls. - if attribs.has_key('href') and '://' in attribs['href']: + if self.opts.keep_links and attribs.has_key('href') and '://' in attribs['href']: title = '' if attribs.has_key('title'): - title = ' "' + attribs['title'] + '" ' + title = ' "' + attribs['title'] + '"' + remove_space = self.remove_space_after_newline + title = self.remove_newlines(title) + self.remove_space_after_newline = remove_space text.append('[') tags.append('](' + attribs['href'] + title + ')') elif tag == 'img': if self.opts.keep_image_references: txt = '!' if attribs.has_key('alt'): - txt += '[' + attribs['alt'] + ']' + remove_space = self.remove_space_after_newline + txt += '[' + self.remove_newlines(attribs['alt']) + ']' + self.remove_space_after_newline = remove_space txt += '(' + attribs['src'] + ')' text.append(txt) elif tag in ('ol', 'ul'): self.list.append({'name': tag, 'num': 0}) + tags.append(tag) elif tag == 'li': if self.list: li = self.list[-1] @@ -182,7 +205,9 @@ class MarkdownMLizer(OEB2HTML): # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text - if not self.in_pre: + if self.in_pre: + txt = self.prepare_string_for_pre(txt) + else: txt = self.prepare_string_for_markdown(self.remove_newlines(txt)) text.append(txt) @@ -193,16 +218,12 @@ class MarkdownMLizer(OEB2HTML): # Close all open tags. 
tags.reverse() for t in tags: - if t in ('pre', 'ul', 'ol', 'li', '>', 'block'): + if t in ('pre', 'ul', 'ol', 'li', '>'): if t == 'pre': self.in_pre = False + text.append('\n') elif t == '>': self.blockquotes -= 1 - elif t == 'block': - if self.style_bold: - text.append('**') - if self.style_italic: - text.append('*') elif t in ('ul', 'ol'): if self.list: self.list.pop() @@ -224,7 +245,9 @@ class MarkdownMLizer(OEB2HTML): # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail - if not self.in_pre: + if self.in_pre: + tail = self.prepare_string_for_pre(tail) + else: tail = self.prepare_string_for_markdown(self.remove_newlines(tail)) text.append(tail) From 8a689cf3b6172157351213e86da0d06ed06b450e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 13:03:52 -0400 Subject: [PATCH 2/7] Markdown Output: Don't escape special characters in code blocks. --- src/calibre/ebooks/txt/markdownml.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 8a111670c5..7765736024 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -22,6 +22,7 @@ class MarkdownMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Markdown formatted TXT...') self.opts = opts + self.in_code = False self.in_pre = False self.list = [] self.blockquotes = 0 @@ -158,6 +159,7 @@ class MarkdownMLizer(OEB2HTML): if not self.in_pre: text.append('`') tags.append('`') + self.in_code = True elif tag == 'pre': if not self.in_pre: text.append('\n') @@ -207,6 +209,8 @@ class MarkdownMLizer(OEB2HTML): txt = elem.text if self.in_pre: txt = self.prepare_string_for_pre(txt) + elif self.in_code: + txt = self.remove_newlines(txt) else: txt = self.prepare_string_for_markdown(self.remove_newlines(txt)) text.append(txt) @@ -234,6 +238,8 @@ class MarkdownMLizer(OEB2HTML): self.style_bold = False elif 
t == '*': self.style_italic = False + elif t == '`': + self.in_code = False text.append('%s' % t) # Soft scene breaks. @@ -247,6 +253,8 @@ class MarkdownMLizer(OEB2HTML): tail = elem.tail if self.in_pre: tail = self.prepare_string_for_pre(tail) + elif self.in_code: + tail = self.remove_newlines(tail) else: tail = self.prepare_string_for_markdown(self.remove_newlines(tail)) text.append(tail) From 22e1a293cc19fe127156f28657727f1b5d4a83ce Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 13:16:16 -0400 Subject: [PATCH 3/7] Markdown Output: More code block handling fixes. --- src/calibre/ebooks/txt/markdownml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 7765736024..1f974fda60 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -156,7 +156,7 @@ class MarkdownMLizer(OEB2HTML): tags.append('>') text.append('> ' * self.blockquotes) elif tag == 'code': - if not self.in_pre: + if not self.in_pre and not self.in_code: text.append('`') tags.append('`') self.in_code = True From ea3837a3e89d6fe8dd617d0acd30535a90c99810 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 16:31:41 -0400 Subject: [PATCH 4/7] Markdown Output: List fixes. --- src/calibre/ebooks/txt/markdownml.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 1f974fda60..c87836bf7d 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -77,7 +77,7 @@ class MarkdownMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. 
text = re.sub(r'[ ]{2,}', ' ', text) - text = re.sub(r'\t+', '', text) + #text = re.sub(r'\t+', '', text) if self.remove_space_after_newline == True: text = re.sub(r'^ +', '', text) self.remove_space_after_newline = False @@ -189,20 +189,30 @@ class MarkdownMLizer(OEB2HTML): txt += '(' + attribs['src'] + ')' text.append(txt) elif tag in ('ol', 'ul'): - self.list.append({'name': tag, 'num': 0}) tags.append(tag) + # Add the list to our lists of lists so we can track + # nested lists. + self.list.append({'name': tag, 'num': 0}) elif tag == 'li': + # Get the last list from our list of lists if self.list: li = self.list[-1] else: li = {'name': 'ul', 'num': 0} + # Add a new line to start the item text.append('\n') + # Add indent if we have nested lists. + list_count = len(self.list) + # We only care about indenting nested lists. + if (list_count - 1) > 0: + text.append('\t' * (list_count - 1)) + # Add blockquote if we have a blockquote in a list item. text.append(bq) + # Write the proper sign for ordered and unordered lists. if li['name'] == 'ul': text.append('+ ') elif li['name'] == 'ol': text.append(unicode(len(self.list)) + '. ') - tags.append('') # Process tags that contain text. if hasattr(elem, 'text') and elem.text: @@ -222,7 +232,7 @@ class MarkdownMLizer(OEB2HTML): # Close all open tags. tags.reverse() for t in tags: - if t in ('pre', 'ul', 'ol', 'li', '>'): + if t in ('pre', 'ul', 'ol', '>'): if t == 'pre': self.in_pre = False text.append('\n') From 4a6a013bef48a450344e37cdd5733d1c9f2f9c0f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 17:07:45 -0400 Subject: [PATCH 5/7] Markdown Output: Nested list fixes.
--- src/calibre/ebooks/txt/markdownml.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index c87836bf7d..30e2d1d7be 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -59,6 +59,19 @@ class MarkdownMLizer(OEB2HTML): # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre. text = re.sub('(?msu)^[ ]', ' ', text) + # Remove tabs that aren't at the beginning of a line + new_text = [] + for l in text.splitlines(): + start = re.match('\t+', l) + if start: + start = start.group() + else: + start = '' + l = re.sub('\t', '', l) + new_text.append(start + l) + text = '\n'.join(new_text) + print(text) + # Remove spaces from blank lines. text = re.sub('(?msu)^[ ]+$', '', text) @@ -77,7 +90,7 @@ class MarkdownMLizer(OEB2HTML): text = text.replace('\r', ' ') # Condense redundant spaces created by replacing newlines with spaces. text = re.sub(r'[ ]{2,}', ' ', text) - #text = re.sub(r'\t+', '', text) + text = re.sub(r'\t+', '', text) if self.remove_space_after_newline == True: text = re.sub(r'^ +', '', text) self.remove_space_after_newline = False @@ -212,7 +225,8 @@ class MarkdownMLizer(OEB2HTML): if li['name'] == 'ul': text.append('+ ') elif li['name'] == 'ol': - text.append(unicode(len(self.list)) + '. ') + li['num'] += 1 + text.append(unicode(li['num']) + '. ') # Process tags that contain text. if hasattr(elem, 'text') and elem.text: @@ -241,8 +255,7 @@ class MarkdownMLizer(OEB2HTML): elif t in ('ul', 'ol'): if self.list: self.list.pop() - if not self.list: - text.append('\n') + text.append('\n') else: if t == '**': self.style_bold = False From cf3b7f85cec0371f1d8c1995ff9e780c7c81d63e Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 4 Sep 2011 18:18:26 -0400 Subject: [PATCH 6/7] Markdown Output: Remove left over print statement from debugging.
--- src/calibre/ebooks/txt/markdownml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 30e2d1d7be..878633add3 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -70,7 +70,6 @@ class MarkdownMLizer(OEB2HTML): l = re.sub('\t', '', l) new_text.append(start + l) text = '\n'.join(new_text) - print(text) # Remove spaces from blank lines. text = re.sub('(?msu)^[ ]+$', '', text) From 123991aea51113193ce6af1261831bebeeaa019b Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 5 Sep 2011 07:36:02 -0400 Subject: [PATCH 7/7] Fix for issue #816616: PDF Output Too many open files. --- src/calibre/ebooks/pdf/writer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index dc7f2edba9..ebe6533419 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -198,9 +198,10 @@ class PDFWriter(QObject): # {{{ try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: - inputPDF = PdfFileReader(open(item, 'rb')) - for page in inputPDF.pages: - outPDF.addPage(page) + with open(item, 'rb') as item_stream: + inputPDF = PdfFileReader(item_stream) + for page in inputPDF.pages: + outPDF.addPage(page) outPDF.write(self.out_stream) finally: self._delete_tmpdir()