diff --git a/src/calibre/ebooks/pdf/writer.py b/src/calibre/ebooks/pdf/writer.py index dc7f2edba9..ebe6533419 100644 --- a/src/calibre/ebooks/pdf/writer.py +++ b/src/calibre/ebooks/pdf/writer.py @@ -198,9 +198,10 @@ class PDFWriter(QObject): # {{{ try: outPDF = PdfFileWriter(title=self.metadata.title, author=self.metadata.author) for item in self.combine_queue: - inputPDF = PdfFileReader(open(item, 'rb')) - for page in inputPDF.pages: - outPDF.addPage(page) + with open(item, 'rb') as item_stream: + inputPDF = PdfFileReader(item_stream) + for page in inputPDF.pages: + outPDF.addPage(page) outPDF.write(self.out_stream) finally: self._delete_tmpdir() diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py index 087877e78f..878633add3 100644 --- a/src/calibre/ebooks/txt/markdownml.py +++ b/src/calibre/ebooks/txt/markdownml.py @@ -22,6 +22,7 @@ class MarkdownMLizer(OEB2HTML): def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to Markdown formatted TXT...') self.opts = opts + self.in_code = False self.in_pre = False self.list = [] self.blockquotes = 0 @@ -55,10 +56,27 @@ class MarkdownMLizer(OEB2HTML): def tidy_up(self, text): # Remove blank space form beginning of paragraph. text = re.sub('(?msu)^[ ]{1,3}', '', text) + # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre. + text = re.sub('(?msu)^[ ]', ' ', text) + + # Remove tabs that aren't at the beinning of a line + new_text = [] + for l in text.splitlines(): + start = re.match('\t+', l) + if start: + start = start.group() + else: + start = '' + l = re.sub('\t', '', l) + new_text.append(start + l) + text = '\n'.join(new_text) + # Remove spaces from blank lines. text = re.sub('(?msu)^[ ]+$', '', text) + # Reduce blank lines text = re.sub('(?msu)\n{7,}', '\n' * 6, text) + # Remove blank lines at beginning and end of document. text = re.sub('^\s*', '', text) text = re.sub('\s*$', '\n\n', text) @@ -80,6 +98,12 @@ class MarkdownMLizer(OEB2HTML): def prepare_string_for_markdown(self, txt): txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt) return txt + + def prepare_string_for_pre(self, txt): + new_text = [] + for l in txt.splitlines(): + new_text.append(' ' + l) + return '\n'.join(new_text) def dump_text(self, elem, stylizer): ''' @@ -97,7 +121,7 @@ class MarkdownMLizer(OEB2HTML): return [''] # Setup our variables. - text = [''] + text = [] style = stylizer.style(elem) tags = [] tag = barename(elem.tag) @@ -143,46 +167,74 @@ class MarkdownMLizer(OEB2HTML): self.blockquotes += 1 tags.append('>') text.append('> ' * self.blockquotes) - elif tag in ('code', 'pre'): - self.in_pre = True - text.append(' ') + elif tag == 'code': + if not self.in_pre and not self.in_code: + text.append('`') + tags.append('`') + self.in_code = True + elif tag == 'pre': + if not self.in_pre: + text.append('\n') + tags.append('pre') + self.in_pre = True elif tag == 'hr': text.append('\n* * *') tags.append('\n') elif tag == 'a': # Only write links with absolute (external) urls. - if attribs.has_key('href') and '://' in attribs['href']: + if self.opts.keep_links and attribs.has_key('href') and '://' in attribs['href']: title = '' if attribs.has_key('title'): - title = ' "' + attribs['title'] + '" ' + title = ' "' + attribs['title'] + '"' + remove_space = self.remove_space_after_newline + title = self.remove_newlines(title) + self.remove_space_after_newline = remove_space text.append('[') tags.append('](' + attribs['href'] + title + ')') elif tag == 'img': if self.opts.keep_image_references: txt = '!' if attribs.has_key('alt'): - txt += '[' + attribs['alt'] + ']' + remove_space = self.remove_space_after_newline + txt += '[' + self.remove_newlines(attribs['alt']) + ']' + self.remove_space_after_newline = remove_space txt += '(' + attribs['src'] + ')' text.append(txt) elif tag in ('ol', 'ul'): + tags.append(tag) + # Add the list to our lists of lists so we can track + # nested lists. self.list.append({'name': tag, 'num': 0}) elif tag == 'li': + # Get the last list from our list of lists if self.list: li = self.list[-1] else: li = {'name': 'ul', 'num': 0} + # Add a new line to start the item text.append('\n') + # Add indent if we have nested lists. + list_count = len(self.list) + # We only care about indenting nested lists. + if (list_count - 1) > 0: + text.append('\t' * (list_count - 1)) + # Add blockquote if we have a blockquote in a list item. text.append(bq) + # Write the proper sign for ordered and unorded lists. if li['name'] == 'ul': text.append('+ ') elif li['name'] == 'ol': - text.append(unicode(len(self.list)) + '. ') - tags.append('') + li['num'] += 1 + text.append(unicode(li['num']) + '. ') # Process tags that contain text. if hasattr(elem, 'text') and elem.text: txt = elem.text - if not self.in_pre: + if self.in_pre: + txt = self.prepare_string_for_pre(txt) + elif self.in_code: + txt = self.remove_newlines(txt) + else: txt = self.prepare_string_for_markdown(self.remove_newlines(txt)) text.append(txt) @@ -193,26 +245,23 @@ class MarkdownMLizer(OEB2HTML): # Close all open tags. tags.reverse() for t in tags: - if t in ('pre', 'ul', 'ol', 'li', '>', 'block'): + if t in ('pre', 'ul', 'ol', '>'): if t == 'pre': self.in_pre = False + text.append('\n') elif t == '>': self.blockquotes -= 1 - elif t == 'block': - if self.style_bold: - text.append('**') - if self.style_italic: - text.append('*') elif t in ('ul', 'ol'): if self.list: self.list.pop() - if not self.list: - text.append('\n') + text.append('\n') else: if t == '**': self.style_bold = False elif t == '*': self.style_italic = False + elif t == '`': + self.in_code = False text.append('%s' % t) # Soft scene breaks. @@ -224,7 +273,11 @@ class MarkdownMLizer(OEB2HTML): # Add the text that is outside of the tag. if hasattr(elem, 'tail') and elem.tail: tail = elem.tail - if not self.in_pre: + if self.in_pre: + tail = self.prepare_string_for_pre(tail) + elif self.in_code: + tail = self.remove_newlines(tail) + else: tail = self.prepare_string_for_markdown(self.remove_newlines(tail)) text.append(tail)