From 1f0932ad4047395bd5ae11b8ee350b26367f1eea Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 May 2009 21:35:59 -0400 Subject: [PATCH 1/6] Much better html to pml parser, now accounts for style information and produces output that looks more like the input. --- src/calibre/ebooks/pdb/ereader/writer.py | 13 +- src/calibre/ebooks/pml/output.py | 18 +-- src/calibre/ebooks/pml/pmlconverter.py | 82 ----------- src/calibre/ebooks/pml/pmlml.py | 178 +++++++++++++++++++++++ 4 files changed, 190 insertions(+), 101 deletions(-) create mode 100644 src/calibre/ebooks/pml/pmlml.py diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index f49aa4e125..c99c75a929 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -16,7 +16,7 @@ from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.oeb.base import OEB_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder from calibre.ebooks.pdb.ereader import image_name -from calibre.ebooks.pml.pmlconverter import html_to_pml +from calibre.ebooks.pml.pmlml import PMLMLizer IDENTITY = 'PNRdPPrs' @@ -31,7 +31,7 @@ class Writer(FormatWriter): self.log = log def write_content(self, oeb_book, out_stream, metadata=None): - text = self._text(oeb_book.spine) + text = self._text(oeb_book) images = self._images(oeb_book.manifest) metadata = [self._metadata(metadata)] @@ -41,16 +41,15 @@ class Writer(FormatWriter): lengths = [len(i) for i in sections] - pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, '') + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0]) pdbHeaderBuilder.build_header(lengths, out_stream) for item in sections: out_stream.write(item) - def _text(self, pages): - pml = '' - for page in pages: - pml += html_to_pml(unicode(page)).encode('cp1252') + def _text(self, oeb_book): + pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) + pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252') pml_pages = [] for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index c5fbc990af..9d07718654 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -12,7 +12,7 @@ from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import TemporaryDirectory from calibre.utils.zipfile import ZipFile from calibre.ebooks.oeb.base import OEB_IMAGES -from calibre.ebooks.pml.pmlconverter import html_to_pml +from calibre.ebooks.pml.pmlml import PMLMLizer class PMLOutput(OutputFormatPlugin): @@ -22,22 +22,16 @@ class PMLOutput(OutputFormatPlugin): def convert(self, oeb_book, output_path, input_plugin, opts, log): with TemporaryDirectory('_pmlz_output') as tdir: - self.process_spine(oeb_book.spine, tdir) + pmlmlizer = PMLMLizer(ignore_tables=opts.linearize_tables) + content = pmlmlizer.extract_content(oeb_book, opts) + with open(os.path.join(tdir, 'index.pml'), 'wb') as out: + out.write(content.encode('utf-8')) + self.write_images(oeb_book.manifest, tdir) pmlz = ZipFile(output_path, 'w') pmlz.add_dir(tdir) - def process_spine(self, spine, out_dir): - for item in spine: - html = html_to_pml(unicode(item)).encode('utf-8') - - name = os.path.splitext(os.path.basename(item.href))[0] + '.pml' - path = os.path.join(out_dir, name) - - with open(path, 'wb') as out: - out.write(html) - def write_images(self, manifest, out_dir): for item in manifest: if item.media_type in OEB_IMAGES: diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index dded21c38c..0cd7da8e72 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en' import re -from calibre import entity_to_unicode from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.htmlsymbols import HTML_SYMBOLS @@ -67,75 +66,6 @@ PML_HTML_RULES = [ (re.compile(r'\\\\'), lambda match: '\\'), ] -HTML_PML_RULES = [ - - (re.compile(r'\\'), lambda match: '\\\\'), - (re.compile('(?<=[^\n])[ ]*'), lambda match: '\n

'), - (re.compile('

(?=^\n|^\r\n)'), lambda match: '\n'), - - - # Clean up HTML - (re.compile('@page.*?}'), lambda match: ''), - (re.compile('.*?', re.DOTALL), lambda match: ''), - (re.compile('.*?', re.DOTALL), lambda match: ''), - - # Reflow paragraphs - (re.compile('(?P.*?)

', re.DOTALL), lambda match: match.group('text').replace('\r\n', ' ').replace('\n', ' ')), - - # HTML to PML - (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Sd="%s"%s\\Sd' % (match.group('target'), match.group('text'))), - (re.compile('.+?).*?">(?P.+?)'), lambda match: '\\Fn="%s"%s\\Fn' % (match.group('target'), match.group('text'))), - (re.compile('.+?).*?">'), lambda match: '\\\\Q="%s"' % match.group('target')), - (re.compile('#.+?).*?">(?P)', re.DOTALL), lambda match: '\\q="%s"%s\\q' % (match.group('target'), match.group('text'))), - (re.compile('.+?)".*?>(.*?)*'), lambda match: '\\m="%s"' % image_name(match.group('name')).strip('\x00')), - (re.compile('&(?P#\d+);'), lambda match: entity_to_unicode(match)), - (re.compile('&(?P.+);'), lambda match: entity_to_unicode(match)), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\k%s\\k' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sb%s\\Sb' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\Sp%s\\Sp' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\B%s\\B' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\l%s\\l' % match.group('text')), - (re.compile('\d+)%".*?>'), lambda match: '\\w="%s%%"' % match.group('val')), - (re.compile('\d+)%*;.*?>(?P.+?)', re.MULTILINE), lambda match: '\\T="%s%%"%s$' % (match.group('val'), match.group('text'))), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\t%s\\t' % match.group('text')), - (re.compile('', re.DOTALL), lambda match: '\\v%s\\v' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\o%s\\o' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\u%s\\u' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\\\i%s\\i' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\r%s\\r' % match.group('text')), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\c%s\\c' % match.group('text')), - (re.compile('[0-4]).*?>(?P.+?)', re.DOTALL), lambda match: '\\X%s%s\\X%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1)), - (re.compile('(?P.+?)', re.DOTALL), lambda match: '\\x%s\\x' % match.group('text')), - (re.compile('
'), lambda match: '\n'), - (re.compile('
'), lambda match: '\n'), - - # Remove remaining HTML tags - (re.compile('<.*?>'), lambda match: ''), - - # Remove redundant page break markers - (re.compile(r'(\\p){2,}'), lambda match: r'\p'), - - # Remove whitespace on empty lines - (re.compile('^[\t\r ]$', re.MULTILINE), lambda match: ''), - # Remove excess whitespace in lines - (re.compile('(?<=.)[ ]{2,}(?=.)'), lambda match: ' '), - - # Remove excess newlines at the beginning and end - (re.compile('^(\r\n){1,}'), lambda match: ''), - (re.compile('^\n{1,}'), lambda match: ''), - (re.compile('(\r\n){3,}$'), lambda match: ''), - (re.compile('\n{3,}$'), lambda match: ''), -] - def pml_to_html(pml): html = pml for rule in PML_HTML_RULES: @@ -151,15 +81,3 @@ def footnote_sidebar_to_html(id, pml): html = '
%s
' % (id, id, pml_to_html(pml)) return html -def html_to_pml(html): - pml = '' - - for dom_tree in BeautifulSoup(html).findAll('body'): - body = unicode(dom_tree.prettify()) - - for rule in HTML_PML_RULES: - body = rule[0].sub(rule[1], body) - - pml += body - - return pml diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py new file mode 100644 index 0000000000..a6febdc53f --- /dev/null +++ b/src/calibre/ebooks/pml/pmlml.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into PML markup +''' + +import os, re + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.pdb.ereader import image_name + +TAG_MAP = { + 'b' : 'B', + 'strong' : 'B', + 'i' : 'I', + 'small' : 'k', + 'sub' : 'Sb', + 'sup' : 'Sp', + 'big' : 'l', + 'del' : 'o', + 'h1' : 'x', + 'h2' : 'x0', + 'h3' : 'x1', + 'h4' : 'x2', + 'h5' : 'x3', + 'h6' : 'x4', + '!--' : 'v', +} + +STYLES = [ + ('font-weight', {'bold' : 'B', 'bolder' : 'B'}), + ('font-style', {'italic' : 'I'}), + ('text-decoration', {'underline' : 'u'}), + ('text-align', {'right' : 'r', 'center' : 'c'}), +] + +class PMLMLizer(object): + def __init__(self, ignore_tables=False): + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to PML markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.pmlmlize_spine() + + def pmlmlize_spine(self): + output = u'' + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output = self.clean_text(output) + + output = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output) + output = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output) + output = re.sub('[ ]{2,}', ' ', output) + + return output + + def clean_text(self, text): + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + tag_count = 0 + + # Are we in a paragraph block? + if tag == 'p' or style['display'] in ('block'): + if 'block' not in tag_stack: + tag_count += 1 + tag_stack.append('block') + + # Process tags that need special processing and that do not have inner + # text. Usually these require an argument + if tag == 'img': + text += '\\m="%s"' % image_name(os.path.basename(elem.get('src'))).strip('\x00') + if tag == 'hr': + text += '\\w' + width = elem.get('width') + if width: + text += '="%s%"' % width + else: + text += '="50%"' + + # Process style information that needs holds a single tag + if style['page-break-before'] == 'always': + text += '\\p' + if style['page-break-after'] == 'always': + text += '\\p' + + # Proccess tags that contain text. + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + pml_tag = TAG_MAP.get(tag, None) + if pml_tag and pml_tag not in tag_stack: + tag_count += 1 + text += '\\%s' % pml_tag + tag_stack.append(pml_tag) + + # Special processing of tags that require an argument. + # Anchors links + if tag == 'a' and 'q' not in tag_stack: + href = elem.get('href') + if href and href.startswith('#'): + tag_count += 1 + text += '\\q="%s"' % href + tag_stack.append('q') + # Anchor ids + id_name = elem.get('id') + if id_name: + text += '\\Q="%s"' % id_name + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag and style_tag not in tag_stack: + tag_count += 1 + text += '\\%s' % style_tag + tag_stack.append(style_tag) + # margin + + text += self.elem_text(elem, tag_stack) + + for item in elem: + text += self.dump_text(item, stylizer, tag_stack) + + close_tag_list = [] + for i in range(0, tag_count): + close_tag_list.insert(0, tag_stack.pop()) + text += self.close_tags(close_tag_list) + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'): + text += os.linesep + os.linesep + + + if 'block' not in tag_stack: + text += os.linesep + os.linesep + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + text += self.elem_tail(elem, tag_stack) + + return text + + def elem_text(self, elem, tag_stack): + return self.block_text(elem.text, 'block' in tag_stack) + + def elem_tail(self, elem, tag_stack): + return self.block_text(elem.tail, 'block' in tag_stack) + + def block_text(self, text, in_block): + if in_block: + text = text.replace('\n\r', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + return text + + def close_tags(self, tags): + text = u'' + for i in range(0, len(tags)): + tag = tags.pop() + if tag != 'block': + text += '\\%s' % tag + return text + From 71eb5ab8fa695cb6ec7c57185703b983909d695f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 17 May 2009 21:38:35 -0400 Subject: [PATCH 2/6] Fix chapter pml tag. --- src/calibre/ebooks/pml/pmlml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index a6febdc53f..7c10784867 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -24,11 +24,11 @@ TAG_MAP = { 'big' : 'l', 'del' : 'o', 'h1' : 'x', - 'h2' : 'x0', + 'h2' : 'X0', 'h3' : 'x1', - 'h4' : 'x2', - 'h5' : 'x3', - 'h6' : 'x4', + 'h4' : 'X2', + 'h5' : 'X3', + 'h6' : 'X4', '!--' : 'v', } From 19b04056d4149d3467a3b623a09802dcd77682e3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 18 May 2009 20:17:40 -0400 Subject: [PATCH 3/6] disable page breaks --- src/calibre/ebooks/pml/pmlml.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 7c10784867..a5e3b36377 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -55,14 +55,19 @@ class PMLMLizer(object): stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output = self.clean_text(output) - - output = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output) - output = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), output) - output = re.sub('[ ]{2,}', ' ', output) return output def clean_text(self, text): + text = re.sub('(?m)^[ ]+', '', text) + text = re.sub('(?m)[ ]+$', '', text) + + text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + text = re.sub('[ ]{2,}', ' ', text) + + text = re.sub(r'\\p\s*\\p', '', text) + return text def dump_text(self, elem, stylizer, tag_stack=[]): @@ -99,10 +104,9 @@ class PMLMLizer(object): text += '="50%"' # Process style information that needs holds a single tag - if style['page-break-before'] == 'always': - text += '\\p' - if style['page-break-after'] == 'always': - text += '\\p' + # Commented out because every page in an OEB book starts with this style + #if style['page-break-before'] == 'always': + # text += '\\p' # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': @@ -145,10 +149,12 @@ class PMLMLizer(object): text += self.close_tags(close_tag_list) if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'): text += os.linesep + os.linesep - if 'block' not in tag_stack: text += os.linesep + os.linesep + + #if style['page-break-after'] == 'always': + # text += '\\p' if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': text += self.elem_tail(elem, tag_stack) From 2a155e22bef2baaed15e3a4089b7477fc770c08f Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 May 2009 07:44:01 -0400 Subject: [PATCH 4/6] PML: remove unused anchors, clean up anchors and links. --- src/calibre/ebooks/pml/pmlml.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index a5e3b36377..d32d391004 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -53,21 +53,35 @@ class PMLMLizer(object): output = u'' for item in self.oeb_book.spine: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.add_page_anchor(item.href) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output = self.clean_text(output) return output + def add_page_anchor(self, href): + href = os.path.splitext(os.path.basename(href))[0] + return '\\Q="%s"' % href + def clean_text(self, text): + # Remove excess spaces at beginning and end of lines text = re.sub('(?m)^[ ]+', '', text) text = re.sub('(?m)[ ]+$', '', text) + # Remove excessive newlines text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) text = re.sub('[ ]{2,}', ' ', text) + # Remove excessive \p tags text = re.sub(r'\\p\s*\\p', '', text) + # Remove anchors that do not have links + anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text)) + links = set(re.findall(r'(?<=\\q=").+?(?=")', text)) + for unused in anchors.difference(links): + text = text.replace('\\Q="%s"' % unused, '') + return text def dump_text(self, elem, stylizer, tag_stack=[]): @@ -120,14 +134,17 @@ class PMLMLizer(object): # Anchors links if tag == 'a' and 'q' not in tag_stack: href = elem.get('href') - if href and href.startswith('#'): + if href and '://' not in href: + if '#' in href: + href = href.partition('#')[2][1:] + href = os.path.splitext(os.path.basename(href))[0] tag_count += 1 text += '\\q="%s"' % href tag_stack.append('q') # Anchor ids id_name = elem.get('id') if id_name: - text += '\\Q="%s"' % id_name + text += '\\Q="%s"' % os.path.splitext(id_name)[0] # Processes style information for s in STYLES: @@ -147,7 +164,7 @@ class PMLMLizer(object): for i in range(0, tag_count): close_tag_list.insert(0, tag_stack.pop()) text += self.close_tags(close_tag_list) - if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li'): + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'): text += os.linesep + os.linesep if 'block' not in tag_stack: From 91b7cbc5808cb65aa69f51a59ccb0b5cbb604291 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 May 2009 18:57:07 -0400 Subject: [PATCH 5/6] PML: turn html entities into characters, internal links produced properly. --- src/calibre/ebooks/pdb/ereader/writer.py | 4 ++-- src/calibre/ebooks/pml/pmlml.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index c99c75a929..875aae764a 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -41,7 +41,7 @@ class Writer(FormatWriter): lengths = [len(i) for i in sections] - pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].parition()[0]) + pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition('\x00')[0]) pdbHeaderBuilder.build_header(lengths, out_stream) for item in sections: @@ -49,7 +49,7 @@ class Writer(FormatWriter): def _text(self, oeb_book): pmlmlizer = PMLMLizer(ignore_tables=self.opts.linearize_tables) - pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252') + pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace') pml_pages = [] for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index d32d391004..cdf3bf69e8 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -13,6 +13,7 @@ import os, re from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.pdb.ereader import image_name +from calibre import entity_to_unicode TAG_MAP = { 'b' : 'B', @@ -78,9 +79,12 @@ class PMLMLizer(object): # Remove anchors that do not have links anchors = set(re.findall(r'(?<=\\Q=").+?(?=")', text)) - links = set(re.findall(r'(?<=\\q=").+?(?=")', text)) + links = set(re.findall(r'(?<=\\q="#).+?(?=")', text)) for unused in anchors.difference(links): text = text.replace('\\Q="%s"' % unused, '') + + for entity in set(re.findall('&.+?;', text)): + text = text.replace(entity, entity_to_unicode(entity[1:-1])) return text @@ -136,10 +140,10 @@ class PMLMLizer(object): href = elem.get('href') if href and '://' not in href: if '#' in href: - href = href.partition('#')[2][1:] + href = href.partition('#')[2] href = os.path.splitext(os.path.basename(href))[0] tag_count += 1 - text += '\\q="%s"' % href + text += '\\q="#%s"' % href tag_stack.append('q') # Anchor ids id_name = elem.get('id') From 8398d4a7d706e6958d8bf112e60c12eecbd0fa4c Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 19 May 2009 20:23:40 -0400 Subject: [PATCH 6/6] Auto convert when syncing news. --- src/calibre/gui2/device.py | 35 ++++++++++++++++++++++++++------ src/calibre/gui2/main.py | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index b176c25062..caed0358cc 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -640,12 +640,33 @@ class DeviceGUI(object): ', '.join(sent_mails), 3000) - def sync_news(self): + def sync_news(self, send_ids=None, do_auto=True): if self.device_connected: - ids = list(dynamic.get('news_to_be_synced', set([]))) + ids = list(dynamic.get('news_to_be_synced', set([]))) if send_ids is None else send_ids ids = [id for id in ids if self.library_view.model().db.has_id(id)] - files, auto = self.library_view.model().get_preferred_formats_from_ids( - ids, self.device_manager.device_class.settings().format_map) + files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids( + ids, self.device_manager.device_class.settings().format_map, + exclude_auto=do_auto) + auto = [] + if _auto_ids: + for id in _auto_ids: + formats = [f.lower() for f in self.library_view.model().db.formats(id, index_is_id=True).split(',')] + formats = formats if formats != None else [] + if list(set(formats).intersection(available_input_formats())) != [] and list(set(self.device_manager.device_class.settings().format_map).intersection(available_output_formats())) != []: + auto.append(id) + if auto != []: + format = None + for fmt in self.device_manager.device_class.settings().format_map: + if fmt in list(set(self.device_manager.device_class.settings().format_map).intersection(set(available_output_formats()))): + format = fmt + break + if format is not None: + autos = [self.library_view.model().db.title(id, index_is_id=True) for id in auto] + autos = '\n'.join('%s'%i for i in autos) + info_dialog(self, _('No suitable formats'), + _('Auto converting the following books before uploading to ' + 'the device:'), det_msg=autos, show=True) + self.auto_convert_news(auto, format) files = [f for f in files if f is not None] if not files: dynamic.set('news_to_be_synced', set([])) @@ -667,8 +688,10 @@ class DeviceGUI(object): if config['upload_news_to_device'] and files: remove = ids if \ config['delete_news_from_library_on_upload'] else [] - on_card = self.location_view.model().free[0] < \ - self.location_view.model().free[1] + space = { self.location_view.model().free[0] : 'main', + self.location_view.model().free[1] : 'carda', + self.location_view.model().free[2] : 'cardb' } + on_card = space.get(sorted(space.keys(), reverse=True)[0], 'main') self.upload_books(files, names, metadata, on_card=on_card, memory=[[f.name for f in files], remove]) diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py index f50bffbb76..7f36a9560c 100644 --- a/src/calibre/gui2/main.py +++ b/src/calibre/gui2/main.py @@ -1080,6 +1080,24 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): current = self.library_view.currentIndex() self.library_view.model().current_changed(current, previous) + def auto_convert_news(self, book_ids, format): + previous = self.library_view.currentIndex() + rows = [x.row() for x in \ + self.library_view.selectionModel().selectedRows()] + jobs, changed, bad = convert_single_ebook(self, self.library_view.model().db, book_ids, True, format) + if jobs == []: return + for func, args, desc, fmt, id, temp_files in jobs: + if id not in bad: + job = self.job_manager.run_job(Dispatcher(self.book_auto_converted_news), + func, args=args, description=desc) + self.conversion_jobs[job] = (temp_files, fmt, id) + + if changed: + self.library_view.model().refresh_rows(rows) + current = self.library_view.currentIndex() + self.library_view.model().current_changed(current, previous) + + def get_books_for_conversion(self): rows = [r.row() for r in \ self.library_view.selectionModel().selectedRows()] @@ -1175,6 +1193,29 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): self.send_by_mail(to, fmts, delete_from_library, specific_format=fmt, send_ids=[book_id], do_auto_convert=False) + def book_auto_converted_news(self, job): + temp_files, fmt, book_id = self.conversion_jobs.pop(job) + try: + if job.failed: + return self.job_exception(job) + data = open(temp_files[0].name, 'rb') + self.library_view.model().db.add_format(book_id, fmt, data, index_is_id=True) + data.close() + self.status_bar.showMessage(job.description + (' completed'), 2000) + finally: + for f in temp_files: + try: + if os.path.exists(f.name): + os.remove(f.name) + except: + pass + self.tags_view.recount() + if self.current_view() is self.library_view: + current = self.library_view.currentIndex() + self.library_view.model().current_changed(current, QModelIndex()) + + self.sync_news(send_ids=[book_id], do_auto_convert=False) + def book_converted(self, job): temp_files, fmt, book_id = self.conversion_jobs.pop(job) try: