diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 5d84c78e60..c0ea8ea197 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -38,6 +38,14 @@ TAG_SPACE = [ 'br', ] +TAG_IMAGES = [ + 'img', +] + +TAG_LINKS = [ + 'a', +] + STYLES = [ ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), ('font-style', {'italic' : 'emphasis'}), @@ -48,6 +56,7 @@ class FB2MLizer(object): def __init__(self, log): self.log = log self.image_hrefs = {} + self.link_hrefs = {} def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to FB2 markup...') @@ -57,6 +66,7 @@ class FB2MLizer(object): def fb2mlize_spine(self): self.image_hrefs = {} + self.link_hrefs = {} output = self.fb2_header() if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating cover page...') @@ -68,6 +78,7 @@ class FB2MLizer(object): for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.add_page_anchor(item) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.fb2_body_footer() output += self.fb2mlize_images() @@ -82,7 +93,7 @@ class FB2MLizer(object): if len(author_parts) == 1: author_last = author_parts[0] - elif len(author_parts == 2): + elif len(author_parts) == 2: author_first = author_parts[0] author_last = author_parts[1] else: @@ -108,6 +119,17 @@ class FB2MLizer(object): def fb2_footer(self): return u'' + def add_page_anchor(self, page): + return self.get_anchor(page, '') + + def get_anchor(self, page, aid): + aid = prepare_string_for_xml(aid) + aid = '%s#%s' % (page.href, aid) + if aid not in self.link_hrefs.keys(): + self.link_hrefs[aid] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + aid = self.link_hrefs[aid] + return '' % aid + def fb2mlize_images(self): images = u'' for item in self.oeb_book.manifest: @@ -149,11 +171,33 @@ class FB2MLizer(object): tag = barename(elem.tag) tag_count = 0 - if tag == 'img': - if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): - self.image_hrefs[page.abshref(elem.attrib['src'])] = '%s.jpg' % len(self.image_hrefs.keys()) - fb2_text += '' % self.image_hrefs[page.abshref(elem.attrib['src'])] - + if tag in TAG_IMAGES: + if elem.attrib.get('src', None): + if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): + self.image_hrefs[page.abshref(elem.attrib['src'])] = '%s.jpg' % len(self.image_hrefs.keys()) + fb2_text += '' % self.image_hrefs[page.abshref(elem.attrib['src'])] + + if tag in TAG_LINKS: + href = elem.get('href') + if href: + href = prepare_string_for_xml(page.abshref(href)) + if '://' in href: + fb2_text += '' % href + else: + if '#' not in href: + href += '#' + if href not in self.link_hrefs.keys(): + self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + href = self.link_hrefs[href] + fb2_text += '' % href + tag_count += 1 + tag_stack.append('a') + + # Anchor ids + id_name = elem.get('id') + if id_name: + fb2_text += self.get_anchor(page, id_name) + fb2_tag = TAG_MAP.get(tag, None) if fb2_tag and fb2_tag not in tag_stack: tag_count += 1 diff --git a/src/calibre/ebooks/pdb/ereader/__init__.py b/src/calibre/ebooks/pdb/ereader/__init__.py index 185a44d1a9..3f08b068cb 100644 --- a/src/calibre/ebooks/pdb/ereader/__init__.py +++ b/src/calibre/ebooks/pdb/ereader/__init__.py @@ -9,7 +9,7 @@ import os class EreaderError(Exception): pass -def image_name(name): +def image_name(name, taken_names=[]): name = os.path.basename(name) if len(name) > 32: @@ -17,7 +17,11 @@ def image_name(name): names = name[:10] namee = name[10+cut:] name = names + namee - + + while name in taken_names: + for i in xrange(9999999999999999999999999999999): + name = '%s%s' % (name[:-len('%s' % i)], i) + name = name.ljust(32, '\x00')[:32] return name diff --git a/src/calibre/ebooks/pdb/ereader/writer.py b/src/calibre/ebooks/pdb/ereader/writer.py index 9bf83c33b0..2f4e3bf16f 100644 --- a/src/calibre/ebooks/pdb/ereader/writer.py +++ b/src/calibre/ebooks/pdb/ereader/writer.py @@ -22,7 +22,6 @@ import cStringIO from calibre.ebooks.pdb.formatwriter import FormatWriter from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES from calibre.ebooks.pdb.header import PdbHeaderBuilder -from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pml.pmlml import PMLMLizer IDENTITY = 'PNRdPPrs' @@ -38,8 +37,8 @@ class Writer(FormatWriter): self.log = log def write_content(self, oeb_book, out_stream, metadata=None): - text = self._text(oeb_book) - images = self._images(oeb_book.manifest) + text, image_hrefs = self._text(oeb_book) + images = self._images(oeb_book.manifest, image_hrefs) metadata = [self._metadata(metadata)] hr = [self._header_record(len(text), len(images))] @@ -66,13 +65,13 @@ class Writer(FormatWriter): for i in range(0, (len(pml) / MAX_RECORD_SIZE) + 1): pml_pages.append(zlib.compress(pml[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])) - return pml_pages + return pml_pages, pmlmlizer.image_hrefs - def _images(self, manifest): + def _images(self, manifest, image_hrefs): images = [] for item in manifest: - if item.media_type in OEB_RASTER_IMAGES: + if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys(): try: im = Image.open(cStringIO.StringIO(item.data)).convert('P') im.thumbnail((300,300), Image.ANTIALIAS) @@ -82,7 +81,7 @@ class Writer(FormatWriter): data = data.getvalue() header = 'PNG ' - header += image_name(item.href) + header += image_hrefs[item.href].ljust(32, '\x00')[:32] header = header.ljust(62, '\x00') if len(data) + len(header) < 65505: diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index fd54fcf681..59b27f1763 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -53,6 +53,10 @@ LINK_TAGS = [ 'a', ] +IMAGE_TAGS = [ + 'img', +] + SEPARATE_TAGS = [ 'h1', 'h2', @@ -69,6 +73,8 @@ SEPARATE_TAGS = [ class PMLMLizer(object): def __init__(self, log): self.log = log + self.image_hrefs = {} + self.link_hrefs = {} def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to PML markup...') @@ -77,6 +83,8 @@ class PMLMLizer(object): return self.pmlmlize_spine() def pmlmlize_spine(self): + self.image_hrefs = {} + self.link_hrefs = {} output = u'' if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating title page...') @@ -84,19 +92,25 @@ class PMLMLizer(object): item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) for item in self.oeb_book.spine: self.log.debug('Converting %s to PML markup...' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.add_page_anchor(item.href) - output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.add_page_anchor(item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output = self.clean_text(output) return output - def add_page_anchor(self, href): - href = os.path.splitext(os.path.basename(href))[0] - return u'\\Q="%s"' % href + def add_page_anchor(self, page): + return self.get_anchor(page, '') + + def get_anchor(self, page, aid): + aid = '%s#%s' % (page.href, aid) + if aid not in self.link_hrefs.keys(): + self.link_hrefs[aid] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + aid = self.link_hrefs[aid] + return u'\\Q="%s"' % aid def clean_text(self, text): # Remove excess spaces at beginning and end of lines @@ -123,7 +137,7 @@ class PMLMLizer(object): return text - def dump_text(self, elem, stylizer, tag_stack=[]): + def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: return u'' @@ -146,8 +160,11 @@ class PMLMLizer(object): # Process tags that need special processing and that do not have inner # text. Usually these require an argument - if tag == 'img': - text += '\\m="%s"' % image_name(os.path.basename(elem.get('src'))).strip('\x00') + if tag in IMAGE_TAGS: + if elem.attrib.get('src', None): + if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys(): + self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00') + text += '\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])] if tag == 'hr': text += '\\w' width = elem.get('width') @@ -171,17 +188,22 @@ class PMLMLizer(object): # Anchors links if tag in LINK_TAGS and 'q' not in tag_stack: href = elem.get('href') - if href and '://' not in href: - if '#' in href: - href = href.partition('#')[2] - href = os.path.splitext(os.path.basename(href))[0] + if href: + href = page.abshref(href) + if '://' not in href: + if '#' not in href: + href += '#' + if href not in self.link_hrefs.keys(): + self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + href = self.link_hrefs[href] + text += '\\q="#%s"' % href tag_count += 1 - text += '\\q="#%s"' % href tag_stack.append('q') + # Anchor ids id_name = elem.get('id') if id_name: - text += '\\Q="%s"' % os.path.splitext(id_name)[0] + text += self.get_anchor(page, id_name) # Processes style information for s in STYLES: @@ -197,7 +219,7 @@ class PMLMLizer(object): text += self.elem_text(elem, tag_stack) for item in elem: - text += self.dump_text(item, stylizer, tag_stack) + text += self.dump_text(item, stylizer, page, tag_stack) close_tag_list = [] for i in range(0, tag_count): diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 82f8a17281..2c4bb6916c 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -45,6 +45,10 @@ LINK_TAGS = [ 'a', ] +IMAGE_TAGS = [ + 'img', +] + STYLES = [ ('font-weight', {'bold' : 'b', 'bolder' : 'b'}), ('font-style', {'italic' : 'i'}), @@ -56,6 +60,7 @@ class RBMLizer(object): def __init__(self, log, name_map={}): self.log = log self.name_map = name_map + self.link_hrefs = {} def extract_content(self, oeb_book, opts): self.log.info('Converting XHTML to RB markup...') @@ -65,6 +70,7 @@ class RBMLizer(object): def mlize_spine(self): + self.link_hrefs = {} output = u'' if 'titlepage' in self.oeb_book.guide: self.log.debug('Generating cover page...') @@ -72,19 +78,25 @@ class RBMLizer(object): item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) for item in self.oeb_book.spine: self.log.debug('Converting %s to RocketBook HTML...' % item.href) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) - output += self.add_page_anchor(item.href) - output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.add_page_anchor(item) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += u'' output = self.clean_text(output) return output - def add_page_anchor(self, href): - href = os.path.splitext(os.path.basename(href))[0] - return u'' % href + def add_page_anchor(self, page): + return self.get_anchor(page, '') + + def get_anchor(self, page, aid): + aid = '%s#%s' % (page.href, aid) + if aid not in self.link_hrefs.keys(): + self.link_hrefs[aid] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + aid = self.link_hrefs[aid] + return u'' % aid def clean_text(self, text): # Remove anchors that do not have links @@ -95,7 +107,7 @@ class RBMLizer(object): return text - def dump_text(self, elem, stylizer, tag_stack=[]): + def dump_text(self, elem, stylizer, page, tag_stack=[]): if not isinstance(elem.tag, basestring) \ or namespace(elem.tag) != XHTML_NS: return u'' @@ -112,10 +124,11 @@ class RBMLizer(object): # Process tags that need special processing and that do not have inner # text. Usually these require an argument - if tag == 'img': - src = os.path.basename(elem.get('src')) - name = self.name_map.get(src, src) - text += '' % name + if tag in IMAGE_TAGS: + if elem.attrib.get('src', None): + if page.abshref(elem.attrib['src']) not in self.name_map.keys(): + self.name_map[page.abshref(elem.attrib['src'])] = unique_name('%s' % len(self.image_hrefs.keys()), self.image_hrefs.keys(), self.name_map.keys()) + text += '' % self.name_map[page.abshref(elem.attrib['src'])] rb_tag = tag.upper() if tag in TAGS else None if rb_tag: @@ -123,21 +136,25 @@ class RBMLizer(object): text += '<%s>' % rb_tag tag_stack.append(rb_tag) + # Anchors links if tag in LINK_TAGS: href = elem.get('href') if href: + href = page.abshref(href) if '://' not in href: - if '#' in href: - href = href.partition('#')[2] - href = os.path.splitext(os.path.basename(href))[0] + if '#' not in href: + href += '#' + if href not in self.link_hrefs.keys(): + self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) + href = self.link_hrefs[href] + text += '' % href tag_count += 1 - text += '' % href tag_stack.append('A') # Anchor ids id_name = elem.get('id') if id_name: - text += '' % os.path.splitext(id_name)[0] + text += self.get_anchor(page, id_name) # Processes style information for s in STYLES: @@ -153,7 +170,7 @@ class RBMLizer(object): text += prepare_string_for_xml(elem.text) for item in elem: - text += self.dump_text(item, stylizer, tag_stack) + text += self.dump_text(item, stylizer, page, tag_stack) close_tag_list = [] for i in range(0, tag_count): diff --git a/src/calibre/ebooks/rb/writer.py b/src/calibre/ebooks/rb/writer.py index 81ffcf0bb3..515c95a6fe 100644 --- a/src/calibre/ebooks/rb/writer.py +++ b/src/calibre/ebooks/rb/writer.py @@ -125,10 +125,10 @@ class RBWriter(object): im.save(data, 'PNG') data = data.getvalue() - name = '%s.png' % os.path.splitext(os.path.basename(item.href))[0] + name = '%s.png' % len(used_names) name = unique_name(name, used_names) used_names.append(name) - self.name_map[os.path.basename(item.href)] = name + self.name_map[item.href] = name images.append((name, data)) except Exception as e: