diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 3346f205b8..b455c55553 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -36,6 +36,7 @@ mimetypes.add_type('application/ereader', '.pdb') mimetypes.add_type('application/mobi', '.mobi') mimetypes.add_type('application/mobi', '.prc') mimetypes.add_type('application/mobi', '.azw') +mimetypes.add_type('image/wmf', '.wmf') guess_type = mimetypes.guess_type import cssutils cssutils.log.setLevel(logging.WARN) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 22bb5263d5..51e6862e7b 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os +import os, glob, re from lxml import etree @@ -61,6 +61,39 @@ class RTFInput(InputFormatPlugin): os.remove('out.xml') return ans + def extract_images(self, picts): + self.log('Extracting images...') + count = 0 + raw = open(picts, 'rb').read() + starts = [] + for match in re.finditer(r'\{\\pict([^}]+)\}', raw): + starts.append(match.start(1)) + + imap = {} + + for start in starts: + pos, bc = start, 1 + while bc > 0: + if raw[pos] == '}': bc -= 1 + elif raw[pos] == '{': bc += 1 + pos += 1 + pict = raw[start:pos+1] + enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + if len(enc) % 2 == 1: + enc = enc[:-1] + data = enc.decode('hex') + ext = '.jpg' + if 'EMF' in data[:200]: + ext = '.wmf' + elif 'PNG' in data[:200]: + ext = '.png' + count += 1 + name = (('%4d'%count).replace(' ', '0'))+ext + open(name, 'wb').write(data) + imap[count] = name + #open(name+'.hex', 'wb').write(enc) + return imap + def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.rtf.xsl import xhtml @@ -74,9 +107,22 @@ class RTFInput(InputFormatPlugin): except RtfInvalidCodeException: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.')) + d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) + if d: + imap = {} + try: + imap = self.extract_images(d[0]) + except: + self.log.exception('Failed to extract images...') self.log('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(xml, parser=parser) + for pict in doc.xpath('//rtf:pict[@num]', + namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): + num = int(pict.get('num')) + name = imap.get(num, None) + if name is not None: + pict.set('num', name) self.log('Converting XML to HTML...') styledoc = etree.fromstring(xhtml) diff --git a/src/calibre/ebooks/rtf/xsl.py b/src/calibre/ebooks/rtf/xsl.py index be76bd2d42..be58683958 100644 --- a/src/calibre/ebooks/rtf/xsl.py +++ b/src/calibre/ebooks/rtf/xsl.py @@ -18,11 +18,11 @@ xhtml = '''\ - @@ -36,7 +36,7 @@ xhtml = '''\ parent::rtf:paragraph-definition[@name='heading 7']| parent::rtf:paragraph-definition[@name='heading 8']| parent::rtf:paragraph-definition[@name='heading 9'] - + "> @@ -64,7 +64,7 @@ xhtml = '''\ parent::rtf:paragraph-definition[@name='heading 7']| parent::rtf:paragraph-definition[@name='heading 8']| parent::rtf:paragraph-definition[@name='heading 9'] - + "> @@ -108,17 +108,17 @@ xhtml = '''\ - + - + - + - + - + @@ -150,7 +150,7 @@ xhtml = '''\ http://rtf2xml.sourceforge.net/ - + @@ -333,7 +333,7 @@ xhtml = '''\ - + @@ -387,7 +387,7 @@ xhtml = '''\ - + @@ -401,9 +401,9 @@ xhtml = '''\ - + - + @@ -479,13 +479,13 @@ xhtml = '''\ - + page-break-after:always - + @@ -505,11 +505,11 @@ xhtml = '''\ - + - + @@ -522,9 +522,13 @@ xhtml = '''\ - - - + + + + + + + no match for element: " @@ -533,6 +537,6 @@ xhtml = '''\ - + -''' \ No newline at end of file +''' diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index 6c88dd54e4..767efda273 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -55,7 +55,7 @@ class Pict: return "}\n" def __text_func(self, line): #tx '' and issueDate > '' : + return byline + ' | ' + issueDate + else : + return byline + issueDate + + def extractDescription(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + description = soup.find(True,attrs={'name':'description'}) + if description is not None and description.has_key('content'): + description = description['content'] + if description.startswith('Newsweek magazine online plus') : + description = soup.find(True, attrs={'class':'story'}) + firstPara = soup.find('p') + description = self.tag_to_string(firstPara) + else : + description = soup.find(True, attrs={'class':'story'}) + firstPara = soup.find('p') + description = self.tag_to_string(firstPara) + return description + + for section in oeb.toc : + for article in section : + if article.author is None : + article.author = extractByline(article.href) + if article.description is None : + article.description = extractDescription(article.href) + return