Sync to trunk.

2025-08-30 23:00:21 -04:00 · 2009-07-30 19:18:33 -04:00 · 2009-07-30 19:18:33 -04:00 · f10852a43c
commit f10852a43c
parent 6b5d4d3548 fab11a14cf
6 changed files with 117 additions and 28 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -36,6 +36,7 @@ mimetypes.add_type('application/ereader',                 '.pdb')
 mimetypes.add_type('application/mobi',                    '.mobi')
 mimetypes.add_type('application/mobi',                    '.prc')
 mimetypes.add_type('application/mobi',                    '.azw')
+mimetypes.add_type('image/wmf',                           '.wmf')
 guess_type = mimetypes.guess_type
 import cssutils
 cssutils.log.setLevel(logging.WARN)
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -2,7 +2,7 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

-import os
+import os, glob, re

 from lxml import etree

@ -61,6 +61,39 @@ class RTFInput(InputFormatPlugin):
        os.remove('out.xml')
        return ans

+    def extract_images(self, picts):
+        self.log('Extracting images...')
+        count = 0
+        raw = open(picts, 'rb').read()
+        starts = []
+        for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
+            starts.append(match.start(1))
+
+        imap = {}
+
+        for start in starts:
+            pos, bc = start, 1
+            while bc > 0:
+                if raw[pos] == '}': bc -= 1
+                elif raw[pos] == '{': bc += 1
+                pos += 1
+            pict = raw[start:pos+1]
+            enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
+            if len(enc) % 2 == 1:
+                enc = enc[:-1]
+            data = enc.decode('hex')
+            ext = '.jpg'
+            if 'EMF' in data[:200]:
+                ext = '.wmf'
+            elif 'PNG' in data[:200]:
+                ext = '.png'
+            count += 1
+            name = (('%4d'%count).replace(' ', '0'))+ext
+            open(name, 'wb').write(data)
+            imap[count] = name
+            #open(name+'.hex', 'wb').write(enc)
+        return imap
+
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.rtf.xsl import xhtml
@ -74,9 +107,22 @@ class RTFInput(InputFormatPlugin):
        except RtfInvalidCodeException:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.'))
+        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
+        if d:
+            imap = {}
+            try:
+                imap = self.extract_images(d[0])
+            except:
+                self.log.exception('Failed to extract images...')
        self.log('Parsing XML...')
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(xml, parser=parser)
+        for pict in doc.xpath('//rtf:pict[@num]',
+                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
+            num = int(pict.get('num'))
+            name = imap.get(num, None)
+            if name is not None:
+                pict.set('num', name)
        self.log('Converting XML to HTML...')
        styledoc = etree.fromstring(xhtml)

--- a/src/calibre/ebooks/rtf/xsl.py
+++ b/src/calibre/ebooks/rtf/xsl.py
@ -523,7 +523,11 @@ xhtml = '''\
        <xsl:apply-templates/>
    </xsl:template>

-    <xsl:template match="rtf:pict" />
+    <xsl:template match="rtf:pict">
+        <xsl:element name="img">
+            <xsl:attribute name="src"><xsl:value-of select="@num" /></xsl:attribute>
+        </xsl:element>
+    </xsl:template>

    <xsl:template match="*">
        <xsl:message>
--- a/src/calibre/ebooks/rtf2xml/pict.py
+++ b/src/calibre/ebooks/rtf2xml/pict.py
@ -55,7 +55,7 @@ class Pict:
        return "}\n"
    def __text_func(self, line):
        #tx<nu<__________<true text
-        return line[18:]
+        return line[17:]
    def __make_dir(self):
        """ Make a dirctory to put the image data in"""
        base_name = os.path.basename(getattr(self.__orig_file, 'name',
--- a/src/calibre/gui2/wizard/send_email.py
+++ b/src/calibre/gui2/wizard/send_email.py
@ -112,7 +112,8 @@ class SendEmail(QWidget, Ui_Form):
        self.relay_tls.setChecked(True)

        info_dialog(self, _('Finish gmail setup'),
-            _('Dont forget to enter your gmail username and password')).exec_()
+            _('Dont forget to enter your gmail username and password. '
+                'You can sign up for a free gmail account at http://gmail.com')).exec_()
        self.relay_username.setFocus(Qt.OtherFocusReason)
        self.relay_username.setCursorPosition(0)

--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@ -4,6 +4,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe

 class Newsweek(BasicNewsRecipe):
@ -128,3 +129,39 @@ class Newsweek(BasicNewsRecipe):
        return cover_url


+    def postprocess_book(self, oeb, opts, log) :
+
+        def extractByline(href) :
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))            
+            byline = soup.find(True,attrs={'class':'authorInfo'})
+            byline = self.tag_to_string(byline) if byline is not None else ''
+            issueDate = soup.find(True,attrs={'class':'issueDate'})
+            issueDate = self.tag_to_string(issueDate) if issueDate is not None else ''
+            issueDate = re.sub(',','', issueDate)
+            if byline > '' and issueDate > '' :
+                return byline + ' | ' + issueDate
+            else :
+                return byline + issueDate
+            
+        def extractDescription(href) :
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            description = soup.find(True,attrs={'name':'description'})
+            if description is not None and description.has_key('content'):
+                description = description['content']
+                if description.startswith('Newsweek magazine online plus') :
+                    description = soup.find(True, attrs={'class':'story'})
+                    firstPara = soup.find('p')
+                    description = self.tag_to_string(firstPara)
+            else :
+                description = soup.find(True, attrs={'class':'story'})
+                firstPara = soup.find('p')
+                description = self.tag_to_string(firstPara)
+            return description    
+        
+        for section in oeb.toc :
+            for article in section :
+                if article.author is None :
+                    article.author = extractByline(article.href)
+                if article.description is None :
+                    article.description = extractDescription(article.href)
+        return