From 1a523d923ff8b3e40557efec42cfd722b72f11d8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 29 Jul 2009 18:09:36 -0600 Subject: [PATCH 1/5] IGN:... --- src/calibre/gui2/wizard/send_email.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/wizard/send_email.py b/src/calibre/gui2/wizard/send_email.py index 5650279c15..504c426359 100644 --- a/src/calibre/gui2/wizard/send_email.py +++ b/src/calibre/gui2/wizard/send_email.py @@ -112,7 +112,8 @@ class SendEmail(QWidget, Ui_Form): self.relay_tls.setChecked(True) info_dialog(self, _('Finish gmail setup'), - _('Dont forget to enter your gmail username and password')).exec_() + _('Dont forget to enter your gmail username and password. ' + 'You can sign up for a free gmail account at http://gmail.com')).exec_() self.relay_username.setFocus(Qt.OtherFocusReason) self.relay_username.setCursorPosition(0) From d0a1ce48258e6c16237c109eeb50f2efa28c2bce Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 Jul 2009 12:52:55 -0600 Subject: [PATCH 2/5] Improved Newsweek recipe (thanks to GRiker) --- .../web/feeds/recipes/recipe_newsweek.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py index c7f043ef74..9a6ef77cee 100644 --- a/src/calibre/web/feeds/recipes/recipe_newsweek.py +++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py @@ -4,6 +4,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' import re from calibre import strftime +from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe class Newsweek(BasicNewsRecipe): @@ -128,3 +129,39 @@ class Newsweek(BasicNewsRecipe): return cover_url + def postprocess_book(self, oeb, opts, log) : + + def extractByline(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + byline = soup.find(True,attrs={'class':'authorInfo'}) + byline = self.tag_to_string(byline) if byline is not None else '' + issueDate = soup.find(True,attrs={'class':'issueDate'}) + issueDate = self.tag_to_string(issueDate) if issueDate is not None else '' + issueDate = re.sub(',','', issueDate) + if byline > '' and issueDate > '' : + return byline + ' | ' + issueDate + else : + return byline + issueDate + + def extractDescription(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + description = soup.find(True,attrs={'name':'description'}) + if description is not None and description.has_key('content'): + description = description['content'] + if description.startswith('Newsweek magazine online plus') : + description = soup.find(True, attrs={'class':'story'}) + firstPara = soup.find('p') + description = self.tag_to_string(firstPara) + else : + description = soup.find(True, attrs={'class':'story'}) + firstPara = soup.find('p') + description = self.tag_to_string(firstPara) + return description + + for section in oeb.toc : + for article in section : + if article.author is None : + article.author = extractByline(article.href) + if article.description is None : + article.description = extractDescription(article.href) + return From ecfa8d838510110b12011b43f4b122c113d70bf8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 Jul 2009 15:04:02 -0600 Subject: [PATCH 3/5] RTF Input: Support extraction of images (JPEG/PNG only) --- src/calibre/ebooks/rtf/input.py | 32 +++++++++++++++++- src/calibre/ebooks/rtf/xsl.py | 54 ++++++++++++++++-------------- src/calibre/ebooks/rtf2xml/pict.py | 2 +- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 22bb5263d5..06b4d5f2ee 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os +import os, glob, re from lxml import etree @@ -61,6 +61,30 @@ class RTFInput(InputFormatPlugin): os.remove('out.xml') return ans + def extract_images(self, picts): + self.log('Extracting images...') + count = 0 + raw = open(picts, 'rb').read() + starts = [] + for match in re.finditer(r'\{\\pict([^}]+)\}', raw): + starts.append(match.start(1)) + + for start in starts: + pos, bc = start, 1 + while bc > 0: + if raw[pos] == '}': bc -= 1 + elif raw[pos] == '{': bc += 1 + pos += 1 + pict = raw[start:pos+1] + enc = re.sub(r'[^a-zA-Z0-9]', '', pict) + if len(enc) % 2 == 1: + enc = enc[:-1] + data = enc.decode('hex') + count += 1 + name = (('%4d'%count).replace(' ', '0'))+'.jpg' + open(name, 'wb').write(data) + #open(name+'.hex', 'wb').write(enc) + def convert(self, stream, options, file_ext, log, accelerators): from calibre.ebooks.rtf.xsl import xhtml @@ -74,6 +98,12 @@ class RTFInput(InputFormatPlugin): except RtfInvalidCodeException: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.')) + d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) + if d: + try: + self.extract_images(d[0]) + except: + self.log.exception('Failed to extract images...') self.log('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(xml, parser=parser) diff --git a/src/calibre/ebooks/rtf/xsl.py b/src/calibre/ebooks/rtf/xsl.py index be76bd2d42..8004367d77 100644 --- a/src/calibre/ebooks/rtf/xsl.py +++ b/src/calibre/ebooks/rtf/xsl.py @@ -18,11 +18,11 @@ xhtml = '''\ - @@ -36,7 +36,7 @@ xhtml = '''\ parent::rtf:paragraph-definition[@name='heading 7']| parent::rtf:paragraph-definition[@name='heading 8']| parent::rtf:paragraph-definition[@name='heading 9'] - + "> @@ -64,7 +64,7 @@ xhtml = '''\ parent::rtf:paragraph-definition[@name='heading 7']| parent::rtf:paragraph-definition[@name='heading 8']| parent::rtf:paragraph-definition[@name='heading 9'] - + "> @@ -108,17 +108,17 @@ xhtml = '''\ - + - + - + - + - + @@ -150,7 +150,7 @@ xhtml = '''\ http://rtf2xml.sourceforge.net/ - + @@ -333,7 +333,7 @@ xhtml = '''\ - + @@ -387,7 +387,7 @@ xhtml = '''\ - + @@ -401,9 +401,9 @@ xhtml = '''\ - + - + @@ -479,13 +479,13 @@ xhtml = '''\ - + page-break-after:always - + @@ -505,11 +505,11 @@ xhtml = '''\ - + - + @@ -522,9 +522,13 @@ xhtml = '''\ - - - + + + + .jpg + + + no match for element: " @@ -533,6 +537,6 @@ xhtml = '''\ - + -''' \ No newline at end of file +''' diff --git a/src/calibre/ebooks/rtf2xml/pict.py b/src/calibre/ebooks/rtf2xml/pict.py index 6c88dd54e4..767efda273 100755 --- a/src/calibre/ebooks/rtf2xml/pict.py +++ b/src/calibre/ebooks/rtf2xml/pict.py @@ -55,7 +55,7 @@ class Pict: return "}\n" def __text_func(self, line): #tx Date: Thu, 30 Jul 2009 16:11:14 -0600 Subject: [PATCH 4/5] IGN: RTF Input: Give images their proper extensions --- src/calibre/ebooks/rtf/input.py | 20 ++++++++++++++++++-- src/calibre/ebooks/rtf/xsl.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 06b4d5f2ee..51e6862e7b 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -69,6 +69,8 @@ class RTFInput(InputFormatPlugin): for match in re.finditer(r'\{\\pict([^}]+)\}', raw): starts.append(match.start(1)) + imap = {} + for start in starts: pos, bc = start, 1 while bc > 0: @@ -80,10 +82,17 @@ class RTFInput(InputFormatPlugin): if len(enc) % 2 == 1: enc = enc[:-1] data = enc.decode('hex') + ext = '.jpg' + if 'EMF' in data[:200]: + ext = '.wmf' + elif 'PNG' in data[:200]: + ext = '.png' count += 1 - name = (('%4d'%count).replace(' ', '0'))+'.jpg' + name = (('%4d'%count).replace(' ', '0'))+ext open(name, 'wb').write(data) + imap[count] = name #open(name+'.hex', 'wb').write(enc) + return imap def convert(self, stream, options, file_ext, log, accelerators): @@ -100,13 +109,20 @@ class RTFInput(InputFormatPlugin): 'support. Convert it to HTML first and then try it.')) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: + imap = {} try: - self.extract_images(d[0]) + imap = self.extract_images(d[0]) except: self.log.exception('Failed to extract images...') self.log('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(xml, parser=parser) + for pict in doc.xpath('//rtf:pict[@num]', + namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): + num = int(pict.get('num')) + name = imap.get(num, None) + if name is not None: + pict.set('num', name) self.log('Converting XML to HTML...') styledoc = etree.fromstring(xhtml) diff --git a/src/calibre/ebooks/rtf/xsl.py b/src/calibre/ebooks/rtf/xsl.py index 8004367d77..be58683958 100644 --- a/src/calibre/ebooks/rtf/xsl.py +++ b/src/calibre/ebooks/rtf/xsl.py @@ -525,7 +525,7 @@ xhtml = '''\ - .jpg + From fab11a14cfb4f018ee6773531b2a8a22764467c4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 Jul 2009 16:13:27 -0600 Subject: [PATCH 5/5] IGN:... --- src/calibre/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 3346f205b8..b455c55553 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -36,6 +36,7 @@ mimetypes.add_type('application/ereader', '.pdb') mimetypes.add_type('application/mobi', '.mobi') mimetypes.add_type('application/mobi', '.prc') mimetypes.add_type('application/mobi', '.azw') +mimetypes.add_type('image/wmf', '.wmf') guess_type = mimetypes.guess_type import cssutils cssutils.log.setLevel(logging.WARN)