diff --git a/resources/recipes/drivelry.recipe b/resources/recipes/drivelry.recipe
new file mode 100644
index 0000000000..9e001ba530
--- /dev/null
+++ b/resources/recipes/drivelry.recipe
@@ -0,0 +1,41 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class drivelrycom(BasicNewsRecipe):
+ title = u'drivelry.com'
+ language = 'en'
+ description = 'A blog by Mike Abrahams'
+ __author__ = 'Krittika Goyal'
+ oldest_article = 60 #days
+ max_articles_per_feed = 25
+ #encoding = 'latin1'
+
+ remove_stylesheets = True
+ #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+ remove_tags_after = dict(name='div', attrs={'id':'bookmark'})
+ remove_tags = [
+ dict(name='iframe'),
+ dict(name='div', attrs={'class':['sidebar']}),
+ dict(name='div', attrs={'id':['bookmark']}),
+ #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
+ #dict(name='ul', attrs={'class':'articleTools'}),
+ ]
+
+ feeds = [
+ ('drivelry.com', 'http://feeds.feedburner.com/drivelry'),
+ ]
+
+ def preprocess_html(self, soup):
+ story = soup.find(name='div', attrs={'id':'main'})
+ #td = heading.findParent(name='td')
+ #td.extract()
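+ # Re-wrap the extracted #main div in a fresh, minimal document so only the article body is kept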
+ soup = BeautifulSoup('''<html><head><title>t</title></head><body>
+<p>To donate to this blog: click here</p>
+</body></html>''')
+ body = soup.find(name='body')
+ body.insert(0, story)
+ return soup
diff --git a/resources/recipes/greader_uber.recipe b/resources/recipes/greader_uber.recipe
new file mode 100644
index 0000000000..ee48e7069d
--- /dev/null
+++ b/resources/recipes/greader_uber.recipe
@@ -0,0 +1,38 @@
+import urllib, re, mechanize
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre import __appname__
+
+class GoogleReaderUber(BasicNewsRecipe):
+ title = 'Google Reader Uber'
+ description = 'This recipe downloads all unread feeds from your Google Reader account.'
+ needs_subscription = True
+ __author__ = 'rollercoaster, davec'
+ base_url = 'http://www.google.com/reader/atom/'
+ oldest_article = 365
+ max_articles_per_feed = 250
+ get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
+ use_embedded_content = True
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser()
+
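+ # Log in with Google ClientLogin and attach the returned SID cookie to a fresh mechanize opener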
+ if self.username is not None and self.password is not None:
+ request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
+ ('service', 'reader'), ('source', __appname__)])
+ response = br.open('https://www.google.com/accounts/ClientLogin', request)
+ sid = re.search('SID=(\S*)', response.read()).group(1)
+
+ cookies = mechanize.CookieJar()
+ br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
+ cookies.set_cookie(mechanize.Cookie(None, 'SID', sid, None, False, '.google.com', True, True, '/', True, False, None, True, '', '', None))
+ return br
+
+
+ def get_feeds(self):
+ feeds = []
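+ # Every stream id returned by the tag-list API becomes a feed; the special 'broadcast' id is rewritten to the reading-list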
+ soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
+ for id in soup.findAll(True, attrs={'name':['id']}):
+ url = id.contents[0].replace('broadcast','reading-list')
+ feeds.append((re.search('/([^/]*)$', url).group(1),
+ self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
+ return feeds
diff --git a/resources/recipes/ledevoir.recipe b/resources/recipes/ledevoir.recipe
index 4612beea2e..c9dbd8c5d7 100644
--- a/resources/recipes/ledevoir.recipe
+++ b/resources/recipes/ledevoir.recipe
@@ -25,7 +25,6 @@ class ledevoir(BasicNewsRecipe):
encoding = 'utf-8'
timefmt = '[%a, %d %b, %Y]'
- oldest_article = 1
max_articles_per_feed = 50
use_embedded_content = False
recursion = 10
diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index d9743d5980..4fe188934c 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe):
feeds.append((current_section, current_articles))
return feeds
+
def preprocess_html(self, soup):
story = soup.find(name='div', attrs={'class':'triline'})
- #td = heading.findParent(name='td')
- #td.extract()
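+ # Articles split across two pages link to the continuation from p.pagenav; fetch that page and append its story-content div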
+ page2_link = soup.find('p','pagenav')
+ if page2_link:
+ atag = page2_link.find('a',href=True)
+ if atag:
+ page2_url = atag['href']
+ if page2_url.startswith('story'):
+ page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
+ elif page2_url.startswith( '/todays-paper/story.html'):
+ page2_url = 'http://www.nationalpost.com/'+page2_url
+ page2_soup = self.index_to_soup(page2_url)
+ if page2_soup:
+ page2_content = page2_soup.find('div','story-content')
+ if page2_content:
+ full_story = BeautifulSoup('<html><head><title></title></head><body></body></html>')
+ full_story.insert(0,story)
+ full_story.insert(1,page2_content)
+ story = full_story
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body')
body.insert(0, story)
return soup
+
diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py
index d84f3c3e77..d3990e95ac 100644
--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@@ -14,6 +14,7 @@ Windows PNP strings:
2W00000&1', 3, u'G:\\')
'''
+import re
from calibre.devices.usbms.driver import USBMS
@@ -108,6 +109,7 @@ class POCKETBOOK360(EB600):
OSX_MAIN_MEM = 'Philips Mass Storge Media'
OSX_CARD_A_MEM = 'Philips Mass Storge Media'
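+ # Volume name pattern used to pick out the main memory mount on OS X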
+ OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Pocket')
@classmethod
def can_handle(cls, dev, debug=False):
diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py
index 7f418de8d7..d116ec30fb 100644
--- a/src/calibre/ebooks/metadata/rtf.py
+++ b/src/calibre/ebooks/metadata/rtf.py
@@ -25,12 +25,14 @@ def get_document_info(stream):
while not found:
prefix = block[-6:]
block = prefix + stream.read(block_size)
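+ # The last read may return fewer than block_size bytes; remember how many bytes were actually read so the seek below lands on the '{\info' group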
+ actual_block_size = len(block) - len(prefix)
if len(block) == len(prefix):
break
idx = block.find(r'{\info')
if idx >= 0:
found = True
- stream.seek(stream.tell() - block_size + idx - len(prefix))
+ pos = stream.tell() - actual_block_size + idx - len(prefix)
+ stream.seek(pos)
else:
if block.find(r'\sect') > -1:
break
diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py
index 2f52fde371..15e9675aa8 100644
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@@ -90,7 +90,10 @@ class DetectStructure(object):
mark = etree.Element(XHTML('div'), style=page_break_after)
else: # chapter_mark == 'both':
mark = etree.Element(XHTML('hr'), style=page_break_before)
- elem.addprevious(mark)
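+ # elem.addprevious() can fail with a TypeError on some documents; log the failure instead of aborting the conversion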
+ try:
+ elem.addprevious(mark)
+ except TypeError:
+ self.log.exception('Failed to mark chapter')
def create_level_based_toc(self):
if self.opts.level1_toc is None:
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 55f42ae4d5..ff20793f39 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
with open('styles.css', 'ab') as f:
f.write(css)
+ def preprocess(self, fname):
+ self.log('\tPreprocessing to convert unicode characters')
+ try:
+ data = open(fname, 'rb').read()
+ from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+ tokenizer = RtfTokenizer(data)
+ tokens = RtfTokenParser(tokenizer.tokens)
+ data = tokens.toRTF()
+ fname = 'preprocessed.rtf'
+ with open(fname, 'wb') as f:
+ f.write(data)
+ except:
+ self.log.exception(
+ 'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+ return fname
def convert(self, stream, options, file_ext, log,
accelerators):
@@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
self.log = log
self.log('Converting RTF to XML...')
+ fname = self.preprocess(stream.name)
try:
- xml = self.generate_xml(stream.name)
+ xml = self.generate_xml(fname)
except RtfInvalidCodeException:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.'))
diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py
new file mode 100644
index 0000000000..07e6d41fac
--- /dev/null
+++ b/src/calibre/ebooks/rtf/preprocess.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Gerendi Sandor Attila'
+__docformat__ = 'restructuredtext en'
+
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
+
+At this point this will tokenize an RTF file and then rebuild it from the tokens.
+In the process, the UTF8 tokens are altered so that they are supported by RTF2XML while remaining compliant with the RTF specification.
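+
+A minimal usage sketch (mirroring the __main__ block at the end of this module;
+the variable names are illustrative):
+
+    tokenizer = RtfTokenizer(rtf_data)
+    parser = RtfTokenParser(tokenizer.tokens)
+    cleaned_rtf = parser.toRTF()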
+"""
+
+class tokenDelimitatorStart():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'{'
+ def __repr__(self):
+ return '{'
+
+class tokenDelimitatorEnd():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'}'
+ def __repr__(self):
+ return '}'
+
+class tokenControlWord():
+ def __init__(self, name, separator = ''):
+ self.name = name
+ self.separator = separator
+ def toRTF(self):
+ return self.name + self.separator
+ def __repr__(self):
+ return self.name + self.separator
+
+class tokenControlWordWithNumericArgument():
+ def __init__(self, name, argument, separator = ''):
+ self.name = name
+ self.argument = argument
+ self.separator = separator
+ def toRTF(self):
+ return self.name + repr(self.argument) + self.separator
+ def __repr__(self):
+ return self.name + repr(self.argument) + self.separator
+
+class tokenControlSymbol():
+ def __init__(self, name):
+ self.name = name
+ def toRTF(self):
+ return self.name
+ def __repr__(self):
+ return self.name
+
+class tokenData():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return self.data
+ def __repr__(self):
+ return self.data
+
+class tokenBinN():
+ def __init__(self, data, separator = ''):
+ self.data = data
+ self.separator = separator
+ def toRTF(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+ def __repr__(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return "\\'" + self.data
+ def __repr__(self):
+ return "\\'" + self.data
+
+class tokenUnicode():
+ def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+ self.data = data
+ self.separator = separator
+ self.current_ucn = current_ucn
+ self.eqList = eqList
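+ # Emit \ucN (clamped to the number of available fallback tokens), then \uN and the fallback tokens themselves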
+ def toRTF(self):
+ result = '\\u' + repr(self.data) + ' '
+ ucn = self.current_ucn
+ if len(self.eqList) < ucn:
+ ucn = len(self.eqList)
+ result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+ i = 0
+ for eq in self.eqList:
+ if i >= ucn:
+ break
+ result = result + eq.toRTF()
+ return result
+ def __repr__(self):
+ return '\\u' + repr(self.data)
+
+
+def isAsciiLetter(value):
+ return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
+
+def isDigit(value):
+ return (value >= '0') and (value <= '9')
+
+def isChar(value, char):
+ return value == char
+
+def isString(buffer, string):
+ return buffer == string
+
+
+class RtfTokenParser():
+ def __init__(self, tokens):
+ self.tokens = tokens
+ self.process()
+ self.processUnicode()
+
+ def process(self):
+ i = 0
+ newTokens = []
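+ # Fold every \' control symbol together with the two hex digits that follow it into a single token8bitChar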
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenControlSymbol):
+ if isString(self.tokens[i].name, "\\'"):
+ i = i + 1
+ if not isinstance(self.tokens[i], tokenData):
+ raise BaseException('Error: token8bitChar without data.')
+ if len(self.tokens[i].data) < 2:
+ raise BaseException('Error: token8bitChar with insufficient data.')
+ newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+ if len(self.tokens[i].data) > 2:
+ newTokens.append(tokenData(self.tokens[i].data[2:]))
+ i = i + 1
+ continue
+
+ newTokens.append(self.tokens[i])
+ i = i + 1
+
+ self.tokens = list(newTokens)
+
+ def processUnicode(self):
+ i = 0
+ newTokens = []
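+ # \uc values are scoped to RTF groups, so track the current value with a stack that is pushed on { and popped on }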
+ ucNbStack = [1]
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ ucNbStack.pop()
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+ if isString(self.tokens[i].name, '\\uc'):
+ ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isString(self.tokens[i].name, '\\u'):
+ x = i
+ j = 0
+ i = i + 1
+ replace = []
+ partialData = None
+ ucn = ucNbStack[len(ucNbStack) - 1]
+ while (i < len(self.tokens)) and (j < ucn):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ break
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ break
+ if isinstance(self.tokens[i], tokenData):
+ if len(self.tokens[i].data) >= ucn - j:
+ replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+ if len(self.tokens[i].data) > ucn - j:
+ partialData = tokenData(self.tokens[i].data[ucn - j:])
+ i = i + 1
+ break
+ else:
+ replace.append(self.tokens[i])
+ j = j + len(self.tokens[i].data)
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+ replace.append(self.tokens[i])
+ i = i + 1
+ j = j + 1
+ continue
+ raise BaseException('Error: incorrect utf replacement.')
+
+ #calibre rtf2xml does not support utfreplace
+ replace = []
+
+ newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
+ if partialData != None:
+ newTokens.append(partialData)
+ continue
+
+ newTokens.append(self.tokens[i])
+ i = i + 1
+
+ self.tokens = list(newTokens)
+
+
+ def toRTF(self):
+ result = []
+ for token in self.tokens:
+ result.append(token.toRTF())
+ return "".join(result)
+
+
+class RtfTokenizer():
+ def __init__(self, rtfData):
+ self.rtfData = []
+ self.tokens = []
+ self.rtfData = rtfData
+ self.tokenize()
+
+ def tokenize(self):
+ i = 0
+ lastDataStart = -1
+ while i < len(self.rtfData):
+
+ if isChar(self.rtfData[i], '{'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorStart())
+ i = i + 1
+ continue
+
+ if isChar(self.rtfData[i], '}'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorEnd())
+ i = i + 1
+ continue
+
+ if isChar(self.rtfData[i], '\\'):
+ if i + 1 >= len(self.rtfData):
+ raise BaseException('Error: Control character found at the end of the document.')
+
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+
+ tokenStart = i
+ i = i + 1
+
+ #Control Words
+ if isAsciiLetter(self.rtfData[i]):
+ #consume
+ consumed = False
+ while i < len(self.rtfData):
+ if not isAsciiLetter(self.rtfData[i]):
+ tokenEnd = i
+ consumed = True
+ break
+ i = i + 1
+
+ if not consumed:
+ raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart))
+
+ #we have numeric argument before delimiter
+ if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
+ #consume the numeric argument
+ consumed = False
+ l = 0
+ while i < len(self.rtfData):
+ if not isDigit(self.rtfData[i]):
+ consumed = True
+ break
+ l = l + 1
+ i = i + 1
+ if l > 10 :
+ raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))
+
+ if not consumed:
+ raise BaseException('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))
+
+ separator = ''
+ if isChar(self.rtfData[i], ' '):
+ separator = ' '
+
+ controlWord = self.rtfData[tokenStart: tokenEnd]
+ if tokenEnd < i:
+ value = int(self.rtfData[tokenEnd: i])
+ if isString(controlWord, "\\bin"):
+ i = i + value
+ self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
+ else:
+ self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+ else:
+ self.tokens.append(tokenControlWord(controlWord, separator))
+ #space delimiter, we should discard it
+ if self.rtfData[i] == ' ':
+ i = i + 1
+
+ #Control Symbol
+ else:
+ self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
+ i = i + 1
+ continue
+
+ if lastDataStart < 0:
+ lastDataStart = i
+ i = i + 1
+
+ def toRTF(self):
+ result = []
+ for token in self.tokens:
+ result.append(token.toRTF())
+ return "".join(result)
+
+
+if __name__ == "__main__":
+ import sys
+ if len(sys.argv) < 2:
+ print ("Usage %prog rtfFileToConvert")
+ sys.exit()
+ f = open(sys.argv[1], 'rb')
+ data = f.read()
+ f.close()
+
+ tokenizer = RtfTokenizer(data)
+ parsedTokens = RtfTokenParser(tokenizer.tokens)
+
+ data = parsedTokens.toRTF()
+
+ f = open(sys.argv[1], 'wb')
+ f.write(data)
+ f.close()
+
+