From d2a6dc430569552c08ef3c1e98b1e0a6a0c64222 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 09:42:35 -0700 Subject: [PATCH 1/7] Fix main mem and card being swapped in pocketbook detection on OS X --- src/calibre/devices/eb600/driver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index d84f3c3e77..d3990e95ac 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -14,6 +14,7 @@ Windows PNP strings: 2W00000&1', 3, u'G:\\') ''' +import re from calibre.devices.usbms.driver import USBMS @@ -108,6 +109,7 @@ class POCKETBOOK360(EB600): OSX_MAIN_MEM = 'Philips Mass Storge Media' OSX_CARD_A_MEM = 'Philips Mass Storge Media' + OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Pocket') @classmethod def can_handle(cls, dev, debug=False): From 10565bd06144f6e8ec55ea69b0890f287f1cb81d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 09:57:58 -0700 Subject: [PATCH 2/7] Conversion pipeline: Don't error out if the user sets an invalid chapter detection XPath --- src/calibre/ebooks/oeb/transforms/structure.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/oeb/transforms/structure.py b/src/calibre/ebooks/oeb/transforms/structure.py index 2f52fde371..15e9675aa8 100644 --- a/src/calibre/ebooks/oeb/transforms/structure.py +++ b/src/calibre/ebooks/oeb/transforms/structure.py @@ -90,7 +90,10 @@ class DetectStructure(object): mark = etree.Element(XHTML('div'), style=page_break_after) else: # chapter_mark == 'both': mark = etree.Element(XHTML('hr'), style=page_break_before) - elem.addprevious(mark) + try: + elem.addprevious(mark) + except TypeError: + self.log.exception('Failed to mark chapter') def create_level_based_toc(self): if self.opts.level1_toc is None: From 5c243cda3b2832b11f608323a7465ad996e34471 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 11:09:06 -0700 Subject: [PATCH 3/7] New recipe for Google Reader that downloads unread articles instead of just starred ones, by rollercoaster --- resources/recipes/greader_uber.recipe | 38 +++++++++++++++++++++++++++ resources/recipes/ledevoir.recipe | 1 - 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 resources/recipes/greader_uber.recipe diff --git a/resources/recipes/greader_uber.recipe b/resources/recipes/greader_uber.recipe new file mode 100644 index 0000000000..ee48e7069d --- /dev/null +++ b/resources/recipes/greader_uber.recipe @@ -0,0 +1,38 @@ +import urllib, re, mechanize +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre import __appname__ + +class GoogleReaderUber(BasicNewsRecipe): + title = 'Google Reader Uber' + description = 'This recipe downloads all unread feedsfrom your Google Reader account.' + needs_subscription = True + __author__ = 'rollercoaster, davec' + base_url = 'http://www.google.com/reader/atom/' + oldest_article = 365 + max_articles_per_feed = 250 + get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed + use_embedded_content = True + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + if self.username is not None and self.password is not None: + request = urllib.urlencode([('Email', self.username), ('Passwd', self.password), + ('service', 'reader'), ('source', __appname__)]) + response = br.open('https://www.google.com/accounts/ClientLogin', request) + sid = re.search('SID=(\S*)', response.read()).group(1) + + cookies = mechanize.CookieJar() + br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies)) + cookies.set_cookie(mechanize.Cookie(None, 'SID', sid, None, False, '.google.com', True, True, '/', True, False, None, True, '', '', None)) + return br + + + def get_feeds(self): + feeds = [] + soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list') + for id in soup.findAll(True, attrs={'name':['id']}): + url = id.contents[0].replace('broadcast','reading-list') + feeds.append((re.search('/([^/]*)$', url).group(1), + self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options)) + return feeds diff --git a/resources/recipes/ledevoir.recipe b/resources/recipes/ledevoir.recipe index 4612beea2e..c9dbd8c5d7 100644 --- a/resources/recipes/ledevoir.recipe +++ b/resources/recipes/ledevoir.recipe @@ -25,7 +25,6 @@ class ledevoir(BasicNewsRecipe): encoding = 'utf-8' timefmt = '[%a, %d %b, %Y]' - oldest_article = 1 max_articles_per_feed = 50 use_embedded_content = False recursion = 10 From c6692c859ef8b6b8dc06421f22dd122fc200f52e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 19:40:31 -0700 Subject: [PATCH 4/7] Fix multipage articles in The National Post --- resources/recipes/national_post.recipe | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe index d9743d5980..4fe188934c 100644 --- a/resources/recipes/national_post.recipe +++ b/resources/recipes/national_post.recipe @@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe): feeds.append((current_section, current_articles)) return feeds + def preprocess_html(self, soup): story = soup.find(name='div', attrs={'class':'triline'}) - #td = heading.findParent(name='td') - #td.extract() + page2_link = soup.find('p','pagenav') + if page2_link: + atag = page2_link.find('a',href=True) + if atag: + page2_url = atag['href'] + if page2_url.startswith('story'): + page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url + elif page2_url.startswith( '/todays-paper/story.html'): + page2_url = 'http://www.nationalpost.com/'+page2_url + page2_soup = self.index_to_soup(page2_url) + if page2_soup: + page2_content = page2_soup.find('div','story-content') + if page2_content: + full_story = BeautifulSoup('
') + full_story.insert(0,story) + full_story.insert(1,page2_content) + story = full_story soup = BeautifulSoup('t') body = soup.find(name='body') body.insert(0, story) return soup + From 03714a978fc9453bc5df8d18cf12f3ee45c6a1a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 19:56:45 -0700 Subject: [PATCH 5/7] RTF Input: Support for unicode characters. Fixes #4501 (Unicode escaped RTF to XML problem) --- src/calibre/ebooks/rtf/input.py | 18 +- src/calibre/ebooks/rtf/preprocess.py | 344 +++++++++++++++++++++++++++ 2 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 src/calibre/ebooks/rtf/preprocess.py diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 55f42ae4d5..ff20793f39 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin): with open('styles.css', 'ab') as f: f.write(css) + def preprocess(self, fname): + self.log('\tPreprocessing to convert unicode characters') + try: + data = open(fname, 'rb').read() + from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser + tokenizer = RtfTokenizer(data) + tokens = RtfTokenParser(tokenizer.tokens) + data = tokens.toRTF() + fname = 'preprocessed.rtf' + with open(fname, 'wb') as f: + f.write(data) + except: + self.log.exception( + 'Failed to preprocess RTF to convert unicode sequences, ignoring...') + return fname def convert(self, stream, options, file_ext, log, accelerators): @@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin): from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException self.log = log self.log('Converting RTF to XML...') + fname = self.preprocess(stream.name) try: - xml = self.generate_xml(stream.name) + xml = self.generate_xml(fname) except RtfInvalidCodeException: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try it.')) diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py new file mode 100644 index 0000000000..07e6d41fac --- /dev/null +++ b/src/calibre/ebooks/rtf/preprocess.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2010, Gerendi Sandor Attila' +__docformat__ = 'restructuredtext en' + +""" +RTF tokenizer and token parser. v.1.0 (1/17/2010) +Author: Gerendi Sandor Attila + +At this point this will tokenize a RTF file then rebuild it from the tokens. +In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compilant. +""" + +class tokenDelimitatorStart(): + def __init__(self): + pass + def toRTF(self): + return b'{' + def __repr__(self): + return '{' + +class tokenDelimitatorEnd(): + def __init__(self): + pass + def toRTF(self): + return b'}' + def __repr__(self): + return '}' + +class tokenControlWord(): + def __init__(self, name, separator = ''): + self.name = name + self.separator = separator + def toRTF(self): + return self.name + self.separator + def __repr__(self): + return self.name + self.separator + +class tokenControlWordWithNumericArgument(): + def __init__(self, name, argument, separator = ''): + self.name = name + self.argument = argument + self.separator = separator + def toRTF(self): + return self.name + repr(self.argument) + self.separator + def __repr__(self): + return self.name + repr(self.argument) + self.separator + +class tokenControlSymbol(): + def __init__(self, name): + self.name = name + def toRTF(self): + return self.name + def __repr__(self): + return self.name + +class tokenData(): + def __init__(self, data): + self.data = data + def toRTF(self): + return self.data + def __repr__(self): + return self.data + +class tokenBinN(): + def __init__(self, data, separator = ''): + self.data = data + self.separator = separator + def toRTF(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + def __repr__(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + +class token8bitChar(): + def __init__(self, data): + self.data = data + def toRTF(self): + return "\\'" + self.data + def __repr__(self): + return "\\'" + self.data + +class tokenUnicode(): + def __init__(self, data, separator = '', current_ucn = 1, eqList = []): + self.data = data + self.separator = separator + self.current_ucn = current_ucn + self.eqList = eqList + def toRTF(self): + result = '\\u' + repr(self.data) + ' ' + ucn = self.current_ucn + if len(self.eqList) < ucn: + ucn = len(self.eqList) + result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result + i = 0 + for eq in self.eqList: + if i >= ucn: + break + result = result + eq.toRTF() + return result + def __repr__(self): + return '\\u' + repr(self.data) + + +def isAsciiLetter(value): + return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z')) + +def isDigit(value): + return (value >= '0') and (value <= '9') + +def isChar(value, char): + return value == char + +def isString(buffer, string): + return buffer == string + + +class RtfTokenParser(): + def __init__(self, tokens): + self.tokens = tokens + self.process() + self.processUnicode() + + def process(self): + i = 0 + newTokens = [] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenControlSymbol): + if isString(self.tokens[i].name, "\\'"): + i = i + 1 + if not isinstance(self.tokens[i], tokenData): + raise BaseException('Error: token8bitChar without data.') + if len(self.tokens[i].data) < 2: + raise BaseException('Error: token8bitChar without data.') + newTokens.append(token8bitChar(self.tokens[i].data[0:2])) + if len(self.tokens[i].data) > 2: + newTokens.append(tokenData(self.tokens[i].data[2:])) + i = i + 1 + continue + + newTokens.append(self.tokens[i]) + i = i + 1 + + self.tokens = list(newTokens) + + def processUnicode(self): + i = 0 + newTokens = [] + ucNbStack = [1] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenDelimitatorStart): + ucNbStack.append(ucNbStack[len(ucNbStack) - 1]) + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenDelimitatorEnd): + ucNbStack.pop() + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenControlWordWithNumericArgument): + if isString(self.tokens[i].name, '\\uc'): + ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isString(self.tokens[i].name, '\\u'): + x = i + j = 0 + i = i + 1 + replace = [] + partialData = None + ucn = ucNbStack[len(ucNbStack) - 1] + while (i < len(self.tokens)) and (j < ucn): + if isinstance(self.tokens[i], tokenDelimitatorStart): + break + if isinstance(self.tokens[i], tokenDelimitatorEnd): + break + if isinstance(self.tokens[i], tokenData): + if len(self.tokens[i].data) >= ucn - j: + replace.append(tokenData(self.tokens[i].data[0 : ucn - j])) + if len(self.tokens[i].data) > ucn - j: + partialData = tokenData(self.tokens[i].data[ucn - j:]) + i = i + 1 + break + else: + replace.append(self.tokens[i]) + j = j + len(self.tokens[i].data) + i = i + 1 + continue + if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN): + replace.append(self.tokens[i]) + i = i + 1 + j = j + 1 + continue + raise BaseException('Error: incorect utf replacement.') + + #calibre rtf2xml does not support utfreplace + replace = [] + + newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace)) + if partialData != None: + newTokens.append(partialData) + continue + + newTokens.append(self.tokens[i]) + i = i + 1 + + self.tokens = list(newTokens) + + + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) + + +class RtfTokenizer(): + def __init__(self, rtfData): + self.rtfData = [] + self.tokens = [] + self.rtfData = rtfData + self.tokenize() + + def tokenize(self): + i = 0 + lastDataStart = -1 + while i < len(self.rtfData): + + if isChar(self.rtfData[i], '{'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorStart()) + i = i + 1 + continue + + if isChar(self.rtfData[i], '}'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorEnd()) + i = i + 1 + continue + + if isChar(self.rtfData[i], '\\'): + if i + 1 >= len(self.rtfData): + raise BaseException('Error: Control character found at the end of the document.') + + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + + tokenStart = i + i = i + 1 + + #Control Words + if isAsciiLetter(self.rtfData[i]): + #consume + consumed = False + while i < len(self.rtfData): + if not isAsciiLetter(self.rtfData[i]): + tokenEnd = i + consumed = True + break + i = i + 1 + + if not consumed: + raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart)) + + #we have numeric argument before delimiter + if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]): + #consume the numeric argument + consumed = False + l = 0 + while i < len(self.rtfData): + if not isDigit(self.rtfData[i]): + consumed = True + break + l = l + 1 + i = i + 1 + if l > 10 : + raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart]) + + if not consumed: + raise BaseException('Error (at:%d): Control Word without numeric argument end.'%[tokenStart]) + + separator = '' + if isChar(self.rtfData[i], ' '): + separator = ' ' + + controlWord = self.rtfData[tokenStart: tokenEnd] + if tokenEnd < i: + value = int(self.rtfData[tokenEnd: i]) + if isString(controlWord, "\\bin"): + i = i + value + self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator)) + else: + self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator)) + else: + self.tokens.append(tokenControlWord(controlWord, separator)) + #space delimiter, we should discard it + if self.rtfData[i] == ' ': + i = i + 1 + + #Control Symbol + else: + self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1])) + i = i + 1 + continue + + if lastDataStart < 0: + lastDataStart = i + i = i + 1 + + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) + + +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print ("Usage %prog rtfFileToConvert") + sys.exit() + f = open(sys.argv[1], 'rb') + data = f.read() + f.close() + + tokenizer = RtfTokenizer(data) + parsedTokens = RtfTokenParser(tokenizer.tokens) + + data = parsedTokens.toRTF() + + f = open(sys.argv[1], 'w') + f.write(data) + f.close() + + From 96e32a2deff9676570e15ff33a1a6f168cdef16d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 20:04:19 -0700 Subject: [PATCH 6/7] New recipe for drivelry.com by Krittika Goyal --- resources/recipes/drivelry.recipe | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 resources/recipes/drivelry.recipe diff --git a/resources/recipes/drivelry.recipe b/resources/recipes/drivelry.recipe new file mode 100644 index 0000000000..9e001ba530 --- /dev/null +++ b/resources/recipes/drivelry.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class drivelrycom(BasicNewsRecipe): + title = u'drivelry.com' + language = 'en' + description = 'A blog by Mike Abrahams' + __author__ = 'Krittika Goyal' + oldest_article = 60 #days + max_articles_per_feed = 25 + #encoding = 'latin1' + + remove_stylesheets = True + #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + remove_tags_after = dict(name='div', attrs={'id':'bookmark'}) + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['sidebar']}), + dict(name='div', attrs={'id':['bookmark']}), + #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}), + #dict(name='ul', attrs={'class':'articleTools'}), + ] + + feeds = [ +('drivelry.com', + 'http://feeds.feedburner.com/drivelry'), + +] + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'main'}) + #td = heading.findParent(name='td') + #td.extract() + soup = BeautifulSoup(''' +t +

To donate to this blog: click here

+ +''') + body = soup.find(name='body') + body.insert(0, story) + return soup From 547e984accf3e44b8b205ca5759481b9e249c475 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Jan 2010 20:12:54 -0700 Subject: [PATCH 7/7] RTF metadata: Fix reading metadata from very small files --- src/calibre/ebooks/metadata/rtf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/rtf.py b/src/calibre/ebooks/metadata/rtf.py index 7f418de8d7..d116ec30fb 100644 --- a/src/calibre/ebooks/metadata/rtf.py +++ b/src/calibre/ebooks/metadata/rtf.py @@ -25,12 +25,14 @@ def get_document_info(stream): while not found: prefix = block[-6:] block = prefix + stream.read(block_size) + actual_block_size = len(block) - len(prefix) if len(block) == len(prefix): break idx = block.find(r'{\info') if idx >= 0: found = True - stream.seek(stream.tell() - block_size + idx - len(prefix)) + pos = stream.tell() - actual_block_size + idx - len(prefix) + stream.seek(pos) else: if block.find(r'\sect') > -1: break