Sync to trunk.

2025-07-09 03:04:10 -04:00 · 2010-01-18 06:47:16 -05:00 · 2010-01-18 06:47:16 -05:00 · 4b3f998e9c
commit 4b3f998e9c
parent 6e52a2ada0 547e984acc
9 changed files with 468 additions and 6 deletions
--- a/resources/recipes/drivelry.recipe
+++ b/resources/recipes/drivelry.recipe
@ -0,0 +1,41 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class drivelrycom(BasicNewsRecipe):
+    # Recipe for the drivelry.com blog, read from its FeedBurner feed.
+    title          = u'drivelry.com'
+    language       = 'en'
+    description    = 'A blog by Mike Abrahams'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 60 #days
+    max_articles_per_feed = 25
+    #encoding = 'latin1'
+
+    remove_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    remove_tags_after  = dict(name='div', attrs={'id':'bookmark'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':['sidebar']}),
+       dict(name='div', attrs={'id':['bookmark']}),
+       #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
+       #dict(name='ul', attrs={'class':'articleTools'}),
+    ]
+
+    feeds          = [
+        ('drivelry.com', 'http://feeds.feedburner.com/drivelry'),
+    ]
+
+    def preprocess_html(self, soup):
+        '''
+        Rebuild the article page around its ``div`` with id ``main``.
+
+        :param soup: parsed article page
+        :return: a new document holding that div followed by a donation
+                 link, or ``soup`` unchanged when the page has no such div
+        '''
+        story = soup.find(name='div', attrs={'id':'main'})
+        if story is None:
+            # Nothing to lift out; inserting None into the new body would
+            # fail, so hand the page back as it came.
+            return soup
+        wrapper = BeautifulSoup('''
+<html><head><title>t</title></head><body>
+<p>To donate to this blog: <a href="http://www.drivelry.com/thank-you/">click here</a></p>
+</body></html>
+''')
+        body = wrapper.find(name='body')
+        body.insert(0, story)
+        return wrapper
--- a/resources/recipes/greader_uber.recipe
+++ b/resources/recipes/greader_uber.recipe
@ -0,0 +1,38 @@
+import urllib, re, mechanize
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre import __appname__
+
+class GoogleReaderUber(BasicNewsRecipe):
+    # Builds one feed per entry returned by the Google Reader tag list,
+    # requesting only items not yet marked as read (see get_options).
+    title   = 'Google Reader Uber'
+    description = 'This recipe downloads all unread feeds from your Google Reader account.'
+    needs_subscription = True
+    __author__ = 'rollercoaster, davec'
+    base_url = 'http://www.google.com/reader/atom/'
+    oldest_article = 365
+    max_articles_per_feed = 250
+    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
+    use_embedded_content = True
+
+    def get_browser(self):
+        '''
+        Return the object used to fetch pages for this recipe.
+
+        When both username and password are set, they are posted to
+        Google ClientLogin and the ``SID`` value from the response is
+        stored as a cookie on a fresh mechanize opener, which is returned.
+        Otherwise the stock recipe browser is returned.
+
+        :raises ValueError: if the login response carries no ``SID``
+        '''
+        # The base method is an instance method, so self must be passed on.
+        br = BasicNewsRecipe.get_browser(self)
+
+        if self.username is not None and self.password is not None:
+            request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
+                                        ('service', 'reader'), ('source', __appname__)])
+            response = br.open('https://www.google.com/accounts/ClientLogin', request)
+            match = re.search(r'SID=(\S*)', response.read())
+            if match is None:
+                raise ValueError('Google ClientLogin response did not contain a SID')
+            sid = match.group(1)
+
+            cookies = mechanize.CookieJar()
+            br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
+            cookies.set_cookie(mechanize.Cookie(None, 'SID', sid, None, False, '.google.com', True, True, '/', True, False, None, True, '', '', None))
+        return br
+
+
+    def get_feeds(self):
+        '''
+        Return ``(title, url)`` pairs, one per ``id`` entry of the tag list.
+
+        The title is the last path component of the id; the url is the id
+        (with ``broadcast`` replaced by ``reading-list``) quoted onto
+        ``base_url`` and followed by ``get_options``.
+        '''
+        feeds = []
+        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
+        for tag in soup.findAll(True, attrs={'name':['id']}):
+            url = tag.contents[0].replace('broadcast','reading-list')
+            # Text after the last '/', or the whole id when it has no '/'.
+            feed_title = url.rpartition('/')[2]
+            feeds.append((feed_title,
+                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
+        return feeds
--- a/resources/recipes/ledevoir.recipe
+++ b/resources/recipes/ledevoir.recipe
@ -25,7 +25,6 @@ class ledevoir(BasicNewsRecipe):
    encoding       = 'utf-8'
    timefmt        = '[%a, %d %b, %Y]'

-    oldest_article = 1
    max_articles_per_feed = 50
    use_embedded_content  = False
    recursion             = 10
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe):
                feeds.append((current_section, current_articles))

            return feeds
+
    def preprocess_html(self, soup):
        story = soup.find(name='div', attrs={'class':'triline'})
-        #td = heading.findParent(name='td')
-        #td.extract()
+        page2_link = soup.find('p','pagenav')
+        if page2_link:
+            atag = page2_link.find('a',href=True)
+            if atag:
+                page2_url = atag['href']
+                if page2_url.startswith('story'):
+                         page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
+                elif page2_url.startswith( '/todays-paper/story.html'):
+                    page2_url = 'http://www.nationalpost.com/'+page2_url
+                page2_soup = self.index_to_soup(page2_url)
+                if page2_soup:
+                    page2_content = page2_soup.find('div','story-content')
+                    if page2_content:
+                        full_story = BeautifulSoup('<div></div>')
+                        full_story.insert(0,story)
+                        full_story.insert(1,page2_content)
+                        story = full_story
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup
+
--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@ -14,6 +14,7 @@ Windows PNP strings:
 2W00000&1', 3, u'G:\\')

 '''
+import re

 from calibre.devices.usbms.driver import USBMS

@ -108,6 +109,7 @@ class POCKETBOOK360(EB600):

    OSX_MAIN_MEM   = 'Philips Mass Storge Media'
    OSX_CARD_A_MEM = 'Philips Mass Storge Media'
+    OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Pocket')

    @classmethod
    def can_handle(cls, dev, debug=False):
--- a/src/calibre/ebooks/metadata/rtf.py
+++ b/src/calibre/ebooks/metadata/rtf.py
@ -25,12 +25,14 @@ def get_document_info(stream):
    while not found:
        prefix = block[-6:]
        block = prefix + stream.read(block_size)
+        actual_block_size = len(block) - len(prefix)
        if len(block) == len(prefix):
            break
        idx = block.find(r'{\info')
        if idx >= 0:
            found = True
-            stream.seek(stream.tell() - block_size + idx - len(prefix))
+            pos = stream.tell() - actual_block_size + idx - len(prefix)
+            stream.seek(pos)
        else:
            if block.find(r'\sect') > -1:
                break
--- a/src/calibre/ebooks/oeb/transforms/structure.py
+++ b/src/calibre/ebooks/oeb/transforms/structure.py
@ -90,7 +90,10 @@ class DetectStructure(object):
                    mark = etree.Element(XHTML('div'), style=page_break_after)
                else: # chapter_mark == 'both':
                    mark = etree.Element(XHTML('hr'), style=page_break_before)
-                elem.addprevious(mark)
+                try:
+                    elem.addprevious(mark)
+                except TypeError:
+                    self.log.exception('Failed to mark chapter')

    def create_level_based_toc(self):
        if self.opts.level1_toc is None:
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
        with open('styles.css', 'ab') as f:
            f.write(css)

+    def preprocess(self, fname):
+        '''
+        Run the RTF file through the tokenizer/token parser and write the
+        rebuilt document to ``preprocessed.rtf`` in the current directory.
+
+        :param fname: path of the RTF file to read
+        :return: ``'preprocessed.rtf'`` when the rewritten file was fully
+                 written, otherwise ``fname`` unchanged (the failure is
+                 logged and otherwise ignored)
+        '''
+        self.log('\tPreprocessing to convert unicode characters')
+        try:
+            from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
+            with open(fname, 'rb') as f:
+                data = f.read()
+            tokenizer = RtfTokenizer(data)
+            tokens = RtfTokenParser(tokenizer.tokens)
+            data = tokens.toRTF()
+            out_name = 'preprocessed.rtf'
+            with open(out_name, 'wb') as f:
+                f.write(data)
+            # Switch to the new file only once it has been written, so a
+            # failed write falls back to the original input.
+            fname = out_name
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except:
+            # Deliberately broad: the preprocess module raises BaseException.
+            self.log.exception(
+            'Failed to preprocess RTF to convert unicode sequences, ignoring...')
+        return fname

    def convert(self, stream, options, file_ext, log,
                accelerators):
@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        self.log = log
        self.log('Converting RTF to XML...')
+        fname = self.preprocess(stream.name)
        try:
-            xml = self.generate_xml(stream.name)
+            xml = self.generate_xml(fname)
        except RtfInvalidCodeException:
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.'))
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@ -0,0 +1,344 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Gerendi Sandor Attila'
+__docformat__ = 'restructuredtext en'
+
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
+
+At this point this will tokenize an RTF file and then rebuild it from the tokens.
+In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
+"""
+
+class tokenDelimitatorStart():
+    # Stateless token emitted by the tokenizer for a '{' character.
+    def __repr__(self):
+        return '{'
+    def toRTF(self):
+        return b'{'
+
+class tokenDelimitatorEnd():
+    # Stateless token emitted by the tokenizer for a '}' character.
+    def __repr__(self):
+        return '}'
+    def toRTF(self):
+        return b'}'
+
+class tokenControlWord():
+    # A control word without a numeric argument. name holds the word
+    # including its backslash; separator is ' ' when a space followed it.
+    def __init__(self, name, separator = ''):
+        self.name, self.separator = name, separator
+    def toRTF(self):
+        return ''.join((self.name, self.separator))
+    __repr__ = toRTF
+
+class tokenControlWordWithNumericArgument():
+    # A control word followed by an integer argument, e.g. name '\\uc' and
+    # argument 1. separator is ' ' when a space followed the argument.
+    def __init__(self, name, argument, separator = ''):
+        self.name = name
+        self.argument = argument
+        self.separator = separator
+    def toRTF(self):
+        # str() rather than repr(): repr() of a Python 2 long ends in 'L',
+        # which would be written into the control word.
+        return self.name + str(self.argument) + self.separator
+    def __repr__(self):
+        return self.toRTF()
+
+class tokenControlSymbol():
+    # A control symbol; name is stored and re-emitted verbatim.
+    def __init__(self, name):
+        self.name = name
+    def toRTF(self):
+        return self.name
+    __repr__ = toRTF
+
+class tokenData():
+    # A run of document content that is not a delimiter or control token;
+    # data is stored and re-emitted verbatim.
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return self.data
+    __repr__ = toRTF
+
+class tokenBinN():
+    # A \bin control word with its payload; the emitted length is always
+    # recomputed from len(data).
+    def __init__(self, data, separator = ''):
+        self.data, self.separator = data, separator
+    def toRTF(self):
+        return ''.join(("\\bin", repr(len(self.data)), self.separator, self.data))
+    __repr__ = toRTF
+
+class token8bitChar():
+    # A \'hh escape; data holds the characters that follow the \' symbol.
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return ''.join(("\\'", self.data))
+    __repr__ = toRTF
+
+class tokenUnicode():
+    # A \uN unicode escape.
+    #   data        - the numeric argument of \u
+    #   separator   - stored but not used when emitting
+    #   current_ucn - the \uc count in effect where the escape appeared
+    #   eqList      - tokens (objects with toRTF()) emitted after the escape
+    def __init__(self, data, separator = '', current_ucn = 1, eqList = None):
+        self.data = data
+        self.separator = separator
+        self.current_ucn = current_ucn
+        # A fresh list per instance; a shared [] default would be common
+        # to every token created without an explicit eqList.
+        self.eqList = [] if eqList is None else eqList
+    def toRTF(self):
+        result = '\\u' + repr(self.data) + ' '
+        ucn = self.current_ucn
+        if len(self.eqList) < ucn:
+            # Fewer tokens follow than \uc announced: emit a matching \uc
+            # in front of the escape.
+            ucn = len(self.eqList)
+            result = '\\uc' + repr(ucn) + result
+        # Emit at most ucn of the following tokens.
+        for i, eq in enumerate(self.eqList):
+            if i >= ucn:
+                break
+            result = result + eq.toRTF()
+        return result
+    def __repr__(self):
+        return '\\u' + repr(self.data)
+
+
+def isAsciiLetter(value):
+    # True when value compares within 'a'..'z' or within 'A'..'Z'.
+    return ('a' <= value <= 'z') or ('A' <= value <= 'Z')
+
+def isDigit(value):
+    # True when value compares within '0'..'9'.
+    return '0' <= value <= '9'
+
+def isChar(value, char):
+    # True when value equals the expected character char.
+    return value == char
+
+def isString(buffer, string):
+    # True when buffer equals the expected string.
+    return buffer == string
+
+
+class RtfTokenParser():
+    # Rewrites a token list (as produced by RtfTokenizer) in two passes run
+    # from the constructor:
+    #   process()        - merges each \' symbol with its two data
+    #                      characters into a token8bitChar
+    #   processUnicode() - turns each \uN control word into a tokenUnicode
+    #                      and drops the replacement characters after it
+    # The result is left in self.tokens; toRTF() serialises it.
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.process()
+        self.processUnicode()
+
+    def process(self):
+        # Raises ValueError when a \' symbol is not followed by a data
+        # token of at least two characters.
+        i = 0
+        newTokens = []
+        count = len(self.tokens)
+        while i < count:
+            token = self.tokens[i]
+            if isinstance(token, tokenControlSymbol) and isString(token.name, "\\'"):
+                i = i + 1
+                # A \' at the very end has no following token at all.
+                if i >= count or not isinstance(self.tokens[i], tokenData):
+                    raise ValueError('Error: token8bitChar without data.')
+                data = self.tokens[i].data
+                if len(data) < 2:
+                    raise ValueError('Error: token8bitChar without data.')
+                newTokens.append(token8bitChar(data[0:2]))
+                # Whatever follows the two characters stays ordinary data.
+                if len(data) > 2:
+                    newTokens.append(tokenData(data[2:]))
+                i = i + 1
+                continue
+
+            newTokens.append(token)
+            i = i + 1
+
+        self.tokens = newTokens
+
+    def processUnicode(self):
+        # Raises ValueError when something other than data, a \'hh char or
+        # a \bin token appears where replacement characters are expected.
+        i = 0
+        newTokens = []
+        # Stack of the \uc value in effect, one entry per open group; the
+        # first entry (1) applies outside any group.
+        ucNbStack = [1]
+        while i < len(self.tokens):
+            token = self.tokens[i]
+            if isinstance(token, tokenDelimitatorStart):
+                # A new group starts with the value of its parent.
+                ucNbStack.append(ucNbStack[-1])
+                newTokens.append(token)
+                i = i + 1
+                continue
+            if isinstance(token, tokenDelimitatorEnd):
+                # Keep the outermost entry so an unbalanced '}' cannot
+                # empty the stack.
+                if len(ucNbStack) > 1:
+                    ucNbStack.pop()
+                newTokens.append(token)
+                i = i + 1
+                continue
+            if isinstance(token, tokenControlWordWithNumericArgument):
+                if isString(token.name, '\\uc'):
+                    ucNbStack[-1] = token.argument
+                    newTokens.append(token)
+                    i = i + 1
+                    continue
+                if isString(token.name, '\\u'):
+                    i = i + 1
+                    ucn = ucNbStack[-1]
+                    skipped = 0
+                    partialData = None
+                    # Skip up to ucn replacement characters after the escape.
+                    while (i < len(self.tokens)) and (skipped < ucn):
+                        fallback = self.tokens[i]
+                        if isinstance(fallback, (tokenDelimitatorStart, tokenDelimitatorEnd)):
+                            break
+                        if isinstance(fallback, tokenData):
+                            needed = ucn - skipped
+                            if len(fallback.data) >= needed:
+                                # Only the head of this data is replacement
+                                # text; keep the rest as ordinary data.
+                                if len(fallback.data) > needed:
+                                    partialData = tokenData(fallback.data[needed:])
+                                i = i + 1
+                                break
+                            skipped = skipped + len(fallback.data)
+                            i = i + 1
+                            continue
+                        if isinstance(fallback, (token8bitChar, tokenBinN)):
+                            # Each of these counts as one character.
+                            i = i + 1
+                            skipped = skipped + 1
+                            continue
+                        raise ValueError('Error: incorect utf replacement.')
+
+                    #calibre rtf2xml does not support utfreplace, so the
+                    #skipped replacement tokens are not passed on
+                    newTokens.append(tokenUnicode(token.argument, token.separator, ucn, []))
+                    if partialData is not None:
+                        newTokens.append(partialData)
+                    continue
+
+            newTokens.append(token)
+            i = i + 1
+
+        self.tokens = newTokens
+
+
+    def toRTF(self):
+        # Concatenate the RTF text of every token, in order.
+        return "".join([token.toRTF() for token in self.tokens])
+
+
+class RtfTokenizer():
+    # Splits RTF text into tokens. The constructor tokenizes rtfData
+    # immediately and leaves the result in self.tokens; toRTF() serialises
+    # the tokens again.
+    def __init__(self, rtfData):
+        self.tokens = []
+        self.rtfData = rtfData
+        self.tokenize()
+
+    def _flushData(self, start, end):
+        # Append the pending run of plain data, if there is one.
+        if start > -1:
+            self.tokens.append(tokenData(self.rtfData[start : end]))
+
+    def tokenize(self):
+        # Raises ValueError on a backslash at the very end, a control word
+        # or numeric argument that runs to the end of the data, an argument
+        # of more than 10 digits, or a \bin length that does not fit.
+        data = self.rtfData
+        size = len(data)
+        i = 0
+        lastDataStart = -1
+        while i < size:
+
+            if isChar(data[i], '{'):
+                self._flushData(lastDataStart, i)
+                lastDataStart = -1
+                self.tokens.append(tokenDelimitatorStart())
+                i = i + 1
+                continue
+
+            if isChar(data[i], '}'):
+                self._flushData(lastDataStart, i)
+                lastDataStart = -1
+                self.tokens.append(tokenDelimitatorEnd())
+                i = i + 1
+                continue
+
+            if isChar(data[i], '\\'):
+                if i + 1 >= size:
+                    raise ValueError('Error: Control character found at the end of the document.')
+
+                self._flushData(lastDataStart, i)
+                lastDataStart = -1
+
+                tokenStart = i
+                i = i + 1
+
+                #Control Words
+                if isAsciiLetter(data[i]):
+                    #consume <ASCII Letter Sequence>
+                    while i < size and isAsciiLetter(data[i]):
+                        i = i + 1
+                    if i >= size:
+                        raise ValueError('Error (at:%d): Control Word without end.'%(tokenStart))
+                    tokenEnd = i
+
+                    #optional numeric argument, possibly negative
+                    if isChar(data[i], '-') or isDigit(data[i]):
+                        if isChar(data[i], '-'):
+                            i = i + 1
+                        digitsStart = i
+                        while i < size and isDigit(data[i]):
+                            i = i + 1
+                            if i - digitsStart > 10:
+                                raise ValueError('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))
+                        if i == digitsStart:
+                            #a '-' without digits is not an argument; rewind
+                            #so it is read as data
+                            i = tokenEnd
+                        elif i >= size:
+                            raise ValueError('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))
+
+                    separator = ''
+                    if isChar(data[i], ' '):
+                        separator = ' '
+
+                    controlWord = data[tokenStart: tokenEnd]
+                    if tokenEnd < i:
+                        value = int(data[tokenEnd: i])
+                        if isString(controlWord, "\\bin"):
+                            #the payload is the value bytes after the delimiter
+                            payloadStart = i + len(separator)
+                            payloadEnd = payloadStart + value
+                            if value < 0 or payloadEnd > size:
+                                raise ValueError('Error (at:%d): Invalid \\bin data length.'%(tokenStart))
+                            self.tokens.append(tokenBinN(data[payloadStart : payloadEnd], separator))
+                            i = payloadEnd
+                            continue
+                        self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+                    else:
+                        self.tokens.append(tokenControlWord(controlWord, separator))
+                    #space delimiter, we should discard it
+                    if separator:
+                        i = i + 1
+
+                #Control Symbol
+                else:
+                    self.tokens.append(tokenControlSymbol(data[tokenStart : i + 1]))
+                    i = i + 1
+                continue
+
+            if lastDataStart < 0:
+                lastDataStart = i
+            i = i + 1
+
+        #data after the last delimiter or control token is kept too
+        self._flushData(lastDataStart, size)
+
+    def toRTF(self):
+        # Concatenate the RTF text of every token, in order.
+        return "".join([token.toRTF() for token in self.tokens])
+
+
+if __name__ == "__main__":
+    # Command line use: rewrite the given RTF file in place.
+    import sys
+    if len(sys.argv) < 2:
+        print ("Usage: %s rtfFileToConvert" % sys.argv[0])
+        sys.exit()
+    with open(sys.argv[1], 'rb') as f:
+        data = f.read()
+
+    tokenizer = RtfTokenizer(data)
+    parsedTokens = RtfTokenParser(tokenizer.tokens)
+
+    data = parsedTokens.toRTF()
+
+    # Binary mode to match the read; text mode would translate line
+    # endings on platforms that distinguish the two.
+    with open(sys.argv[1], 'wb') as f:
+        f.write(data)
+
+