Sync to trunk.

This commit is contained in:
John Schember 2010-01-18 06:47:16 -05:00
commit 4b3f998e9c
9 changed files with 468 additions and 6 deletions

View File

@ -0,0 +1,41 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class drivelrycom(BasicNewsRecipe):
    """Recipe that downloads the drivelry.com blog from its FeedBurner feed."""

    title = u'drivelry.com'
    language = 'en'
    description = 'A blog by Mike Abrahams'
    __author__ = 'Krittika Goyal'
    oldest_article = 60  # days
    max_articles_per_feed = 25
    remove_stylesheets = True
    remove_tags_after = dict(name='div', attrs={'id':'bookmark'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['sidebar']}),
        dict(name='div', attrs={'id':['bookmark']}),
    ]

    feeds = [
        ('drivelry.com',
         'http://feeds.feedburner.com/drivelry'),
    ]

    def preprocess_html(self, soup):
        """Rebuild the page around the ``div#main`` element only.

        The story div is moved into a minimal document that also carries a
        donation link. When the page has no ``div#main`` the soup is returned
        unchanged instead of failing while inserting ``None``.
        """
        story = soup.find(name='div', attrs={'id':'main'})
        if story is None:
            # Unexpected page layout: keep whatever was downloaded.
            return soup
        wrapper = BeautifulSoup(
            '\n'
            '<html><head><title>t</title></head><body>\n'
            '<p>To donate to this blog: <a href="http://www.drivelry.com/thank-you/">click here</a></p>\n'
            '</body></html>\n'
        )
        body = wrapper.find(name='body')
        # Insert at index 0 so the story comes before the donation paragraph.
        body.insert(0, story)
        return wrapper

View File

@ -0,0 +1,38 @@
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
class GoogleReaderUber(BasicNewsRecipe):
    """Recipe that downloads the unread items of every Google Reader tag."""

    title = 'Google Reader Uber'
    description = 'This recipe downloads all unread feeds from your Google Reader account.'
    needs_subscription = True
    __author__ = 'rollercoaster, davec'
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 365
    max_articles_per_feed = 250
    # Query string appended to every feed URL: item count plus an "exclude
    # target" for the read state.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
    use_embedded_content = True

    def get_browser(self):
        """Return a browser; when credentials are set, one carrying the SID cookie.

        Raises:
            ValueError: the ClientLogin response contains no ``SID=`` entry.
        """
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
                                        ('service', 'reader'), ('source', __appname__)])
            response = br.open('https://www.google.com/accounts/ClientLogin', request)
            match = re.search(r'SID=(\S*)', response.read())
            if match is None:
                # Fail with a readable message rather than an AttributeError
                # on ``None.group``.
                raise ValueError('Google ClientLogin response did not contain a SID')
            sid = match.group(1)
            cookies = mechanize.CookieJar()
            # The opener built here replaces the browser obtained above.
            br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
            cookies.set_cookie(mechanize.Cookie(None, 'SID', sid, None, False, '.google.com', True, True, '/', True, False, None, True, '', '', None))
        return br

    def get_feeds(self):
        """Return ``(title, url)`` pairs, one per entry of the Reader tag list."""
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for tag in soup.findAll(True, attrs={'name':['id']}):
            url = tag.contents[0].replace('broadcast','reading-list')
            # The feed title is the last path component of the tag id.
            feeds.append((re.search('/([^/]*)$', url).group(1),
                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds

View File

@ -25,7 +25,6 @@ class ledevoir(BasicNewsRecipe):
encoding = 'utf-8'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
max_articles_per_feed = 50
use_embedded_content = False
recursion = 10

View File

@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe):
feeds.append((current_section, current_articles))
return feeds
def preprocess_html(self, soup):
    """Reduce the page to its story, appending the second page when present.

    The ``div.triline`` element is taken as the story. If a ``p.pagenav``
    link exists, the linked page is downloaded and its ``div.story-content``
    is appended. The result is wrapped in a minimal HTML document.

    When the page has no ``div.triline`` the soup is returned unchanged
    instead of failing while inserting ``None``.
    """
    story = soup.find(name='div', attrs={'class':'triline'})
    if story is None:
        return soup

    page2_content = None
    page2_link = soup.find('p', 'pagenav')
    atag = page2_link.find('a', href=True) if page2_link else None
    if atag:
        page2_url = atag['href']
        if page2_url.startswith('story'):
            page2_url = 'http://www.nationalpost.com/todays-paper/' + page2_url
        elif page2_url.startswith('/todays-paper/story.html'):
            # The href already starts with '/', so the host must not end in
            # one (the old code produced "...com//todays-paper/...").
            page2_url = 'http://www.nationalpost.com' + page2_url
        page2_soup = self.index_to_soup(page2_url)
        if page2_soup:
            page2_content = page2_soup.find('div', 'story-content')

    if page2_content:
        full_story = BeautifulSoup('<div></div>')
        full_story.insert(0, story)
        full_story.insert(1, page2_content)
        story = full_story

    wrapper = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
    body = wrapper.find(name='body')
    body.insert(0, story)
    return wrapper

View File

@ -14,6 +14,7 @@ Windows PNP strings:
2W00000&1', 3, u'G:\\')
'''
import re
from calibre.devices.usbms.driver import USBMS
@ -108,6 +109,7 @@ class POCKETBOOK360(EB600):
OSX_MAIN_MEM = 'Philips Mass Storge Media'
OSX_CARD_A_MEM = 'Philips Mass Storge Media'
OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Pocket')
@classmethod
def can_handle(cls, dev, debug=False):

View File

@ -25,12 +25,14 @@ def get_document_info(stream):
while not found:
prefix = block[-6:]
block = prefix + stream.read(block_size)
actual_block_size = len(block) - len(prefix)
if len(block) == len(prefix):
break
idx = block.find(r'{\info')
if idx >= 0:
found = True
stream.seek(stream.tell() - block_size + idx - len(prefix))
pos = stream.tell() - actual_block_size + idx - len(prefix)
stream.seek(pos)
else:
if block.find(r'\sect') > -1:
break

View File

@ -90,7 +90,10 @@ class DetectStructure(object):
mark = etree.Element(XHTML('div'), style=page_break_after)
else: # chapter_mark == 'both':
mark = etree.Element(XHTML('hr'), style=page_break_before)
elem.addprevious(mark)
try:
elem.addprevious(mark)
except TypeError:
self.log.exception('Failed to mark chapter')
def create_level_based_toc(self):
if self.opts.level1_toc is None:

View File

@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
with open('styles.css', 'ab') as f:
f.write(css)
def preprocess(self, fname):
    """Rewrite the RTF file's unicode escapes; return the path to convert.

    The file is tokenized, rebuilt and written to ``preprocessed.rtf`` in the
    current directory. This step is best-effort: on any failure the error is
    logged and the original ``fname`` is returned.

    Args:
        fname: path of the RTF file to preprocess.

    Returns:
        ``'preprocessed.rtf'`` when the rewrite succeeded, else ``fname``.
    """
    self.log('\tPreprocessing to convert unicode characters')
    try:
        # Close the input promptly instead of leaking the handle.
        with open(fname, 'rb') as f:
            data = f.read()
        from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
        tokenizer = RtfTokenizer(data)
        tokens = RtfTokenParser(tokenizer.tokens)
        data = tokens.toRTF()
        out_name = 'preprocessed.rtf'
        with open(out_name, 'wb') as f:
            f.write(data)
        # Switch to the new file only after it was written completely, so a
        # failed write still falls back to the original document.
        fname = out_name
    except:
        # Deliberately catches everything: preprocessing must never abort
        # the conversion.
        self.log.exception(
            'Failed to preprocess RTF to convert unicode sequences, ignoring...')
    return fname
def convert(self, stream, options, file_ext, log,
accelerators):
@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
self.log = log
self.log('Converting RTF to XML...')
fname = self.preprocess(stream.name)
try:
xml = self.generate_xml(stream.name)
xml = self.generate_xml(fname)
except RtfInvalidCodeException:
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.'))

View File

@ -0,0 +1,344 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'
"""
RTF tokenizer and token parser. v.1.0 (1/17/2010)
Author: Gerendi Sandor Attila
At this point this will tokenize an RTF file then rebuild it from the tokens.
In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
"""
class tokenDelimitatorStart():
    """Token for the ``{`` that opens an RTF group."""

    def __init__(self):
        pass

    def toRTF(self):
        """Return the RTF text of this token.

        A native string literal is used, like every other token class in
        this module, so the pieces can be combined with ``"".join``. On
        Python 2 this is the same value as the former ``b'{'``.
        """
        return '{'

    def __repr__(self):
        return '{'
class tokenDelimitatorEnd():
    """Token for the ``}`` that closes an RTF group."""

    def __init__(self):
        pass

    def toRTF(self):
        """Return the RTF text of this token.

        A native string literal is used, like every other token class in
        this module, so the pieces can be combined with ``"".join``. On
        Python 2 this is the same value as the former ``b'}'``.
        """
        return '}'

    def __repr__(self):
        return '}'
class tokenControlWord():
    """A control word without a numeric argument.

    ``name`` is the control word text and ``separator`` the delimiter text
    that follows it (empty by default).
    """

    def __init__(self, name, separator = ''):
        self.separator = separator
        self.name = name

    def toRTF(self):
        """Return the control word followed by its separator."""
        return "".join((self.name, self.separator))

    def __repr__(self):
        # Same text as the RTF form.
        return self.toRTF()
class tokenControlWordWithNumericArgument():
    """A control word that carries a numeric argument.

    ``name`` is the control word text, ``argument`` the number and
    ``separator`` the delimiter text that follows (empty by default).
    """

    def __init__(self, name, argument, separator = ''):
        self.name = name
        self.argument = argument
        self.separator = separator

    def toRTF(self):
        """Return ``name`` + ``argument`` + ``separator`` as RTF text."""
        # str() rather than repr(): on Python 2 repr() of a long appends an
        # 'L' suffix, which would corrupt the emitted control word.
        return self.name + str(self.argument) + self.separator

    def __repr__(self):
        return self.name + str(self.argument) + self.separator
class tokenControlSymbol():
    """A control symbol token; ``name`` holds its complete text."""

    def __init__(self, name):
        self.name = name

    def toRTF(self):
        """Return the stored symbol text unchanged."""
        symbol = self.name
        return symbol

    def __repr__(self):
        # Same text as the RTF form.
        return self.toRTF()
class tokenData():
    """A run of plain document text; ``data`` holds the text."""

    def __init__(self, data):
        self.data = data

    def toRTF(self):
        """Return the stored text unchanged."""
        text = self.data
        return text

    def __repr__(self):
        # Same text as the RTF form.
        return self.toRTF()
class tokenBinN():
    r"""A ``\binN`` token: ``data`` is the payload, ``separator`` the delimiter."""

    def __init__(self, data, separator = ''):
        self.separator = separator
        self.data = data

    def toRTF(self):
        r"""Return ``\bin`` + payload length + separator + payload."""
        pieces = ("\\bin", repr(len(self.data)), self.separator, self.data)
        return "".join(pieces)

    def __repr__(self):
        # Same text as the RTF form.
        return self.toRTF()
class token8bitChar():
    r"""A ``\'`` escape; ``data`` holds the characters that follow the symbol."""

    def __init__(self, data):
        self.data = data

    def toRTF(self):
        r"""Return ``\'`` followed by the stored data."""
        return "".join(("\\'", self.data))

    def __repr__(self):
        # Same text as the RTF form.
        return self.toRTF()
class tokenUnicode():
    r"""A ``\uN`` unicode escape together with its fallback tokens.

    Args:
        data: the numeric argument of ``\u``.
        separator: delimiter text that followed the control word (stored,
            not used when emitting RTF).
        current_ucn: the ``\uc`` value in effect where the escape appeared.
        eqList: fallback tokens (objects with ``toRTF()``) that follow the
            escape. Defaults to a new empty list for every instance.
    """

    def __init__(self, data, separator = '', current_ucn = 1, eqList = None):
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        # A ``[]`` default would be one list shared by all instances.
        self.eqList = [] if eqList is None else eqList

    def toRTF(self):
        r"""Return the escape as RTF text.

        The output is ``\uN`` plus a space, followed by at most ``ucn``
        fallback tokens. When fewer fallback tokens are stored than
        ``current_ucn`` announces, a ``\ucK`` prefix (K = number of stored
        tokens) is emitted first so the count matches what follows.
        """
        result = '\\u' + repr(self.data) + ' '
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            ucn = len(self.eqList)
            result = '\\uc%d' % ucn + result
        # Emit no more than ``ucn`` fallback tokens. The previous loop never
        # advanced its counter, so the limit was not applied.
        for eq in self.eqList[:max(ucn, 0)]:
            result = result + eq.toRTF()
        return result

    def __repr__(self):
        return '\\u' + repr(self.data)
def isAsciiLetter(value):
    """Return True when *value* compares within ``a``-``z`` or ``A``-``Z``."""
    return 'a' <= value <= 'z' or 'A' <= value <= 'Z'
def isDigit(value):
    """Return True when *value* compares within ``0``-``9``."""
    return '0' <= value <= '9'
def isChar(value, char):
    """Return the result of comparing *value* with *char* for equality."""
    matches = value == char
    return matches
def isString(buffer, string):
    """Return the result of comparing *buffer* with *string* for equality."""
    same = buffer == string
    return same
class RtfTokenParser():
    r"""Post-process a token list and rebuild RTF text from it.

    Construction runs two passes over ``tokens``:

    1. ``process`` turns each ``\'`` control symbol plus the first two
       characters of the following data into a ``token8bitChar``.
    2. ``processUnicode`` turns each ``\uN`` control word into a
       ``tokenUnicode`` and drops the fallback text that follows it.

    The list passed in is not modified; results are stored in ``self.tokens``.

    Raises:
        ValueError: the token stream is malformed (see the two passes).
            ValueError replaces the bare ``BaseException`` raised before, so
            ordinary ``except Exception`` handlers see these errors too.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        r"""Fold ``\'`` symbols and their two-character payload into one token.

        Raises:
            ValueError: a ``\'`` symbol is not followed by a data token of at
                least two characters (including when it is the last token).
        """
        tokens = self.tokens
        total = len(tokens)
        newTokens = []
        i = 0
        while i < total:
            token = tokens[i]
            if isinstance(token, tokenControlSymbol) and isString(token.name, "\\'"):
                i += 1
                # The bounds check covers a stream that ends right after the
                # symbol, which used to fail with an IndexError.
                if (i >= total or not isinstance(tokens[i], tokenData)
                        or len(tokens[i].data) < 2):
                    raise ValueError('Error: token8bitChar without data.')
                data = tokens[i].data
                newTokens.append(token8bitChar(data[0:2]))
                if len(data) > 2:
                    # Text after the two payload characters stays plain data.
                    newTokens.append(tokenData(data[2:]))
                i += 1
                continue
            newTokens.append(token)
            i += 1
        self.tokens = newTokens

    def processUnicode(self):
        r"""Replace ``\uN`` control words by ``tokenUnicode`` objects.

        The ``\uc`` value is tracked per group: ``{`` copies the current
        value onto a stack, ``}`` restores the previous one and ``\ucN`` sets
        the value of the current group. The fallback text after each ``\uN``
        is consumed and not kept (see the comment at the call site).

        Raises:
            ValueError: a token that cannot be part of the fallback text is
                found before ``\uc`` characters were consumed.
        """
        tokens = self.tokens
        total = len(tokens)
        newTokens = []
        ucNbStack = [1]
        i = 0
        while i < total:
            token = tokens[i]
            if isinstance(token, tokenDelimitatorStart):
                ucNbStack.append(ucNbStack[-1])
            elif isinstance(token, tokenDelimitatorEnd):
                # Keep the outermost entry so an unbalanced '}' cannot empty
                # the stack and break the lookups below.
                if len(ucNbStack) > 1:
                    ucNbStack.pop()
            elif isinstance(token, tokenControlWordWithNumericArgument):
                if isString(token.name, '\\uc'):
                    ucNbStack[-1] = token.argument
                elif isString(token.name, '\\u'):
                    ucn = ucNbStack[-1]
                    i, partialData = self._skipUnicodeReplacement(i + 1, ucn)
                    #calibre rtf2xml does not support utfreplace
                    newTokens.append(tokenUnicode(token.argument, token.separator, ucn, []))
                    if partialData is not None:
                        newTokens.append(partialData)
                    continue
            newTokens.append(token)
            i += 1
        self.tokens = newTokens

    def _skipUnicodeReplacement(self, i, ucn):
        r"""Consume up to ``ucn`` fallback characters starting at token ``i``.

        Data tokens count one per character; ``token8bitChar`` and
        ``tokenBinN`` tokens count as one each. A group delimiter ends the
        scan early.

        Returns:
            ``(next_index, leftover)``: the index of the first token after
            the fallback, and a ``tokenData`` holding the unconsumed rest of
            a partially consumed data token (or ``None``).
        """
        tokens = self.tokens
        total = len(tokens)
        consumed = 0
        while i < total and consumed < ucn:
            token = tokens[i]
            if isinstance(token, (tokenDelimitatorStart, tokenDelimitatorEnd)):
                break
            if isinstance(token, tokenData):
                needed = ucn - consumed
                if len(token.data) >= needed:
                    leftover = None
                    if len(token.data) > needed:
                        leftover = tokenData(token.data[needed:])
                    return i + 1, leftover
                consumed += len(token.data)
                i += 1
                continue
            if isinstance(token, (token8bitChar, tokenBinN)):
                consumed += 1
                i += 1
                continue
            raise ValueError('Error: incorect utf replacement.')
        return i, None

    def toRTF(self):
        """Return the RTF text of all tokens joined together."""
        return "".join([token.toRTF() for token in self.tokens])
class RtfTokenizer():
    r"""Split RTF text into tokens.

    Construction tokenizes ``rtfData`` immediately; the result is available
    as ``self.tokens`` and can be turned back into text with ``toRTF``.

    Token kinds produced: group delimiters, control words (with or without a
    numeric argument), ``\binN`` blocks, control symbols and plain data.

    Raises:
        ValueError: the input is malformed (see ``tokenize``). ValueError
            replaces the bare ``BaseException`` raised before, so ordinary
            ``except Exception`` handlers see these errors too.
    """

    def __init__(self, rtfData):
        self.rtfData = rtfData
        self.tokens = []
        self.tokenize()

    def tokenize(self):
        """Scan ``self.rtfData`` once and append the tokens to ``self.tokens``.

        Raises:
            ValueError: a backslash is the last character of the input, or a
                control word is malformed (see ``_readControlWord``).
        """
        data = self.rtfData
        size = len(data)
        i = 0
        lastDataStart = -1
        while i < size:
            char = data[i]
            if char not in ('{', '}', '\\'):
                # Plain text: remember where the run starts and keep going.
                if lastDataStart < 0:
                    lastDataStart = i
                i += 1
                continue
            if char == '\\' and i + 1 >= size:
                raise ValueError('Error: Control character found at the end of the document.')
            # A structural character ends the pending run of plain text.
            if lastDataStart > -1:
                self.tokens.append(tokenData(data[lastDataStart:i]))
                lastDataStart = -1
            if char == '{':
                self.tokens.append(tokenDelimitatorStart())
                i += 1
            elif char == '}':
                self.tokens.append(tokenDelimitatorEnd())
                i += 1
            elif isAsciiLetter(data[i + 1]):
                i = self._readControlWord(i)
            else:
                # Control symbol: the backslash plus exactly one character.
                self.tokens.append(tokenControlSymbol(data[i:i + 2]))
                i += 2
        # Text after the last structural character used to be dropped.
        if lastDataStart > -1:
            self.tokens.append(tokenData(data[lastDataStart:]))

    def _readControlWord(self, tokenStart):
        r"""Tokenize the control word whose backslash is at ``tokenStart``.

        Handles the optional (possibly negative) numeric argument, the
        optional single-space delimiter and the payload of ``\binN``.

        Returns:
            The index of the first character after the control word.

        Raises:
            ValueError: the input ends inside the word or its argument, the
                argument has more than 10 digits, or ``\bin`` has a negative
                length.
        """
        data = self.rtfData
        size = len(data)

        #consume <ASCII Letter Sequence>
        i = tokenStart + 1
        while i < size and isAsciiLetter(data[i]):
            i += 1
        if i >= size:
            raise ValueError('Error (at:%d): Control Word without end.' % tokenStart)
        tokenEnd = i

        # Optional numeric argument. A '-' counts only when a digit follows;
        # otherwise it is left in place and becomes ordinary text.
        negative = data[i] == '-' and i + 1 < size and isDigit(data[i + 1])
        if negative or isDigit(data[i]):
            if negative:
                i += 1
            digits = 0
            while i < size and isDigit(data[i]):
                digits += 1
                i += 1
                if digits > 10:
                    raise ValueError('Error (at:%d): Too many digits in control word numeric argument.' % tokenStart)
            if i >= size:
                raise ValueError('Error (at:%d): Control Word without numeric argument end.' % tokenStart)

        separator = ''
        if data[i] == ' ':
            separator = ' '
        controlWord = data[tokenStart:tokenEnd]

        if tokenEnd == i:
            self.tokens.append(tokenControlWord(controlWord, separator))
        else:
            value = int(data[tokenEnd:i])
            if isString(controlWord, '\\bin'):
                if value < 0:
                    raise ValueError('Error (at:%d): Negative length in \\bin control word.' % tokenStart)
                # The payload starts after the delimiter. It is stored
                # without the control word itself, because tokenBinN.toRTF()
                # writes the "\binN" prefix again.
                if separator:
                    i += 1
                payload = data[i:i + value]
                self.tokens.append(tokenBinN(payload, separator))
                return i + len(payload)
            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))

        #space delimiter, we should discard it
        if separator:
            i += 1
        return i

    def toRTF(self):
        """Return the RTF text of all tokens joined together."""
        return "".join([token.toRTF() for token in self.tokens])
if __name__ == "__main__":
    # Command-line use: rewrite the given RTF file in place.
    import sys
    if len(sys.argv) < 2:
        # "%prog" is an optparse placeholder and was printed literally here;
        # show the real program name instead.
        print ("Usage %s rtfFileToConvert" % sys.argv[0])
        sys.exit()
    with open(sys.argv[1], 'rb') as f:
        data = f.read()

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    # Write in binary mode, matching the binary read above, so no newline
    # translation is applied to the document.
    with open(sys.argv[1], 'wb') as f:
        f.write(data)