mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to trunk.
This commit is contained in:
commit
4b3f998e9c
41
resources/recipes/drivelry.recipe
Normal file
41
resources/recipes/drivelry.recipe
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class drivelrycom(BasicNewsRecipe):
    """Recipe for drivelry.com, a blog by Mike Abrahams."""

    title                 = u'drivelry.com'
    language              = 'en'
    description           = 'A blog by Mike Abrahams'
    __author__            = 'Krittika Goyal'
    oldest_article        = 60  # days
    max_articles_per_feed = 25
    #encoding = 'latin1'

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after = dict(name='div', attrs={'id':'bookmark'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['sidebar']}),
        dict(name='div', attrs={'id':['bookmark']}),
        #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
        #dict(name='ul', attrs={'class':'articleTools'}),
    ]

    feeds = [
        ('drivelry.com',
         'http://feeds.feedburner.com/drivelry'),
    ]

    def preprocess_html(self, soup):
        # Extract the article body, then graft it onto a minimal skeleton
        # document that also carries a donation link.
        story = soup.find(name='div', attrs={'id':'main'})
        #td = heading.findParent(name='td')
        #td.extract()
        skeleton = BeautifulSoup('''
<html><head><title>t</title></head><body>
<p>To donate to this blog: <a href="http://www.drivelry.com/thank-you/">click here</a></p>
</body></html>
''')
        skeleton.find(name='body').insert(0, story)
        return skeleton
|
38
resources/recipes/greader_uber.recipe
Normal file
38
resources/recipes/greader_uber.recipe
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__


class GoogleReaderUber(BasicNewsRecipe):
    """Downloads every unread item from all feeds in a Google Reader account."""

    title = 'Google Reader Uber'
    # Fix: original text read 'feedsfrom' (missing space).
    description = 'This recipe downloads all unread feeds from your Google Reader account.'
    needs_subscription = True
    __author__ = 'rollercoaster, davec'
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 365
    max_articles_per_feed = 250
    # Ask Reader for up to max_articles_per_feed items, excluding read ones.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
    use_embedded_content = True

    def get_browser(self):
        """Log in via ClientLogin and return a browser carrying the SID cookie.

        Falls back to the plain browser when no credentials are configured.
        """
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            request = urllib.urlencode([('Email', self.username), ('Passwd', self.password),
                                        ('service', 'reader'), ('source', __appname__)])
            response = br.open('https://www.google.com/accounts/ClientLogin', request)
            # Raw string so \S stays a regex escape (invalid str escape otherwise).
            sid = re.search(r'SID=(\S*)', response.read()).group(1)

            cookies = mechanize.CookieJar()
            br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
            cookies.set_cookie(mechanize.Cookie(None, 'SID', sid, None, False, '.google.com', True, True, '/', True, False, None, True, '', '', None))
        return br

    def get_feeds(self):
        """Return (title, url) pairs, one per tag/folder in the account."""
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for id in soup.findAll(True, attrs={'name':['id']}):
            # 'broadcast' is the shared-items stream; swap in the full reading list.
            url = id.contents[0].replace('broadcast','reading-list')
            feeds.append((re.search(r'/([^/]*)$', url).group(1),
                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds
|
@ -25,7 +25,6 @@ class ledevoir(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
timefmt = '[%a, %d %b, %Y]'
|
timefmt = '[%a, %d %b, %Y]'
|
||||||
|
|
||||||
oldest_article = 1
|
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
recursion = 10
|
recursion = 10
|
||||||
|
@ -70,11 +70,28 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
feeds.append((current_section, current_articles))
|
feeds.append((current_section, current_articles))
|
||||||
|
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
story = soup.find(name='div', attrs={'class':'triline'})
|
story = soup.find(name='div', attrs={'class':'triline'})
|
||||||
#td = heading.findParent(name='td')
|
page2_link = soup.find('p','pagenav')
|
||||||
#td.extract()
|
if page2_link:
|
||||||
|
atag = page2_link.find('a',href=True)
|
||||||
|
if atag:
|
||||||
|
page2_url = atag['href']
|
||||||
|
if page2_url.startswith('story'):
|
||||||
|
page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
|
||||||
|
elif page2_url.startswith( '/todays-paper/story.html'):
|
||||||
|
page2_url = 'http://www.nationalpost.com/'+page2_url
|
||||||
|
page2_soup = self.index_to_soup(page2_url)
|
||||||
|
if page2_soup:
|
||||||
|
page2_content = page2_soup.find('div','story-content')
|
||||||
|
if page2_content:
|
||||||
|
full_story = BeautifulSoup('<div></div>')
|
||||||
|
full_story.insert(0,story)
|
||||||
|
full_story.insert(1,page2_content)
|
||||||
|
story = full_story
|
||||||
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
|
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
|
||||||
body = soup.find(name='body')
|
body = soup.find(name='body')
|
||||||
body.insert(0, story)
|
body.insert(0, story)
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ Windows PNP strings:
|
|||||||
2W00000&1', 3, u'G:\\')
|
2W00000&1', 3, u'G:\\')
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
import re
|
||||||
|
|
||||||
from calibre.devices.usbms.driver import USBMS
|
from calibre.devices.usbms.driver import USBMS
|
||||||
|
|
||||||
@ -108,6 +109,7 @@ class POCKETBOOK360(EB600):
|
|||||||
|
|
||||||
OSX_MAIN_MEM = 'Philips Mass Storge Media'
|
OSX_MAIN_MEM = 'Philips Mass Storge Media'
|
||||||
OSX_CARD_A_MEM = 'Philips Mass Storge Media'
|
OSX_CARD_A_MEM = 'Philips Mass Storge Media'
|
||||||
|
OSX_MAIN_MEM_VOL_PAT = re.compile(r'/Pocket')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def can_handle(cls, dev, debug=False):
|
def can_handle(cls, dev, debug=False):
|
||||||
|
@ -25,12 +25,14 @@ def get_document_info(stream):
|
|||||||
while not found:
|
while not found:
|
||||||
prefix = block[-6:]
|
prefix = block[-6:]
|
||||||
block = prefix + stream.read(block_size)
|
block = prefix + stream.read(block_size)
|
||||||
|
actual_block_size = len(block) - len(prefix)
|
||||||
if len(block) == len(prefix):
|
if len(block) == len(prefix):
|
||||||
break
|
break
|
||||||
idx = block.find(r'{\info')
|
idx = block.find(r'{\info')
|
||||||
if idx >= 0:
|
if idx >= 0:
|
||||||
found = True
|
found = True
|
||||||
stream.seek(stream.tell() - block_size + idx - len(prefix))
|
pos = stream.tell() - actual_block_size + idx - len(prefix)
|
||||||
|
stream.seek(pos)
|
||||||
else:
|
else:
|
||||||
if block.find(r'\sect') > -1:
|
if block.find(r'\sect') > -1:
|
||||||
break
|
break
|
||||||
|
@ -90,7 +90,10 @@ class DetectStructure(object):
|
|||||||
mark = etree.Element(XHTML('div'), style=page_break_after)
|
mark = etree.Element(XHTML('div'), style=page_break_after)
|
||||||
else: # chapter_mark == 'both':
|
else: # chapter_mark == 'both':
|
||||||
mark = etree.Element(XHTML('hr'), style=page_break_before)
|
mark = etree.Element(XHTML('hr'), style=page_break_before)
|
||||||
|
try:
|
||||||
elem.addprevious(mark)
|
elem.addprevious(mark)
|
||||||
|
except TypeError:
|
||||||
|
self.log.exception('Failed to mark chapter')
|
||||||
|
|
||||||
def create_level_based_toc(self):
|
def create_level_based_toc(self):
|
||||||
if self.opts.level1_toc is None:
|
if self.opts.level1_toc is None:
|
||||||
|
@ -169,6 +169,21 @@ class RTFInput(InputFormatPlugin):
|
|||||||
with open('styles.css', 'ab') as f:
|
with open('styles.css', 'ab') as f:
|
||||||
f.write(css)
|
f.write(css)
|
||||||
|
|
||||||
|
def preprocess(self, fname):
|
||||||
|
self.log('\tPreprocessing to convert unicode characters')
|
||||||
|
try:
|
||||||
|
data = open(fname, 'rb').read()
|
||||||
|
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
||||||
|
tokenizer = RtfTokenizer(data)
|
||||||
|
tokens = RtfTokenParser(tokenizer.tokens)
|
||||||
|
data = tokens.toRTF()
|
||||||
|
fname = 'preprocessed.rtf'
|
||||||
|
with open(fname, 'wb') as f:
|
||||||
|
f.write(data)
|
||||||
|
except:
|
||||||
|
self.log.exception(
|
||||||
|
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
||||||
|
return fname
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
@ -177,8 +192,9 @@ class RTFInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||||
self.log = log
|
self.log = log
|
||||||
self.log('Converting RTF to XML...')
|
self.log('Converting RTF to XML...')
|
||||||
|
fname = self.preprocess(stream.name)
|
||||||
try:
|
try:
|
||||||
xml = self.generate_xml(stream.name)
|
xml = self.generate_xml(fname)
|
||||||
except RtfInvalidCodeException:
|
except RtfInvalidCodeException:
|
||||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||||
'support. Convert it to HTML first and then try it.'))
|
'support. Convert it to HTML first and then try it.'))
|
||||||
|
344
src/calibre/ebooks/rtf/preprocess.py
Normal file
344
src/calibre/ebooks/rtf/preprocess.py
Normal file
@ -0,0 +1,344 @@
|
|||||||
|
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__   = 'GPL v3'
__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'

"""
RTF tokenizer and token parser. v.1.0 (1/17/2010)
Author: Gerendi Sandor Attila

At this point this will tokenize a RTF file then rebuild it from the tokens.
In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
"""
|
||||||
|
|
||||||
|
class tokenDelimitatorStart():
    """Opening brace of an RTF group."""

    def __init__(self):
        pass

    def toRTF(self):
        # Byte string: concatenated directly into the rebuilt RTF stream.
        return b'{'

    def __repr__(self):
        return '{'
|
||||||
|
|
||||||
|
class tokenDelimitatorEnd():
    """Closing brace of an RTF group."""

    def __init__(self):
        pass

    def toRTF(self):
        # Byte string: concatenated directly into the rebuilt RTF stream.
        return b'}'

    def __repr__(self):
        return '}'
|
||||||
|
|
||||||
|
class tokenControlWord():
    """An RTF control word (e.g. '\\pard') plus its trailing delimiter."""

    def __init__(self, name, separator = ''):
        self.name = name
        self.separator = separator

    def _text(self):
        # Single rendering used by both toRTF and repr.
        return self.name + self.separator

    def toRTF(self):
        return self._text()

    def __repr__(self):
        return self._text()
|
||||||
|
|
||||||
|
class tokenControlWordWithNumericArgument():
    """A control word with a numeric argument, e.g. '\\uc1'."""

    def __init__(self, name, argument, separator = ''):
        self.name = name
        self.argument = argument
        self.separator = separator

    def _text(self):
        # repr() of an int is its plain decimal form, matching RTF syntax.
        return self.name + repr(self.argument) + self.separator

    def toRTF(self):
        return self._text()

    def __repr__(self):
        return self._text()
|
||||||
|
|
||||||
|
class tokenControlSymbol():
    """A two-character control symbol such as '\\~' or "\\'"."""

    def __init__(self, name):
        self.name = name

    def toRTF(self):
        return self.name

    def __repr__(self):
        return self.name
|
||||||
|
|
||||||
|
class tokenData():
    """A run of plain document text between control tokens."""

    def __init__(self, data):
        self.data = data

    def toRTF(self):
        return self.data

    def __repr__(self):
        return self.data
|
||||||
|
|
||||||
|
class tokenBinN():
    """A '\\binN' token together with its N bytes of raw binary payload."""

    def __init__(self, data, separator = ''):
        self.data = data
        self.separator = separator

    def _text(self):
        # The length argument is recomputed from the payload itself.
        return "\\bin" + repr(len(self.data)) + self.separator + self.data

    def toRTF(self):
        return self._text()

    def __repr__(self):
        return self._text()
|
||||||
|
|
||||||
|
class token8bitChar():
    """A "\\'hh" escape: one 8-bit character as two hex digits."""

    def __init__(self, data):
        self.data = data

    def _text(self):
        return "\\'" + self.data

    def toRTF(self):
        return self._text()

    def __repr__(self):
        return self._text()
|
||||||
|
|
||||||
|
class tokenUnicode():
    """A '\\uN' unicode escape plus its ANSI fallback tokens (eqList)."""

    def __init__(self, data, separator = '', current_ucn = 1, eqList = None):
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        # Bug fix: the original default 'eqList = []' was a mutable default
        # argument, silently shared between every instance built without an
        # explicit list.
        self.eqList = eqList if eqList is not None else []

    def toRTF(self):
        result = '\\u' + repr(self.data) + ' '
        # Never claim more fallback chars (\ucN) than we actually carry.
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            ucn = len(self.eqList)
            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
        i = 0
        for eq in self.eqList:
            if i >= ucn:
                break
            result = result + eq.toRTF()
        return result

    def __repr__(self):
        return '\\u' + repr(self.data)
|
||||||
|
|
||||||
|
|
||||||
|
def isAsciiLetter(value):
    """True when *value* is an ASCII letter a-z or A-Z."""
    return 'a' <= value <= 'z' or 'A' <= value <= 'Z'
|
||||||
|
|
||||||
|
def isDigit(value):
    """True when *value* is an ASCII decimal digit."""
    return '0' <= value <= '9'
|
||||||
|
|
||||||
|
def isChar(value, char):
    """True when *value* equals the single character *char*."""
    return value == char
|
||||||
|
|
||||||
|
def isString(buffer, string):
    """True when *buffer* equals *string* exactly."""
    return buffer == string
|
||||||
|
|
||||||
|
|
||||||
|
class RtfTokenParser():
    """Post-processes a raw token stream from RtfTokenizer.

    Two passes run at construction time:
      * process()        — folds "\\'" control symbols and their following
                           data into token8bitChar tokens.
      * processUnicode() — groups '\\uN' escapes with their \\ucN fallback
                           characters into tokenUnicode tokens.
    toRTF() then reassembles the (rewritten) token stream.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        """Merge "\\'" symbols with the hex digits that follow them."""
        i = 0
        newTokens = []
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenControlSymbol):
                if isString(self.tokens[i].name, "\\'"):
                    i = i + 1
                    # Bug fix: a trailing \' at the very end of the token
                    # stream previously died with a bare IndexError here.
                    if i >= len(self.tokens) or not isinstance(self.tokens[i], tokenData):
                        raise BaseException('Error: token8bitChar without data.')
                    if len(self.tokens[i].data) < 2:
                        raise BaseException('Error: token8bitChar without data.')
                    # First two characters are the hex byte; the rest is
                    # ordinary document text.
                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                    if len(self.tokens[i].data) > 2:
                        newTokens.append(tokenData(self.tokens[i].data[2:]))
                    i = i + 1
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def processUnicode(self):
        """Attach \\ucN fallback characters to each '\\uN' escape."""
        i = 0
        newTokens = []
        # Current \uc value per open group; groups inherit the parent value.
        ucNbStack = [1]
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenDelimitatorStart):
                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenDelimitatorEnd):
                ucNbStack.pop()
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
                if isString(self.tokens[i].name, '\\uc'):
                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
                    newTokens.append(self.tokens[i])
                    i = i + 1
                    continue
                if isString(self.tokens[i].name, '\\u'):
                    x = i
                    j = 0
                    i = i + 1
                    replace = []
                    partialData = None
                    ucn = ucNbStack[len(ucNbStack) - 1]
                    # Consume up to ucn fallback characters following the \u.
                    while (i < len(self.tokens)) and (j < ucn):
                        if isinstance(self.tokens[i], tokenDelimitatorStart):
                            break
                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
                            break
                        if isinstance(self.tokens[i], tokenData):
                            if len(self.tokens[i].data) >= ucn - j:
                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
                                if len(self.tokens[i].data) > ucn - j:
                                    # Text beyond the fallback chars survives
                                    # as a separate data token.
                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
                                i = i + 1
                                break
                            else:
                                replace.append(self.tokens[i])
                                j = j + len(self.tokens[i].data)
                                i = i + 1
                                continue
                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
                            replace.append(self.tokens[i])
                            i = i + 1
                            j = j + 1
                            continue
                        # Typo fix: was 'incorect'.
                        raise BaseException('Error: incorrect utf replacement.')

                    #calibre rtf2xml does not support utfreplace
                    replace = []

                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
                    if partialData is not None:
                        newTokens.append(partialData)
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def toRTF(self):
        """Reassemble the token stream into a single RTF string."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
|
||||||
|
|
||||||
|
|
||||||
|
class RtfTokenizer():
    """Splits raw RTF text into delimiter, control and data tokens.

    The token stream is stored in self.tokens; toRTF() reassembles it.
    """

    def __init__(self, rtfData):
        self.rtfData = []
        self.tokens = []
        self.rtfData = rtfData
        self.tokenize()

    def tokenize(self):
        """Scan self.rtfData once, appending tokens to self.tokens."""
        i = 0
        # Start index of the current plain-text run, -1 when none is open.
        lastDataStart = -1
        while i < len(self.rtfData):

            if isChar(self.rtfData[i], '{'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise BaseException('Error: Control character found at the end of the document.')

                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

                tokenStart = i
                i = i + 1

                #Control Words
                if isAsciiLetter(self.rtfData[i]):
                    #consume <ASCII Letter Sequence>
                    consumed = False
                    while i < len(self.rtfData):
                        if not isAsciiLetter(self.rtfData[i]):
                            tokenEnd = i
                            consumed = True
                            break
                        i = i + 1

                    if not consumed:
                        raise BaseException('Error (at:%d): Control Word without end.'%(tokenStart))

                    #we have numeric argument before delimiter
                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
                        #consume the numeric argument
                        consumed = False
                        l = 0
                        while i < len(self.rtfData):
                            if not isDigit(self.rtfData[i]):
                                consumed = True
                                break
                            l = l + 1
                            i = i + 1
                            if l > 10 :
                                # Bug fix: '%d' % [tokenStart] raised a
                                # TypeError (list is not a valid %d argument).
                                raise BaseException('Error (at:%d): Too many digits in control word numeric argument.'%(tokenStart))

                        if not consumed:
                            # Bug fix: same '%d' % [...] TypeError as above.
                            raise BaseException('Error (at:%d): Control Word without numeric argument end.'%(tokenStart))

                    separator = ''
                    if isChar(self.rtfData[i], ' '):
                        separator = ' '

                    controlWord = self.rtfData[tokenStart: tokenEnd]
                    if tokenEnd < i:
                        value = int(self.rtfData[tokenEnd: i])
                        if isString(controlWord, "\\bin"):
                            # \binN is followed by N bytes of raw data; keep
                            # them inside the token untouched.
                            i = i + value
                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
                        else:
                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
                    else:
                        self.tokens.append(tokenControlWord(controlWord, separator))
                    #space delimiter, we should discard it
                    if self.rtfData[i] == ' ':
                        i = i + 1

                #Control Symbol
                else:
                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
                    i = i + 1
                continue

            if lastDataStart < 0:
                lastDataStart = i
            i = i + 1

        # Bug fix: a plain-text run reaching the end of the document was
        # previously dropped, so toRTF() lost trailing text.
        if lastDataStart > -1:
            self.tokens.append(tokenData(self.rtfData[lastDataStart :]))

    def toRTF(self):
        """Reassemble the token stream into a single RTF string."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print ("Usage %prog rtfFileToConvert")
        sys.exit()

    # Read the RTF, round-trip it through the tokenizer/parser, and write
    # the result back over the input file.
    f = open(sys.argv[1], 'rb')
    data = f.read()
    f.close()

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    # Bug fix: the file is read in binary mode, so it must be written back
    # in binary mode too ('w' would translate line endings on Windows and
    # corrupt the RTF).
    f = open(sys.argv[1], 'wb')
    f.write(data)
    f.close()
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user