A rewritten and much improved lrs2lrf

This commit is contained in:
Kovid Goyal 2008-01-28 02:10:18 +00:00
parent c03508cf37
commit da826622b0
6 changed files with 331 additions and 1045 deletions

File diff suppressed because it is too large Load Diff

View File

@ -601,7 +601,7 @@ class Text(LRFStream):
s = u'<%s '%(self.name,)
for name, val in self.attrs.items():
s += '%s="%s" '%(name, val)
return s.rstrip() + (u' />' if self.self_closing else u'>') + (u'\n' if self.name in ('P', 'CR') else u'')
return s.rstrip() + (u' />' if self.self_closing else u'>')
class Span(TextTag):
pass
@ -760,8 +760,7 @@ class Text(LRFStream):
s += c
elif c is None:
p = open_containers.pop()
nl = u'\n' if p.name == 'P' else u''
s += nl + u'</%s>'%(p.name,) + nl
s += u'</%s>'%(p.name,)
else:
s += unicode(c)
if not c.self_closing:

View File

@ -81,7 +81,7 @@ def writeWord(f, word):
f.write(struct.pack("<H", int(word)))
def writeSignedWord(f, sword):
f.write(struct.pack("<h", int(sword)))
f.write(struct.pack("<h", int(float(sword))))
def writeWords(f, *words):
    """
    Write every value in words to f in a single write call.

    The values are packed back to back with struct, each as a
    little-endian unsigned 16-bit integer ("<H"). With no words an
    empty string is written.
    """
    layout = "<%dH" % len(words)
    packed = struct.pack(layout, *words)
    f.write(packed)

View File

@ -396,8 +396,8 @@ class Book(Delegator):
booksetting=BookSetting()
Override the default BookSetting.
setdefault=SetDefault()
Override the defalut SetDefault.
setdefault=StyleDefault()
Override the default SetDefault.
There are several other settings -- see the BookInfo class for more.
"""
@ -434,9 +434,12 @@ class Book(Delegator):
self.defaultTextStyle = textStyle
self.defaultBlockStyle = blockStyle
LrsObject.nextObjId += 1
styledefault = StyleDefault()
if settings.has_key('setdefault'):
styledefault = settings.pop('setdefault')
Delegator.__init__(self, [BookInformation(), Main(),
Template(), Style(), Solos(), Objects()])
Template(), Style(styledefault), Solos(), Objects()])
self.sourceencoding = None
@ -606,10 +609,10 @@ class Book(Delegator):
span.attrs['baselineskip'] = rescale(span.attrs['baselineskip'])
def renderLrs(self, lrsFile):
def renderLrs(self, lrsFile, encoding="UTF-8"):
if isinstance(lrsFile, basestring):
lrsFile = codecs.open(lrsFile, "wb", encoding="utf-16")
self.render(lrsFile)
lrsFile = codecs.open(lrsFile, "wb", encoding=encoding)
self.render(lrsFile, outputEncodingName=encoding)
lrsFile.close()
@ -634,7 +637,7 @@ class Book(Delegator):
return root
def render(self, f):
def render(self, f, outputEncodingName='UTF-8'):
""" Write the book as an LRS to file f. """
self.appendReferencedObjects(self)
@ -649,7 +652,8 @@ class Book(Delegator):
writer = ElementWriter(root, header=True,
sourceEncoding=self.sourceencoding,
spaceBeforeClose=False)
spaceBeforeClose=False,
outputEncodingName=outputEncodingName)
writer.write(f)
@ -1010,12 +1014,33 @@ class Template(object):
# does nothing
pass
class StyleDefault(LrsAttributes):
    """
    Default settings applied to all TextBlocks.

    Only a subset of the TextBlock attributes is legal here: the ruby,
    emphasis and waitprop settings listed in ``defaults``, plus the
    extra names in ``alsoAllow``.
    """

    # Built with dict(...) rather than a literal so that the resulting
    # key order is exactly what it has always been.
    defaults = dict(
        rubyalign="start",
        rubyadjust="none",
        rubyoverhang="none",
        empdotsposition="before",
        empdotsfontname="Dutch801 Rm BT Roman",
        empdotscode="0x002e",
        emplineposition="after",
        emplinetype="solid",
        setwaitprop="noreplay",
    )

    # Attribute names accepted in addition to the keys of ``defaults``.
    alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]

    def __init__(self, **settings):
        """Initialise from the class defaults, overridden by settings."""
        LrsAttributes.__init__(
            self, self.defaults, alsoAllow=self.alsoAllow, **settings
        )

    def toElement(self, se):
        """Return a SetDefault element carrying self.attrs (se is unused here)."""
        element = Element("SetDefault", self.attrs)
        return element
class Style(LrsContainer, Delegator):
def __init__(self):
def __init__(self, styledefault=StyleDefault()):
LrsContainer.__init__(self, [PageStyle, TextStyle, BlockStyle])
Delegator.__init__(self, [BookStyle()])
Delegator.__init__(self, [BookStyle(styledefault=styledefault)])
self.bookStyle = self.delegates[0]
self.appendPageStyle = self.appendTextStyle = \
self.appendBlockStyle = self.append
@ -1071,10 +1096,10 @@ class Style(LrsContainer, Delegator):
class BookStyle(LrsObject, LrsContainer):
def __init__(self):
def __init__(self, styledefault=StyleDefault()):
LrsObject.__init__(self, assignId=True)
LrsContainer.__init__(self, [Font])
self.styledefault = StyleDefault()
self.styledefault = styledefault
self.booksetting = BookSetting()
self.appendFont = self.append
@ -1119,27 +1144,6 @@ class BookStyle(LrsObject, LrsContainer):
class StyleDefault(LrsAttributes):
"""
Supply some defaults for all TextBlocks.
The legal values are a subset of what is allowed on a
TextBlock -- ruby, emphasis, and waitprop settings.
"""
defaults = dict(rubyalign="start", rubyadjust="none",
rubyoverhang="none", empdotsposition="before",
empdotsfontname="Dutch801 Rm BT Roman",
empdotscode="0x002e", emplineposition="after",
emplinetype = "solid", setwaitprop="noreplay")
alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
def __init__(self, **settings):
LrsAttributes.__init__(self, self.defaults,
alsoAllow=self.alsoAllow, **settings)
def toElement(self, se):
return Element("SetDefault", self.attrs)
@ -1226,7 +1230,7 @@ class TextStyle(LrsStyle):
"""
baseDefaults = dict(
columnsep="0", charspace="0",
textlinewidth="10", align="head", linecolor="0x00000000",
textlinewidth="2", align="head", linecolor="0x00000000",
column="1", fontsize="100", fontwidth="-10", fontescapement="0",
fontorientation="0", fontweight="400",
fontfacename="Dutch801 Rm BT Roman",
@ -2251,7 +2255,9 @@ class HeaderOrFooter(LrsObject, LrsContainer, LrsAttributes):
LrsContainer.__init__(self, [PutObj])
LrsAttributes.__init__(self, self.defaults, **settings)
def put_object(self, obj, x1, y1):
self.append(PutObj(obj, x1, y1))
def PutObj(self, *args, **kwargs):
p = PutObj(*args, **kwargs)
self.append(p)
@ -2468,7 +2474,7 @@ class ImageBlock(LrsObject, LrsContainer, LrsAttributes):
""" Create an image on a page. """
# TODO: allow other block attributes
defaults = dict(blockwidth="600", blockheight="800")
defaults = BlockStyle.baseDefaults.copy()
def __init__(self, refstream, x0="0", y0="0", x1="600", y1="800",
xsize="600", ysize="800",

View File

@ -19,7 +19,7 @@ import tempfile, time, calendar, re, operator
from htmlentitydefs import name2codepoint
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
class DefaultProfile(object):
@ -55,6 +55,7 @@ class DefaultProfile(object):
# See the built-in profiles for examples of these settings.
feeds = []
CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
def get_feeds(self):
'''
@ -68,7 +69,7 @@ class DefaultProfile(object):
@classmethod
def print_version(cls, url):
'''
Takea a URL pointing to an article and returns the URL pointing to the
Take a URL pointing to an article and return the URL pointing to the
print version of the article.
'''
return url
@ -157,6 +158,28 @@ class DefaultProfile(object):
return index
@classmethod
def tag_to_string(cls, tag, use_alt=True):
    '''
    Convenience method to take a BeautifulSoup Tag and extract the text from it
    recursively, including any CDATA sections and alt tag attributes.

    @param tag: The Tag whose contents are flattened. A falsy value
                (for example None) yields an empty string.
    @param use_alt: If True try to use the alt attribute for tags that don't
                    have any textual content. The setting also applies to
                    every nested tag.
    @return: A unicode (possibly empty) object
    '''
    if not tag:
        # Keep the return type unicode, as promised above.
        return u''
    strings = []
    for item in tag.contents:
        if isinstance(item, (NavigableString, CData)):
            strings.append(item.string)
        elif isinstance(item, Tag):
            # Pass use_alt down explicitly; otherwise nested tags would
            # silently fall back to the default and ignore use_alt=False.
            res = cls.tag_to_string(item, use_alt=use_alt)
            if res:
                strings.append(res)
            elif use_alt and item.has_key('alt'):
                # No text inside the tag: fall back to its alt attribute.
                strings.append(item['alt'])
    return u''.join(strings)
def parse_feeds(self, require_url=True):
'''
Create list of articles from a list of feeds.
@ -195,7 +218,7 @@ class DefaultProfile(object):
if not pubdate or not pubdate.string:
self.logger.debug('Skipping article as it does not have publication date')
continue
pubdate = pubdate.string
pubdate = self.tag_to_string(pubdate)
pubdate = pubdate.replace('+0000', 'GMT')
for element in self.url_search_order:
url = item.find(element)
@ -205,7 +228,7 @@ class DefaultProfile(object):
if require_url and (not url or not url.string):
self.logger.debug('Skipping article as it does not have a link url')
continue
url = url.string if (url and url.string) else ''
url = self.tag_to_string(url)
content = item.find('content:encoded')
if not content:
@ -221,7 +244,7 @@ class DefaultProfile(object):
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
continue
d = {
'title' : item.find('title').string,
'title' : self.tag_to_string(item.find('title')),
'url' : purl,
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
'date' : pubdate if self.use_pubdate else time.ctime(),
@ -263,7 +286,7 @@ class DefaultProfile(object):
@classmethod
def process_html_description(cls, tag, strip_links=True):
src = '\n'.join(tag.contents)
match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
match = cls.CDATA_PAT.match(src.lstrip())
if match:
src = match.group(1)
else:

View File

@ -17,9 +17,7 @@ class WallStreetJournal(DefaultProfile):
needs_subscription = True
max_articles_per_feed = 10
timefmt = ' [%a, %b %d, %Y]'
html_description = True
no_stylesheets = False
html2lrf_options = [('--ignore-tables')]
html2lrf_options = ['--ignore-tables', '--base-font-size=5']
## Don't grab articles more than 7 days old
oldest_article = 7