mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
A rewritten and much improved lrs2lrf
This commit is contained in:
parent
c03508cf37
commit
da826622b0
File diff suppressed because it is too large
Load Diff
@ -601,7 +601,7 @@ class Text(LRFStream):
|
|||||||
s = u'<%s '%(self.name,)
|
s = u'<%s '%(self.name,)
|
||||||
for name, val in self.attrs.items():
|
for name, val in self.attrs.items():
|
||||||
s += '%s="%s" '%(name, val)
|
s += '%s="%s" '%(name, val)
|
||||||
return s.rstrip() + (u' />' if self.self_closing else u'>') + (u'\n' if self.name in ('P', 'CR') else u'')
|
return s.rstrip() + (u' />' if self.self_closing else u'>')
|
||||||
|
|
||||||
class Span(TextTag):
|
class Span(TextTag):
|
||||||
pass
|
pass
|
||||||
@ -760,8 +760,7 @@ class Text(LRFStream):
|
|||||||
s += c
|
s += c
|
||||||
elif c is None:
|
elif c is None:
|
||||||
p = open_containers.pop()
|
p = open_containers.pop()
|
||||||
nl = u'\n' if p.name == 'P' else u''
|
s += u'</%s>'%(p.name,)
|
||||||
s += nl + u'</%s>'%(p.name,) + nl
|
|
||||||
else:
|
else:
|
||||||
s += unicode(c)
|
s += unicode(c)
|
||||||
if not c.self_closing:
|
if not c.self_closing:
|
||||||
|
@ -81,7 +81,7 @@ def writeWord(f, word):
|
|||||||
f.write(struct.pack("<H", int(word)))
|
f.write(struct.pack("<H", int(word)))
|
||||||
|
|
||||||
def writeSignedWord(f, sword):
|
def writeSignedWord(f, sword):
|
||||||
f.write(struct.pack("<h", int(sword)))
|
f.write(struct.pack("<h", int(float(sword))))
|
||||||
|
|
||||||
def writeWords(f, *words):
|
def writeWords(f, *words):
|
||||||
f.write(struct.pack("<%dH" % len(words), *words))
|
f.write(struct.pack("<%dH" % len(words), *words))
|
||||||
|
@ -396,8 +396,8 @@ class Book(Delegator):
|
|||||||
booksetting=BookSetting()
|
booksetting=BookSetting()
|
||||||
Override the default BookSetting.
|
Override the default BookSetting.
|
||||||
|
|
||||||
setdefault=SetDefault()
|
setdefault=StyleDefault()
|
||||||
Override the defalut SetDefault.
|
Override the default SetDefault.
|
||||||
|
|
||||||
There are several other settings -- see the BookInfo class for more.
|
There are several other settings -- see the BookInfo class for more.
|
||||||
"""
|
"""
|
||||||
@ -434,9 +434,12 @@ class Book(Delegator):
|
|||||||
self.defaultTextStyle = textStyle
|
self.defaultTextStyle = textStyle
|
||||||
self.defaultBlockStyle = blockStyle
|
self.defaultBlockStyle = blockStyle
|
||||||
LrsObject.nextObjId += 1
|
LrsObject.nextObjId += 1
|
||||||
|
|
||||||
|
styledefault = StyleDefault()
|
||||||
|
if settings.has_key('setdefault'):
|
||||||
|
styledefault = settings.pop('setdefault')
|
||||||
Delegator.__init__(self, [BookInformation(), Main(),
|
Delegator.__init__(self, [BookInformation(), Main(),
|
||||||
Template(), Style(), Solos(), Objects()])
|
Template(), Style(styledefault), Solos(), Objects()])
|
||||||
|
|
||||||
self.sourceencoding = None
|
self.sourceencoding = None
|
||||||
|
|
||||||
@ -606,10 +609,10 @@ class Book(Delegator):
|
|||||||
span.attrs['baselineskip'] = rescale(span.attrs['baselineskip'])
|
span.attrs['baselineskip'] = rescale(span.attrs['baselineskip'])
|
||||||
|
|
||||||
|
|
||||||
def renderLrs(self, lrsFile):
|
def renderLrs(self, lrsFile, encoding="UTF-8"):
|
||||||
if isinstance(lrsFile, basestring):
|
if isinstance(lrsFile, basestring):
|
||||||
lrsFile = codecs.open(lrsFile, "wb", encoding="utf-16")
|
lrsFile = codecs.open(lrsFile, "wb", encoding=encoding)
|
||||||
self.render(lrsFile)
|
self.render(lrsFile, outputEncodingName=encoding)
|
||||||
lrsFile.close()
|
lrsFile.close()
|
||||||
|
|
||||||
|
|
||||||
@ -634,7 +637,7 @@ class Book(Delegator):
|
|||||||
return root
|
return root
|
||||||
|
|
||||||
|
|
||||||
def render(self, f):
|
def render(self, f, outputEncodingName='UTF-8'):
|
||||||
""" Write the book as an LRS to file f. """
|
""" Write the book as an LRS to file f. """
|
||||||
|
|
||||||
self.appendReferencedObjects(self)
|
self.appendReferencedObjects(self)
|
||||||
@ -649,7 +652,8 @@ class Book(Delegator):
|
|||||||
|
|
||||||
writer = ElementWriter(root, header=True,
|
writer = ElementWriter(root, header=True,
|
||||||
sourceEncoding=self.sourceencoding,
|
sourceEncoding=self.sourceencoding,
|
||||||
spaceBeforeClose=False)
|
spaceBeforeClose=False,
|
||||||
|
outputEncodingName=outputEncodingName)
|
||||||
writer.write(f)
|
writer.write(f)
|
||||||
|
|
||||||
|
|
||||||
@ -1010,12 +1014,33 @@ class Template(object):
|
|||||||
# does nothing
|
# does nothing
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class StyleDefault(LrsAttributes):
    """
    Book-wide default settings that apply to every TextBlock.

    Only a subset of the TextBlock attributes is legal here: the ruby,
    emphasis (dots / line) and waitprop settings listed in ``defaults``,
    plus the extra names listed in ``alsoAllow``.
    """

    # Attribute values used when the caller does not override them.
    defaults = dict(
        rubyalign="start",
        rubyadjust="none",
        rubyoverhang="none",
        empdotsposition="before",
        empdotsfontname="Dutch801 Rm BT Roman",
        empdotscode="0x002e",
        emplineposition="after",
        emplinetype="solid",
        setwaitprop="noreplay",
    )

    # Extra attribute names accepted in addition to the keys of ``defaults``.
    alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]

    def __init__(self, **settings):
        """
        Create the defaults, optionally overriding individual attributes.

        @param settings: attribute overrides, handed unchanged to
                         LrsAttributes together with ``defaults`` and
                         ``alsoAllow``.
        """
        # NOTE(review): validation of the names is presumably done by
        # LrsAttributes.__init__ -- confirm there.
        LrsAttributes.__init__(
            self,
            self.defaults,
            alsoAllow=self.alsoAllow,
            **settings
        )

    def toElement(self, se):
        """
        Build the <SetDefault> element carrying this object's attributes.

        @param se: accepted for interface compatibility; not used here.
        @return: an Element named "SetDefault" built from ``self.attrs``.
        """
        element = Element("SetDefault", self.attrs)
        return element
|
||||||
|
|
||||||
|
|
||||||
class Style(LrsContainer, Delegator):
|
class Style(LrsContainer, Delegator):
|
||||||
def __init__(self):
|
def __init__(self, styledefault=StyleDefault()):
|
||||||
LrsContainer.__init__(self, [PageStyle, TextStyle, BlockStyle])
|
LrsContainer.__init__(self, [PageStyle, TextStyle, BlockStyle])
|
||||||
Delegator.__init__(self, [BookStyle()])
|
Delegator.__init__(self, [BookStyle(styledefault=styledefault)])
|
||||||
self.bookStyle = self.delegates[0]
|
self.bookStyle = self.delegates[0]
|
||||||
self.appendPageStyle = self.appendTextStyle = \
|
self.appendPageStyle = self.appendTextStyle = \
|
||||||
self.appendBlockStyle = self.append
|
self.appendBlockStyle = self.append
|
||||||
@ -1071,10 +1096,10 @@ class Style(LrsContainer, Delegator):
|
|||||||
|
|
||||||
|
|
||||||
class BookStyle(LrsObject, LrsContainer):
|
class BookStyle(LrsObject, LrsContainer):
|
||||||
def __init__(self):
|
def __init__(self, styledefault=StyleDefault()):
|
||||||
LrsObject.__init__(self, assignId=True)
|
LrsObject.__init__(self, assignId=True)
|
||||||
LrsContainer.__init__(self, [Font])
|
LrsContainer.__init__(self, [Font])
|
||||||
self.styledefault = StyleDefault()
|
self.styledefault = styledefault
|
||||||
self.booksetting = BookSetting()
|
self.booksetting = BookSetting()
|
||||||
self.appendFont = self.append
|
self.appendFont = self.append
|
||||||
|
|
||||||
@ -1119,27 +1144,6 @@ class BookStyle(LrsObject, LrsContainer):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
class StyleDefault(LrsAttributes):
|
|
||||||
"""
|
|
||||||
Supply some defaults for all TextBlocks.
|
|
||||||
The legal values are a subset of what is allowed on a
|
|
||||||
TextBlock -- ruby, emphasis, and waitprop settings.
|
|
||||||
"""
|
|
||||||
defaults = dict(rubyalign="start", rubyadjust="none",
|
|
||||||
rubyoverhang="none", empdotsposition="before",
|
|
||||||
empdotsfontname="Dutch801 Rm BT Roman",
|
|
||||||
empdotscode="0x002e", emplineposition="after",
|
|
||||||
emplinetype = "solid", setwaitprop="noreplay")
|
|
||||||
|
|
||||||
alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
|
|
||||||
|
|
||||||
def __init__(self, **settings):
|
|
||||||
LrsAttributes.__init__(self, self.defaults,
|
|
||||||
alsoAllow=self.alsoAllow, **settings)
|
|
||||||
|
|
||||||
|
|
||||||
def toElement(self, se):
|
|
||||||
return Element("SetDefault", self.attrs)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1226,7 +1230,7 @@ class TextStyle(LrsStyle):
|
|||||||
"""
|
"""
|
||||||
baseDefaults = dict(
|
baseDefaults = dict(
|
||||||
columnsep="0", charspace="0",
|
columnsep="0", charspace="0",
|
||||||
textlinewidth="10", align="head", linecolor="0x00000000",
|
textlinewidth="2", align="head", linecolor="0x00000000",
|
||||||
column="1", fontsize="100", fontwidth="-10", fontescapement="0",
|
column="1", fontsize="100", fontwidth="-10", fontescapement="0",
|
||||||
fontorientation="0", fontweight="400",
|
fontorientation="0", fontweight="400",
|
||||||
fontfacename="Dutch801 Rm BT Roman",
|
fontfacename="Dutch801 Rm BT Roman",
|
||||||
@ -2251,7 +2255,9 @@ class HeaderOrFooter(LrsObject, LrsContainer, LrsAttributes):
|
|||||||
LrsContainer.__init__(self, [PutObj])
|
LrsContainer.__init__(self, [PutObj])
|
||||||
LrsAttributes.__init__(self, self.defaults, **settings)
|
LrsAttributes.__init__(self, self.defaults, **settings)
|
||||||
|
|
||||||
|
def put_object(self, obj, x1, y1):
    """
    Wrap ``obj`` in a PutObj built from (obj, x1, y1) and append it to
    this container.

    @param obj: the object to place.
    @param x1: passed through as the second PutObj argument.
    @param y1: passed through as the third PutObj argument.
    """
    placed = PutObj(obj, x1, y1)
    self.append(placed)
|
||||||
|
|
||||||
def PutObj(self, *args, **kwargs):
|
def PutObj(self, *args, **kwargs):
|
||||||
p = PutObj(*args, **kwargs)
|
p = PutObj(*args, **kwargs)
|
||||||
self.append(p)
|
self.append(p)
|
||||||
@ -2468,7 +2474,7 @@ class ImageBlock(LrsObject, LrsContainer, LrsAttributes):
|
|||||||
""" Create an image on a page. """
|
""" Create an image on a page. """
|
||||||
# TODO: allow other block attributes
|
# TODO: allow other block attributes
|
||||||
|
|
||||||
defaults = dict(blockwidth="600", blockheight="800")
|
defaults = BlockStyle.baseDefaults.copy()
|
||||||
|
|
||||||
def __init__(self, refstream, x0="0", y0="0", x1="600", y1="800",
|
def __init__(self, refstream, x0="0", y0="0", x1="600", y1="800",
|
||||||
xsize="600", ysize="800",
|
xsize="600", ysize="800",
|
||||||
|
@ -19,7 +19,7 @@ import tempfile, time, calendar, re, operator
|
|||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
|
|
||||||
from libprs500 import __appname__, iswindows, browser
|
from libprs500 import __appname__, iswindows, browser
|
||||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
|
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
|
||||||
|
|
||||||
|
|
||||||
class DefaultProfile(object):
|
class DefaultProfile(object):
|
||||||
@ -55,6 +55,7 @@ class DefaultProfile(object):
|
|||||||
# See the built-in profiles for examples of these settings.
|
# See the built-in profiles for examples of these settings.
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
|
CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
|
||||||
|
|
||||||
def get_feeds(self):
|
def get_feeds(self):
|
||||||
'''
|
'''
|
||||||
@ -68,7 +69,7 @@ class DefaultProfile(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def print_version(cls, url):
|
def print_version(cls, url):
|
||||||
'''
|
'''
|
||||||
Takea a URL pointing to an article and returns the URL pointing to the
|
Take a URL pointing to an article and returns the URL pointing to the
|
||||||
print version of the article.
|
print version of the article.
|
||||||
'''
|
'''
|
||||||
return url
|
return url
|
||||||
@ -157,6 +158,28 @@ class DefaultProfile(object):
|
|||||||
return index
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
@classmethod
def tag_to_string(cls, tag, use_alt=True):
    '''
    Convenience method to take a BeautifulSoup Tag and extract the text from it
    recursively, including any CDATA sections and alt tag attributes.
    @param tag: The Tag to flatten. A false value (e.g. None) yields an empty string.
    @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
    @return: A unicode (possibly empty) object
    '''
    if not tag:
        # Return unicode here as well, so the result type matches the
        # documented contract on every path.
        return u''
    strings = []
    for item in tag.contents:
        if isinstance(item, (NavigableString, CData)):
            strings.append(item.string)
        elif isinstance(item, Tag):
            # Forward use_alt so nested tags honour the caller's choice;
            # without it the recursion always ran with the default (True)
            # and alt text leaked in even when use_alt=False was requested.
            res = cls.tag_to_string(item, use_alt=use_alt)
            if res:
                strings.append(res)
            elif use_alt and item.has_key('alt'):
                # The child produced no text of its own: fall back to alt.
                strings.append(item['alt'])
    return u''.join(strings)
|
||||||
|
|
||||||
def parse_feeds(self, require_url=True):
|
def parse_feeds(self, require_url=True):
|
||||||
'''
|
'''
|
||||||
Create list of articles from a list of feeds.
|
Create list of articles from a list of feeds.
|
||||||
@ -195,7 +218,7 @@ class DefaultProfile(object):
|
|||||||
if not pubdate or not pubdate.string:
|
if not pubdate or not pubdate.string:
|
||||||
self.logger.debug('Skipping article as it does not have publication date')
|
self.logger.debug('Skipping article as it does not have publication date')
|
||||||
continue
|
continue
|
||||||
pubdate = pubdate.string
|
pubdate = self.tag_to_string(pubdate)
|
||||||
pubdate = pubdate.replace('+0000', 'GMT')
|
pubdate = pubdate.replace('+0000', 'GMT')
|
||||||
for element in self.url_search_order:
|
for element in self.url_search_order:
|
||||||
url = item.find(element)
|
url = item.find(element)
|
||||||
@ -205,7 +228,7 @@ class DefaultProfile(object):
|
|||||||
if require_url and (not url or not url.string):
|
if require_url and (not url or not url.string):
|
||||||
self.logger.debug('Skipping article as it does not have a link url')
|
self.logger.debug('Skipping article as it does not have a link url')
|
||||||
continue
|
continue
|
||||||
url = url.string if (url and url.string) else ''
|
url = self.tag_to_string(url)
|
||||||
|
|
||||||
content = item.find('content:encoded')
|
content = item.find('content:encoded')
|
||||||
if not content:
|
if not content:
|
||||||
@ -221,7 +244,7 @@ class DefaultProfile(object):
|
|||||||
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
|
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
|
||||||
continue
|
continue
|
||||||
d = {
|
d = {
|
||||||
'title' : item.find('title').string,
|
'title' : self.tag_to_string(item.find('title')),
|
||||||
'url' : purl,
|
'url' : purl,
|
||||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||||
'date' : pubdate if self.use_pubdate else time.ctime(),
|
'date' : pubdate if self.use_pubdate else time.ctime(),
|
||||||
@ -263,7 +286,7 @@ class DefaultProfile(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def process_html_description(cls, tag, strip_links=True):
|
def process_html_description(cls, tag, strip_links=True):
|
||||||
src = '\n'.join(tag.contents)
|
src = '\n'.join(tag.contents)
|
||||||
match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
|
match = cls.CDATA_PAT.match(src.lstrip())
|
||||||
if match:
|
if match:
|
||||||
src = match.group(1)
|
src = match.group(1)
|
||||||
else:
|
else:
|
||||||
|
@ -17,9 +17,7 @@ class WallStreetJournal(DefaultProfile):
|
|||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
max_articles_per_feed = 10
|
max_articles_per_feed = 10
|
||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
html_description = True
|
html2lrf_options = ['--ignore-tables', '--base-font-size=5']
|
||||||
no_stylesheets = False
|
|
||||||
html2lrf_options = [('--ignore-tables')]
|
|
||||||
|
|
||||||
## Don't grab articles more than 7 days old
|
## Don't grab articles more than 7 days old
|
||||||
oldest_article = 7
|
oldest_article = 7
|
||||||
|
Loading…
x
Reference in New Issue
Block a user