mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
A rewritten and much improved lrs2lrf
This commit is contained in:
parent
c03508cf37
commit
da826622b0
File diff suppressed because it is too large
Load Diff
@ -601,7 +601,7 @@ class Text(LRFStream):
|
||||
s = u'<%s '%(self.name,)
|
||||
for name, val in self.attrs.items():
|
||||
s += '%s="%s" '%(name, val)
|
||||
return s.rstrip() + (u' />' if self.self_closing else u'>') + (u'\n' if self.name in ('P', 'CR') else u'')
|
||||
return s.rstrip() + (u' />' if self.self_closing else u'>')
|
||||
|
||||
class Span(TextTag):
|
||||
pass
|
||||
@ -760,8 +760,7 @@ class Text(LRFStream):
|
||||
s += c
|
||||
elif c is None:
|
||||
p = open_containers.pop()
|
||||
nl = u'\n' if p.name == 'P' else u''
|
||||
s += nl + u'</%s>'%(p.name,) + nl
|
||||
s += u'</%s>'%(p.name,)
|
||||
else:
|
||||
s += unicode(c)
|
||||
if not c.self_closing:
|
||||
|
@ -81,7 +81,7 @@ def writeWord(f, word):
|
||||
f.write(struct.pack("<H", int(word)))
|
||||
|
||||
def writeSignedWord(f, sword):
|
||||
f.write(struct.pack("<h", int(sword)))
|
||||
f.write(struct.pack("<h", int(float(sword))))
|
||||
|
||||
def writeWords(f, *words):
|
||||
f.write(struct.pack("<%dH" % len(words), *words))
|
||||
|
@ -396,8 +396,8 @@ class Book(Delegator):
|
||||
booksetting=BookSetting()
|
||||
Override the default BookSetting.
|
||||
|
||||
setdefault=SetDefault()
|
||||
Override the defalut SetDefault.
|
||||
setdefault=StyleDefault()
|
||||
Override the default SetDefault.
|
||||
|
||||
There are several other settings -- see the BookInfo class for more.
|
||||
"""
|
||||
@ -434,9 +434,12 @@ class Book(Delegator):
|
||||
self.defaultTextStyle = textStyle
|
||||
self.defaultBlockStyle = blockStyle
|
||||
LrsObject.nextObjId += 1
|
||||
|
||||
|
||||
styledefault = StyleDefault()
|
||||
if settings.has_key('setdefault'):
|
||||
styledefault = settings.pop('setdefault')
|
||||
Delegator.__init__(self, [BookInformation(), Main(),
|
||||
Template(), Style(), Solos(), Objects()])
|
||||
Template(), Style(styledefault), Solos(), Objects()])
|
||||
|
||||
self.sourceencoding = None
|
||||
|
||||
@ -606,10 +609,10 @@ class Book(Delegator):
|
||||
span.attrs['baselineskip'] = rescale(span.attrs['baselineskip'])
|
||||
|
||||
|
||||
def renderLrs(self, lrsFile):
|
||||
def renderLrs(self, lrsFile, encoding="UTF-8"):
|
||||
if isinstance(lrsFile, basestring):
|
||||
lrsFile = codecs.open(lrsFile, "wb", encoding="utf-16")
|
||||
self.render(lrsFile)
|
||||
lrsFile = codecs.open(lrsFile, "wb", encoding=encoding)
|
||||
self.render(lrsFile, outputEncodingName=encoding)
|
||||
lrsFile.close()
|
||||
|
||||
|
||||
@ -634,7 +637,7 @@ class Book(Delegator):
|
||||
return root
|
||||
|
||||
|
||||
def render(self, f):
|
||||
def render(self, f, outputEncodingName='UTF-8'):
|
||||
""" Write the book as an LRS to file f. """
|
||||
|
||||
self.appendReferencedObjects(self)
|
||||
@ -649,7 +652,8 @@ class Book(Delegator):
|
||||
|
||||
writer = ElementWriter(root, header=True,
|
||||
sourceEncoding=self.sourceencoding,
|
||||
spaceBeforeClose=False)
|
||||
spaceBeforeClose=False,
|
||||
outputEncodingName=outputEncodingName)
|
||||
writer.write(f)
|
||||
|
||||
|
||||
@ -1010,12 +1014,33 @@ class Template(object):
|
||||
# does nothing
|
||||
pass
|
||||
|
||||
class StyleDefault(LrsAttributes):
|
||||
"""
|
||||
Supply some defaults for all TextBlocks.
|
||||
The legal values are a subset of what is allowed on a
|
||||
TextBlock -- ruby, emphasis, and waitprop settings.
|
||||
"""
|
||||
defaults = dict(rubyalign="start", rubyadjust="none",
|
||||
rubyoverhang="none", empdotsposition="before",
|
||||
empdotsfontname="Dutch801 Rm BT Roman",
|
||||
empdotscode="0x002e", emplineposition="after",
|
||||
emplinetype = "solid", setwaitprop="noreplay")
|
||||
|
||||
alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
|
||||
|
||||
def __init__(self, **settings):
|
||||
LrsAttributes.__init__(self, self.defaults,
|
||||
alsoAllow=self.alsoAllow, **settings)
|
||||
|
||||
|
||||
def toElement(self, se):
|
||||
return Element("SetDefault", self.attrs)
|
||||
|
||||
|
||||
class Style(LrsContainer, Delegator):
|
||||
def __init__(self):
|
||||
def __init__(self, styledefault=StyleDefault()):
|
||||
LrsContainer.__init__(self, [PageStyle, TextStyle, BlockStyle])
|
||||
Delegator.__init__(self, [BookStyle()])
|
||||
Delegator.__init__(self, [BookStyle(styledefault=styledefault)])
|
||||
self.bookStyle = self.delegates[0]
|
||||
self.appendPageStyle = self.appendTextStyle = \
|
||||
self.appendBlockStyle = self.append
|
||||
@ -1071,10 +1096,10 @@ class Style(LrsContainer, Delegator):
|
||||
|
||||
|
||||
class BookStyle(LrsObject, LrsContainer):
|
||||
def __init__(self):
|
||||
def __init__(self, styledefault=StyleDefault()):
|
||||
LrsObject.__init__(self, assignId=True)
|
||||
LrsContainer.__init__(self, [Font])
|
||||
self.styledefault = StyleDefault()
|
||||
self.styledefault = styledefault
|
||||
self.booksetting = BookSetting()
|
||||
self.appendFont = self.append
|
||||
|
||||
@ -1119,27 +1144,6 @@ class BookStyle(LrsObject, LrsContainer):
|
||||
|
||||
|
||||
|
||||
class StyleDefault(LrsAttributes):
|
||||
"""
|
||||
Supply some defaults for all TextBlocks.
|
||||
The legal values are a subset of what is allowed on a
|
||||
TextBlock -- ruby, emphasis, and waitprop settings.
|
||||
"""
|
||||
defaults = dict(rubyalign="start", rubyadjust="none",
|
||||
rubyoverhang="none", empdotsposition="before",
|
||||
empdotsfontname="Dutch801 Rm BT Roman",
|
||||
empdotscode="0x002e", emplineposition="after",
|
||||
emplinetype = "solid", setwaitprop="noreplay")
|
||||
|
||||
alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
|
||||
|
||||
def __init__(self, **settings):
|
||||
LrsAttributes.__init__(self, self.defaults,
|
||||
alsoAllow=self.alsoAllow, **settings)
|
||||
|
||||
|
||||
def toElement(self, se):
|
||||
return Element("SetDefault", self.attrs)
|
||||
|
||||
|
||||
|
||||
@ -1226,7 +1230,7 @@ class TextStyle(LrsStyle):
|
||||
"""
|
||||
baseDefaults = dict(
|
||||
columnsep="0", charspace="0",
|
||||
textlinewidth="10", align="head", linecolor="0x00000000",
|
||||
textlinewidth="2", align="head", linecolor="0x00000000",
|
||||
column="1", fontsize="100", fontwidth="-10", fontescapement="0",
|
||||
fontorientation="0", fontweight="400",
|
||||
fontfacename="Dutch801 Rm BT Roman",
|
||||
@ -2251,7 +2255,9 @@ class HeaderOrFooter(LrsObject, LrsContainer, LrsAttributes):
|
||||
LrsContainer.__init__(self, [PutObj])
|
||||
LrsAttributes.__init__(self, self.defaults, **settings)
|
||||
|
||||
|
||||
def put_object(self, obj, x1, y1):
|
||||
self.append(PutObj(obj, x1, y1))
|
||||
|
||||
def PutObj(self, *args, **kwargs):
|
||||
p = PutObj(*args, **kwargs)
|
||||
self.append(p)
|
||||
@ -2468,7 +2474,7 @@ class ImageBlock(LrsObject, LrsContainer, LrsAttributes):
|
||||
""" Create an image on a page. """
|
||||
# TODO: allow other block attributes
|
||||
|
||||
defaults = dict(blockwidth="600", blockheight="800")
|
||||
defaults = BlockStyle.baseDefaults.copy()
|
||||
|
||||
def __init__(self, refstream, x0="0", y0="0", x1="600", y1="800",
|
||||
xsize="600", ysize="800",
|
||||
|
@ -19,7 +19,7 @@ import tempfile, time, calendar, re, operator
|
||||
from htmlentitydefs import name2codepoint
|
||||
|
||||
from libprs500 import __appname__, iswindows, browser
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag
|
||||
|
||||
|
||||
class DefaultProfile(object):
|
||||
@ -55,6 +55,7 @@ class DefaultProfile(object):
|
||||
# See the built-in profiles for examples of these settings.
|
||||
|
||||
feeds = []
|
||||
CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)
|
||||
|
||||
def get_feeds(self):
|
||||
'''
|
||||
@ -68,7 +69,7 @@ class DefaultProfile(object):
|
||||
@classmethod
|
||||
def print_version(cls, url):
|
||||
'''
|
||||
Takea a URL pointing to an article and returns the URL pointing to the
|
||||
Take a URL pointing to an article and returns the URL pointing to the
|
||||
print version of the article.
|
||||
'''
|
||||
return url
|
||||
@ -157,6 +158,28 @@ class DefaultProfile(object):
|
||||
return index
|
||||
|
||||
|
||||
@classmethod
|
||||
def tag_to_string(cls, tag, use_alt=True):
|
||||
'''
|
||||
Convenience method to take a BeautifulSoup Tag and extract the text from it
|
||||
recursively, including any CDATA sections and alt tag attributes.
|
||||
@param use_alt: If True try to use the alt attribute for tags that don't have any textual content
|
||||
@return: A unicode (possibly empty) object
|
||||
'''
|
||||
if not tag:
|
||||
return ''
|
||||
strings = []
|
||||
for item in tag.contents:
|
||||
if isinstance(item, (NavigableString, CData)):
|
||||
strings.append(item.string)
|
||||
elif isinstance(item, Tag):
|
||||
res = cls.tag_to_string(item)
|
||||
if res:
|
||||
strings.append(res)
|
||||
elif use_alt and item.has_key('alt'):
|
||||
strings.append(item['alt'])
|
||||
return u''.join(strings)
|
||||
|
||||
def parse_feeds(self, require_url=True):
|
||||
'''
|
||||
Create list of articles from a list of feeds.
|
||||
@ -195,7 +218,7 @@ class DefaultProfile(object):
|
||||
if not pubdate or not pubdate.string:
|
||||
self.logger.debug('Skipping article as it does not have publication date')
|
||||
continue
|
||||
pubdate = pubdate.string
|
||||
pubdate = self.tag_to_string(pubdate)
|
||||
pubdate = pubdate.replace('+0000', 'GMT')
|
||||
for element in self.url_search_order:
|
||||
url = item.find(element)
|
||||
@ -205,7 +228,7 @@ class DefaultProfile(object):
|
||||
if require_url and (not url or not url.string):
|
||||
self.logger.debug('Skipping article as it does not have a link url')
|
||||
continue
|
||||
url = url.string if (url and url.string) else ''
|
||||
url = self.tag_to_string(url)
|
||||
|
||||
content = item.find('content:encoded')
|
||||
if not content:
|
||||
@ -221,7 +244,7 @@ class DefaultProfile(object):
|
||||
self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
|
||||
continue
|
||||
d = {
|
||||
'title' : item.find('title').string,
|
||||
'title' : self.tag_to_string(item.find('title')),
|
||||
'url' : purl,
|
||||
'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
|
||||
'date' : pubdate if self.use_pubdate else time.ctime(),
|
||||
@ -263,7 +286,7 @@ class DefaultProfile(object):
|
||||
@classmethod
|
||||
def process_html_description(cls, tag, strip_links=True):
|
||||
src = '\n'.join(tag.contents)
|
||||
match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
|
||||
match = cls.CDATA_PAT.match(src.lstrip())
|
||||
if match:
|
||||
src = match.group(1)
|
||||
else:
|
||||
|
@ -17,9 +17,7 @@ class WallStreetJournal(DefaultProfile):
|
||||
needs_subscription = True
|
||||
max_articles_per_feed = 10
|
||||
timefmt = ' [%a, %b %d, %Y]'
|
||||
html_description = True
|
||||
no_stylesheets = False
|
||||
html2lrf_options = [('--ignore-tables')]
|
||||
html2lrf_options = ['--ignore-tables', '--base-font-size=5']
|
||||
|
||||
## Don't grab articles more than 7 days old
|
||||
oldest_article = 7
|
||||
|
Loading…
x
Reference in New Issue
Block a user