A rewritten and much improved lrs2lrf

2025-07-09 03:04:10 -04:00 · 2008-01-28 02:10:18 +00:00 · 2008-01-28 02:10:18 +00:00 · da826622b0
commit da826622b0
parent c03508cf37
6 changed files with 331 additions and 1045 deletions
--- a/src/libprs500/ebooks/lrf/lrs/convert_from.py
+++ b/src/libprs500/ebooks/lrf/lrs/convert_from.py
--- a/src/libprs500/ebooks/lrf/objects.py
+++ b/src/libprs500/ebooks/lrf/objects.py
@ -601,7 +601,7 @@ class Text(LRFStream):
            s = u'<%s '%(self.name,)
            for name, val in self.attrs.items():
                s += '%s="%s" '%(name, val)
-            return s.rstrip() + (u' />' if self.self_closing else u'>') + (u'\n' if self.name in ('P', 'CR') else u'')
+            return s.rstrip() + (u' />' if self.self_closing else u'>')
        
    class Span(TextTag):
        pass
@ -760,8 +760,7 @@ class Text(LRFStream):
                s += c
            elif c is None:
                p = open_containers.pop()
-                nl = u'\n' if p.name == 'P' else u''
-                s += nl + u'</%s>'%(p.name,) + nl 
+                s += u'</%s>'%(p.name,) 
            else:
                s += unicode(c)
                if not c.self_closing: 
--- a/src/libprs500/ebooks/lrf/pylrs/pylrf.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrf.py
@ -81,7 +81,7 @@ def writeWord(f, word):
    f.write(struct.pack("<H", int(word)))

 def writeSignedWord(f, sword):
-    f.write(struct.pack("<h", int(sword)))
+    f.write(struct.pack("<h", int(float(sword))))

 def writeWords(f, *words):
    f.write(struct.pack("<%dH" % len(words), *words))
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@ -396,8 +396,8 @@ class Book(Delegator):
        booksetting=BookSetting()
        Override the default BookSetting.

-        setdefault=SetDefault()
-        Override the defalut SetDefault.
+        setdefault=StyleDefault()
+        Override the default StyleDefault (written out as the SetDefault element).
        
        There are several other settings -- see the BookInfo class for more.       
    """
@ -434,9 +434,12 @@ class Book(Delegator):
        self.defaultTextStyle = textStyle
        self.defaultBlockStyle = blockStyle
        LrsObject.nextObjId += 1
-
+        
+        styledefault = StyleDefault()
+        if settings.has_key('setdefault'):
+            styledefault = settings.pop('setdefault')
        Delegator.__init__(self, [BookInformation(), Main(),
-            Template(), Style(), Solos(), Objects()])        
+            Template(), Style(styledefault), Solos(), Objects()])        

        self.sourceencoding = None
        
@ -606,10 +609,10 @@ class Book(Delegator):
                    span.attrs['baselineskip'] = rescale(span.attrs['baselineskip'])
                
    
-    def renderLrs(self, lrsFile):
+    def renderLrs(self, lrsFile, encoding="UTF-8"):
        if isinstance(lrsFile, basestring): 
-            lrsFile = codecs.open(lrsFile, "wb", encoding="utf-16")
-        self.render(lrsFile)
+            lrsFile = codecs.open(lrsFile, "wb", encoding=encoding)
+        self.render(lrsFile, outputEncodingName=encoding)
        lrsFile.close()


@ -634,7 +637,7 @@ class Book(Delegator):
        return root


-    def render(self, f):
+    def render(self, f, outputEncodingName='UTF-8'):
        """ Write the book as an LRS to file f. """

        self.appendReferencedObjects(self)
@ -649,7 +652,8 @@ class Book(Delegator):

        writer = ElementWriter(root, header=True,
                               sourceEncoding=self.sourceencoding,
-                               spaceBeforeClose=False)
+                               spaceBeforeClose=False,
+                               outputEncodingName=outputEncodingName)
        writer.write(f)
        

@ -1010,12 +1014,33 @@ class Template(object):
        # does nothing
        pass

+class StyleDefault(LrsAttributes):
+    """
+        Supply some defaults for all TextBlocks.
+        The legal values are a subset of what is allowed on a
+        TextBlock -- ruby, emphasis, and waitprop settings.
+    """
+    # Baseline attribute values handed to LrsAttributes.__init__.
+    defaults = dict(rubyalign="start", rubyadjust="none", 
+                rubyoverhang="none", empdotsposition="before",
+                empdotsfontname="Dutch801 Rm BT Roman",
+                empdotscode="0x002e", emplineposition="after",
+                emplinetype = "solid", setwaitprop="noreplay")
+
+    # Forwarded to LrsAttributes.__init__ as ``alsoAllow``; presumably extra
+    # attribute names accepted beyond the keys of ``defaults`` -- TODO confirm
+    # against LrsAttributes.
+    alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
+
+    def __init__(self, **settings):       
+        """
+            Initialise through LrsAttributes using the class-level defaults
+            and alsoAllow; **settings are forwarded unchanged.
+        """
+        LrsAttributes.__init__(self, self.defaults,
+                alsoAllow=self.alsoAllow, **settings)
+        
+        
+    def toElement(self, se):
+        """
+            Return a "SetDefault" Element built from self.attrs.
+            The se argument is not used by this method.
+        """
+        # self.attrs is presumably populated by LrsAttributes.__init__ -- confirm.
+        return Element("SetDefault", self.attrs)


 class Style(LrsContainer, Delegator):
-    def __init__(self):
+    def __init__(self, styledefault=StyleDefault()):
        LrsContainer.__init__(self, [PageStyle, TextStyle, BlockStyle])
-        Delegator.__init__(self, [BookStyle()])
+        Delegator.__init__(self, [BookStyle(styledefault=styledefault)])
        self.bookStyle = self.delegates[0]
        self.appendPageStyle = self.appendTextStyle = \
                self.appendBlockStyle = self.append
@ -1071,10 +1096,10 @@ class Style(LrsContainer, Delegator):


 class BookStyle(LrsObject, LrsContainer):
-    def __init__(self):
+    def __init__(self, styledefault=StyleDefault()):
        LrsObject.__init__(self, assignId=True)
        LrsContainer.__init__(self, [Font])
-        self.styledefault = StyleDefault()
+        self.styledefault = styledefault
        self.booksetting = BookSetting()
        self.appendFont = self.append
        
@ -1119,27 +1144,6 @@ class BookStyle(LrsObject, LrsContainer):
    
    
 
-class StyleDefault(LrsAttributes):
-    """
-        Supply some defaults for all TextBlocks.
-        The legal values are a subset of what is allowed on a
-        TextBlock -- ruby, emphasis, and waitprop settings.
-    """
-    defaults = dict(rubyalign="start", rubyadjust="none", 
-                rubyoverhang="none", empdotsposition="before",
-                empdotsfontname="Dutch801 Rm BT Roman",
-                empdotscode="0x002e", emplineposition="after",
-                emplinetype = "solid", setwaitprop="noreplay")
-
-    alsoAllow = ["refempdotsfont", "rubyAlignAndAdjust"]
-
-    def __init__(self, **settings):       
-        LrsAttributes.__init__(self, self.defaults,
-                alsoAllow=self.alsoAllow, **settings)
-        
-        
-    def toElement(self, se):
-        return Element("SetDefault", self.attrs)
    
    

@ -1226,7 +1230,7 @@ class TextStyle(LrsStyle):
    """
    baseDefaults = dict(
            columnsep="0", charspace="0",
-            textlinewidth="10", align="head", linecolor="0x00000000",
+            textlinewidth="2", align="head", linecolor="0x00000000",
            column="1", fontsize="100", fontwidth="-10", fontescapement="0",
            fontorientation="0", fontweight="400",
            fontfacename="Dutch801 Rm BT Roman",
@ -2251,7 +2255,9 @@ class HeaderOrFooter(LrsObject, LrsContainer, LrsAttributes):
        LrsContainer.__init__(self, [PutObj])
        LrsAttributes.__init__(self, self.defaults, **settings)

-
+    def put_object(self, obj, x1, y1):
+        """ Append a PutObj constructed from obj, x1 and y1 to this container. """
+        self.append(PutObj(obj, x1, y1))
+        
    def PutObj(self, *args, **kwargs):
        p = PutObj(*args, **kwargs)
        self.append(p)
@ -2468,7 +2474,7 @@ class ImageBlock(LrsObject, LrsContainer, LrsAttributes):
    """ Create an image on a page. """
    # TODO: allow other block attributes

-    defaults = dict(blockwidth="600", blockheight="800") 
+    defaults = BlockStyle.baseDefaults.copy() 

    def __init__(self, refstream, x0="0", y0="0", x1="600", y1="800", 
                       xsize="600", ysize="800",  
--- a/src/libprs500/ebooks/lrf/web/profiles/init.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/init.py
@ -19,7 +19,7 @@ import tempfile, time, calendar, re, operator
 from htmlentitydefs import name2codepoint

 from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, CData, Tag


 class DefaultProfile(object):
@ -55,6 +55,7 @@ class DefaultProfile(object):
    # See the built-in profiles for examples of these settings.
    
    feeds = []
+    CDATA_PAT = re.compile(r'<\!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def get_feeds(self):
        '''
@ -68,7 +69,7 @@ class DefaultProfile(object):
    @classmethod
    def print_version(cls, url):
        '''
-        Takea a URL pointing to an article and returns the URL pointing to the
+        Take a URL pointing to an article and return the URL pointing to the
        print version of the article.
        '''
        return url
@ -157,6 +158,28 @@ class DefaultProfile(object):
        return index

    
+    @classmethod
+    def tag_to_string(cls, tag, use_alt=True):
+        '''
+        Convenience method to take a BeautifulSoup Tag and extract the text from it
+        recursively, including any CDATA sections and alt attributes.
+        @param tag: The Tag to extract text from. A false value (e.g. None) yields u''
+        @param use_alt: If True, use the alt attribute of tags (at any depth)
+                        that don't have any textual content
+        @return: A unicode (possibly empty) object
+        '''
+        if not tag:
+            # Return unicode, as documented, rather than a byte string.
+            return u''
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                # Propagate use_alt so nested tags honour the caller's choice;
+                # previously use_alt=False was ignored below the first level.
+                res = cls.tag_to_string(item, use_alt)
+                if res:
+                    strings.append(res)
+                elif use_alt and item.has_key('alt'):
+                    strings.append(item['alt'])
+        return u''.join(strings)
+    
    def parse_feeds(self, require_url=True):
        '''
        Create list of articles from a list of feeds.
@ -195,7 +218,7 @@ class DefaultProfile(object):
                        if not pubdate or not pubdate.string:
                            self.logger.debug('Skipping article as it does not have publication date')
                            continue
-                        pubdate = pubdate.string
+                        pubdate = self.tag_to_string(pubdate)
                        pubdate = pubdate.replace('+0000', 'GMT')
                    for element in self.url_search_order:
                        url = item.find(element)
@ -205,7 +228,7 @@ class DefaultProfile(object):
                    if require_url and (not url or not url.string):
                        self.logger.debug('Skipping article as it does not have a link url')
                        continue
-                    url = url.string if (url and url.string) else ''
+                    url = self.tag_to_string(url)
                    
                    content = item.find('content:encoded')
                    if not content:
@ -221,7 +244,7 @@ class DefaultProfile(object):
                        self.logger.debug('Skipping %s as could not find URL for print version. Error:\n%s'%(url, err))
                        continue
                    d = { 
-                        'title'    : item.find('title').string,                 
+                        'title'    : self.tag_to_string(item.find('title')),                 
                        'url'      : purl,
                        'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
                        'date'     : pubdate if self.use_pubdate else time.ctime(),
@ -263,7 +286,7 @@ class DefaultProfile(object):
    @classmethod
    def process_html_description(cls, tag, strip_links=True):
        src = '\n'.join(tag.contents)
-        match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
+        match = cls.CDATA_PAT.match(src.lstrip())
        if match:
            src = match.group(1)
        else:
--- a/src/libprs500/ebooks/lrf/web/profiles/wsj.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/wsj.py
@ -17,9 +17,7 @@ class WallStreetJournal(DefaultProfile):
        needs_subscription = True 
        max_articles_per_feed = 10
        timefmt  = ' [%a, %b %d, %Y]' 
-        html_description = True 
-        no_stylesheets = False
-        html2lrf_options = [('--ignore-tables')]
+        html2lrf_options = ['--ignore-tables', '--base-font-size=5']

        ## Don't grab articles more than 7 days old 
        oldest_article = 7