Fix #163 and auto-detect Baen files.

2025-07-09 03:04:10 -04:00 · 2007-08-19 21:42:21 +00:00 · 2007-08-19 21:42:21 +00:00 · e39dc4223f
commit e39dc4223f
parent 04fbb91fae
2 changed files with 75 additions and 46 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -53,10 +53,9 @@ class Span(_Span):
    
    
    @staticmethod
-    def unit_convert(val, dpi, ref=80, pts=False):
+    def unit_convert(val, dpi, pts=False):
        """
-        Tries to convert html units stored in C{val} to pixels. 
-        @param ref: reference size in pixels for % units.
+        Tries to convert html units stored in C{val} to pixels. Assumes 100% = 10pt 
        @param pts: If True return 10*pts instead of pixels.
        @return: The number of pixels (an int) if successful. Otherwise, returns None.
        Assumes: One em is 10pts
@ -70,7 +69,8 @@ class Span(_Span):
        if m is not None:
            unit = float(m.group(1))
            if m.group(2) == '%':
-                result = int(unit/100.0*ref)
+                normal = Span.unit_convert('10pt', dpi)
+                result = int((unit/100.0)*normal)
            elif m.group(2) == 'px':
                result =  int(unit)
            elif m.group(2) == 'in':
@ -85,14 +85,13 @@ class Span(_Span):
                result =  int(unit * 0.04 * (dpi/72.))
            elif m.group(2)== 'cm':
                result =  int(unit * 0.4 * (dpi/72.))
-        if result is None:
-            result = 0
        if pts:
-            result = int((float(result)/dpi)*720)
+            if result is not None:
+                result = int((float(result)/dpi)*720)
        return result
    
    @staticmethod
-    def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
+    def translate_font_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary
@ -141,16 +140,13 @@ class Span(_Span):
            
        
        def font_size(val):
-            # Assumes a 10 pt font (14 pixels) has fontsize 100
-            ans = None
-            normal = 14
-            unit = Span.unit_convert(val, dpi, normal)            
-            if unit:
-                if unit < 0:
-                    unit = normal + unit
-                    if unit < 0:
-                        unit = normal
-                ans = int(unit * (72./dpi) * 10)
+            normal = 100 #10*pts
+            ans = Span.unit_convert(val, dpi, pts=True)
+            if ans:
+                if ans < 0:
+                    ans += normal
+                    if ans < 0:
+                        ans = normal
            else:
                if "xx-small" in val:
                    ans = 40
@ -211,7 +207,7 @@ class Span(_Span):
                variant = font_variant(val)
                if variant:
                    t['fontvariant'] = variant
-            else:
+            elif memory is not None:
                report = True
                if memory != None:
                    if key in memory:
@ -221,8 +217,10 @@ class Span(_Span):
                if report:
                    logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
        t['fontfacename'] = (family, font_key(family, style, weight))
-        if t.has_key('fontsize') and int(t['fontsize']) > 120:
-            t['wordspace'] = 50
+        if t.has_key('fontsize'):
+            if int(t['fontsize']) > 120:
+                t['wordspace'] = 50
+            t['baselineskip'] = int(t['fontsize']) + 20
        return t
    
    def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style,
@ -231,7 +229,7 @@ class Span(_Span):
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup
-        attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
+        attrs = Span.translate_font_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
        if 'fontsize' in attrs.keys():
            normal_font_size = int(attrs['fontsize'])
        variant = attrs.pop('fontvariant', None)
@ -259,13 +257,14 @@ class Span(_Span):
                attrs['fontweight'] = 700
            if key in ['italic', 'bi']:
                src = Italic(src)
-        if 'fontsize' in attrs.keys():
-            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        if attrs['fontfacename'] == fonts['serif']['normal'][1]:
            attrs.pop('fontfacename')
+        unneeded = []
        for key in attrs:
            if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
-                attrs.pop(key)
+                unneeded.append(key)
+        for key in unneeded:
+            attrs.pop(key)
        self.text_src = src
        self.span_needed = bool(attrs)
        _Span.__init__(self, text=src, **attrs)
@ -395,6 +394,10 @@ class HTMLConverter(object):
        self.book = book                #: The Book object representing a BBeB book
        self.start_on_file(path, is_root=True)
        
+    def is_baen(self, soup):
+        return bool(soup.find('meta', attrs={'name':'Publisher', 
+                        'content':re.compile('Baen', re.IGNORECASE)}))
+    
    def start_on_file(self, path, is_root=True, link_level=0):
        path = os.path.abspath(path)
        os.chdir(os.path.dirname(path))
@ -413,6 +416,10 @@ class HTMLConverter(object):
        soup = BeautifulSoup(raw, 
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)
+        if not self.baen and self.is_baen(soup):
+            self.baen = True
+            self.logger.info('Baen file detected. Re-parsing...')
+            return self.start_on_file(path, is_root=is_root, link_level=link_level)
        self.logger.info('\tConverting to BBeB...')
        sys.stdout.flush()        
        self.current_page = None
@ -998,7 +1005,6 @@ class HTMLConverter(object):
                indent = 0
            if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent:
                indent = self.minimum_indent
-            
        else:
            indent = self.book.defaultTextStyle.attrs['parindent']
            
@ -1017,14 +1023,32 @@ class HTMLConverter(object):
        top = Span.unit_convert(top, self.profile.dpi) if top is not None else 0
        bottom = Span.unit_convert(bottom, self.profile.dpi) if bottom is not None else 0
        left = Span.unit_convert(left, self.profile.dpi) if left is not None else 0
-        
-        if indent != int(self.current_block.textStyle.attrs['parindent']) or \
+        fonts = Span.translate_font_attrs(tag_css, self.profile.dpi, self.fonts, 
+                                          self.logger, self.font_delta, None)
+        fonts_changed = False
+        fonts.pop('fontvariant', None)
+        family, key = fonts['fontfacename']
+        if self.fonts[family].has_key(key):
+            fonts['fontfacename'] = self.fonts[family][key][1]
+        else:
+            fonts['fontfacename'] = self.fonts[family]['normal'][1]
+        for key in fonts.keys():
+            if str(self.current_block.textStyle.attrs[key]) != str(fonts[key]):
+                fonts_changed = True
+                break 
+        if fonts_changed or \
+           indent != int(self.current_block.textStyle.attrs['parindent']) or \
           top    != int(self.current_block.blockStyle.attrs['topskip'])   or \
           bottom != int(self.current_block.blockStyle.attrs['footskip'])  or \
           left   != int(self.current_block.blockStyle.attrs['sidemargin']):
+           
            self.current_block.append_to(self.current_page)
            ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
            ts.attrs['parindent'] = indent
+            for key in ('fontfacename', 'fontsize', 'fontwidth', 'wordspace', 'baselineskip'):
+                ts.attrs[key] = self.book.defaultTextStyle.attrs[key]
+            for key in fonts:
+                ts.attrs[key] = fonts[key]
            bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
            ba = bs.attrs
            ba['topskip'], ba['footskip'], ba['sidemargin'] = top, bottom, left            
@ -1177,7 +1201,7 @@ class HTMLConverter(object):
        elif tagname == 'pre':
            self.end_current_para()
            self.end_current_block()
-            self.current_block.textStyle = self.current_block.textStyle.copy()
+            self.current_block = self.book.create_text_block()
            self.current_block.textStyle.attrs['parindent'] = '0'
            if tag.contents:
                c = tag.contents[0]
@ -1247,14 +1271,14 @@ class HTMLConverter(object):
            self.current_block.append_to(self.current_page)
            pb = self.current_block
            self.current_para = Paragraph()
-            ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
+            ts = self.book.create_text_style()
            ts.attrs['parindent'] = 0
            try:
                index = self.text_styles.index(ts)
                ts = self.text_styles[index]
            except ValueError:
                self.text_styles.append(ts)
-            bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
+            bs = self.book.create_block_style()
            bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
            60, 20, 20
            try:
@ -1297,19 +1321,25 @@ class HTMLConverter(object):
                    self.logger.debug('Detected chapter %s', src)
                    self.end_page()
                    self.page_break_found = True
-            self.end_current_para()
-            if not tag.contents or not src.strip(): # Handle empty <p></p> elements
+            if not tag.contents:
                self.current_block.append(CR())
-                self.previous_text = '\n'
-                self.process_children(tag, tag_css)
+                self.current_block.must_append = True
                return
+            if not self.in_table:
+                self.process_block(tag, tag_css, tkey)
+            if self.current_para.contents:
+                self.current_block.append(self.current_para)            
+            if self.current_block.contents:
+                self.current_block.append(CR())
            self.previous_text = '\n'
-            self.process_block(tag, tag_css, tkey)
+            self.current_para = Paragraph()
+            
            self.process_children(tag, tag_css)
-            self.end_current_para()
+            if self.current_para.contents:
+                self.current_block.append(self.current_para)
+            self.current_para = Paragraph()
            if tagname.startswith('h') or self.blank_after_para:
                self.current_block.append(CR())                            
-                self.previous_text = '\n'            
        elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
            self.process_children(tag, tag_css)
        elif tagname == 'font':
@ -1350,6 +1380,7 @@ class HTMLConverter(object):
                    
    def process_table(self, tag, tag_css):
        self.end_current_block()
+        self.current_block = self.book.create_text_block()
        rowpad = 10
        table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
        canvases = []
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@ -24,7 +24,7 @@
 </ul>

 <h2><a name='lists'>Lists</a></h2>
- <p></p>
+ 
 <h3>Nested lists</h3>
 <ol>
   <li>Item 1</li>
@ -38,8 +38,6 @@
   </ul>
   <li>Item 2</li>
 </ol> 
- </p>
- <br/>
 <p></p>
 <h3>Definition Lists</h3>
 <dl>