From e39dc4223f880d6b721e75d35aaa36ac5ad96971 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 19 Aug 2007 21:42:21 +0000 Subject: [PATCH] Fix #163 and auto-detect Baen files. --- src/libprs500/ebooks/lrf/html/convert_from.py | 115 +++++++++++------- src/libprs500/ebooks/lrf/html/demo/demo.html | 6 +- 2 files changed, 75 insertions(+), 46 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 8cb3eb9df9..bc8a4d6af7 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -53,10 +53,9 @@ class Span(_Span): @staticmethod - def unit_convert(val, dpi, ref=80, pts=False): + def unit_convert(val, dpi, pts=False): """ - Tries to convert html units stored in C{val} to pixels. - @param ref: reference size in pixels for % units. + Tries to convert html units stored in C{val} to pixels. Assumes 100% = 10pt @param pts: If True return 10*pts instead of pixels. @return: The number of pixels (an int) if successful. Otherwise, returns None. Assumes: One em is 10pts @@ -70,7 +69,8 @@ class Span(_Span): if m is not None: unit = float(m.group(1)) if m.group(2) == '%': - result = int(unit/100.0*ref) + normal = Span.unit_convert('10pt', dpi) + result = int((unit/100.0)*normal) elif m.group(2) == 'px': result = int(unit) elif m.group(2) == 'in': @@ -85,14 +85,13 @@ class Span(_Span): result = int(unit * 0.04 * (dpi/72.)) elif m.group(2)== 'cm': result = int(unit * 0.4 * (dpi/72.)) - if result is None: - result = 0 if pts: - result = int((float(result)/dpi)*720) + if result is not None: + result = int((float(result)/dpi)*720) return result @staticmethod - def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None): + def translate_font_attrs(d, dpi, fonts, logger, font_delta=0, memory=None): """ Receives a dictionary of html attributes and styles and returns approximate Xylog equivalents in a new dictionary @@ -141,16 +140,13 @@ class Span(_Span): def font_size(val): - # Assumes a 10 pt font (14 pixels) has fontsize 100 - ans = None - normal = 14 - unit = Span.unit_convert(val, dpi, normal) - if unit: - if unit < 0: - unit = normal + unit - if unit < 0: - unit = normal - ans = int(unit * (72./dpi) * 10) + normal = 100 #10*pts + ans = Span.unit_convert(val, dpi, pts=True) + if ans: + if ans < 0: + ans += normal + if ans < 0: + ans = normal else: if "xx-small" in val: ans = 40 @@ -211,7 +207,7 @@ class Span(_Span): variant = font_variant(val) if variant: t['fontvariant'] = variant - else: + elif memory is not None: report = True if memory != None: if key in memory: @@ -221,8 +217,10 @@ class Span(_Span): if report: logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key]) t['fontfacename'] = (family, font_key(family, style, weight)) - if t.has_key('fontsize') and int(t['fontsize']) > 120: - t['wordspace'] = 50 + if t.has_key('fontsize'): + if int(t['fontsize']) > 120: + t['wordspace'] = 50 + t['baselineskip'] = int(t['fontsize']) + 20 return t def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style, @@ -231,7 +229,7 @@ class Span(_Span): for pat, repl in Span.rules: src = pat.sub(repl, src) src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup - attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory) + attrs = Span.translate_font_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory) if 'fontsize' in attrs.keys(): normal_font_size = int(attrs['fontsize']) variant = attrs.pop('fontvariant', None) @@ -259,13 +257,14 @@ class Span(_Span): attrs['fontweight'] = 700 if key in ['italic', 'bi']: src = Italic(src) - if 'fontsize' in attrs.keys(): - attrs['baselineskip'] = int(attrs['fontsize']) + 20 if attrs['fontfacename'] == fonts['serif']['normal'][1]: attrs.pop('fontfacename') + unneeded = [] for key in attrs: if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]): - attrs.pop(key) + unneeded.append(key) + for key in unneeded: + attrs.pop(key) self.text_src = src self.span_needed = bool(attrs) _Span.__init__(self, text=src, **attrs) @@ -395,6 +394,10 @@ class HTMLConverter(object): self.book = book #: The Book object representing a BBeB book self.start_on_file(path, is_root=True) + def is_baen(self, soup): + return bool(soup.find('meta', attrs={'name':'Publisher', + 'content':re.compile('Baen', re.IGNORECASE)})) + def start_on_file(self, path, is_root=True, link_level=0): path = os.path.abspath(path) os.chdir(os.path.dirname(path)) @@ -413,6 +416,10 @@ class HTMLConverter(object): soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=nmassage) + if not self.baen and self.is_baen(soup): + self.baen = True + self.logger.info('Baen file detected. Re-parsing...') + return self.start_on_file(path, is_root=is_root, link_level=link_level) self.logger.info('\tConverting to BBeB...') sys.stdout.flush() self.current_page = None @@ -990,7 +997,7 @@ class HTMLConverter(object): self.logger.debug('Forcing page break at %s', tagname) return end_page - def process_block(self, tag, tag_css, tkey): + def process_block(self, tag, tag_css, tkey): ''' Ensure padding and text-indent properties are respected ''' if tag_css.has_key('text-indent'): indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True) @@ -998,7 +1005,6 @@ class HTMLConverter(object): indent = 0 if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent: indent = self.minimum_indent - else: indent = self.book.defaultTextStyle.attrs['parindent'] @@ -1017,14 +1023,32 @@ class HTMLConverter(object): top = Span.unit_convert(top, self.profile.dpi) if top is not None else 0 bottom = Span.unit_convert(bottom, self.profile.dpi) if bottom is not None else 0 left = Span.unit_convert(left, self.profile.dpi) if left is not None else 0 - - if indent != int(self.current_block.textStyle.attrs['parindent']) or \ + fonts = Span.translate_font_attrs(tag_css, self.profile.dpi, self.fonts, + self.logger, self.font_delta, None) + fonts_changed = False + fonts.pop('fontvariant', None) + family, key = fonts['fontfacename'] + if self.fonts[family].has_key(key): + fonts['fontfacename'] = self.fonts[family][key][1] + else: + fonts['fontfacename'] = self.fonts[family]['normal'][1] + for key in fonts.keys(): + if str(self.current_block.textStyle.attrs[key]) != str(fonts[key]): + fonts_changed = True + break + if fonts_changed or \ + indent != int(self.current_block.textStyle.attrs['parindent']) or \ top != int(self.current_block.blockStyle.attrs['topskip']) or \ bottom != int(self.current_block.blockStyle.attrs['footskip']) or \ left != int(self.current_block.blockStyle.attrs['sidemargin']): + self.current_block.append_to(self.current_page) ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts.attrs['parindent'] = indent + for key in ('fontfacename', 'fontsize', 'fontwidth', 'wordspace', 'baselineskip'): + ts.attrs[key] = self.book.defaultTextStyle.attrs[key] + for key in fonts: + ts.attrs[key] = fonts[key] bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) ba = bs.attrs ba['topskip'], ba['footskip'], ba['sidemargin'] = top, bottom, left @@ -1177,7 +1201,7 @@ class HTMLConverter(object): elif tagname == 'pre': self.end_current_para() self.end_current_block() - self.current_block.textStyle = self.current_block.textStyle.copy() + self.current_block = self.book.create_text_block() self.current_block.textStyle.attrs['parindent'] = '0' if tag.contents: c = tag.contents[0] @@ -1247,14 +1271,14 @@ class HTMLConverter(object): self.current_block.append_to(self.current_page) pb = self.current_block self.current_para = Paragraph() - ts = self.book.create_text_style(**self.current_block.textStyle.attrs) + ts = self.book.create_text_style() ts.attrs['parindent'] = 0 try: index = self.text_styles.index(ts) ts = self.text_styles[index] except ValueError: self.text_styles.append(ts) - bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) + bs = self.book.create_block_style() bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \ 60, 20, 20 try: @@ -1297,19 +1321,25 @@ class HTMLConverter(object): self.logger.debug('Detected chapter %s', src) self.end_page() self.page_break_found = True - self.end_current_para() - if not tag.contents or not src.strip(): # Handle empty

elements + if not tag.contents: self.current_block.append(CR()) - self.previous_text = '\n' - self.process_children(tag, tag_css) + self.current_block.must_append = True return - self.previous_text = '\n' - self.process_block(tag, tag_css, tkey) - self.process_children(tag, tag_css) - self.end_current_para() - if tagname.startswith('h') or self.blank_after_para: + if not self.in_table: + self.process_block(tag, tag_css, tkey) + if self.current_para.contents: + self.current_block.append(self.current_para) + if self.current_block.contents: self.current_block.append(CR()) - self.previous_text = '\n' + self.previous_text = '\n' + self.current_para = Paragraph() + + self.process_children(tag, tag_css) + if self.current_para.contents: + self.current_block.append(self.current_para) + self.current_para = Paragraph() + if tagname.startswith('h') or self.blank_after_para: + self.current_block.append(CR()) elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']: self.process_children(tag, tag_css) elif tagname == 'font': @@ -1350,6 +1380,7 @@ class HTMLConverter(object): def process_table(self, tag, tag_css): self.end_current_block() + self.current_block = self.book.create_text_block() rowpad = 10 table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10) canvases = [] diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html index 62fce386ae..5b25c3d2cf 100644 --- a/src/libprs500/ebooks/lrf/html/demo/demo.html +++ b/src/libprs500/ebooks/lrf/html/demo/demo.html @@ -24,7 +24,7 @@

Lists

-

+

Nested lists

  1. Item 1
  2. @@ -37,9 +37,7 @@
  • Item 2
  • - -

    -
    +

    Definition Lists