diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index bc8a4d6af7..20527b059b 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -35,8 +35,8 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ - LrsError, Sup, Sub -from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span + LrsError, Sup, Sub, properties_different +from libprs500.ebooks.lrf.pylrs.pylrs import Span from libprs500.ebooks.lrf import Book from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError @@ -45,234 +45,16 @@ from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__ from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ebooks.metadata.opf import OPFReader -class Span(_Span): - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] - patterns = [ re.compile('&'+i+';') for i in replaced_entities ] - targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] - rules = zip(patterns, targets) - - - @staticmethod - def unit_convert(val, dpi, pts=False): - """ - Tries to convert html units stored in C{val} to pixels. Assumes 100% = 10pt - @param pts: If True return 10*pts instead of pixels. - @return: The number of pixels (an int) if successful. Otherwise, returns None. - Assumes: One em is 10pts - """ - result = None - try: - result = int(val) - except ValueError: - pass - m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val) - if m is not None: - unit = float(m.group(1)) - if m.group(2) == '%': - normal = Span.unit_convert('10pt', dpi) - result = int((unit/100.0)*normal) - elif m.group(2) == 'px': - result = int(unit) - elif m.group(2) == 'in': - result = int(unit * dpi) - elif m.group(2) == 'pt': - result = int(unit * dpi/72.) - elif m.group(2)== 'em': - result = int(unit * (dpi/72.) * 10) - elif m.group(2)== 'pc': - result = int(unit * (dpi/72.) * 12) - elif m.group(2)== 'mm': - result = int(unit * 0.04 * (dpi/72.)) - elif m.group(2)== 'cm': - result = int(unit * 0.4 * (dpi/72.)) - if pts: - if result is not None: - result = int((float(result)/dpi)*720) - return result - - @staticmethod - def translate_font_attrs(d, dpi, fonts, logger, font_delta=0, memory=None): - """ - Receives a dictionary of html attributes and styles and returns - approximate Xylog equivalents in a new dictionary - """ - def font_weight(val): - ans = 0 - m = re.search("([0-9]+)", val) - if m: - ans = int(m.group(1)) - elif val.find("bold") >= 0 or val.find("strong") >= 0: - ans = 700 - return 'bold' if ans >= 700 else 'normal' - - def font_style(val): - ans = 'normal' - if 'italic' in val or 'oblique' in val: - ans = 'italic' - return ans - - def font_family(val): - ans = 'serif' - if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0: - ans = 'mono' - elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"), - val.find("trebuchet"), val.find("sans")) >= 0: - ans = 'sans' - return ans - - def font_variant(val): - ans = None - if 'small-caps' in val.lower(): - ans = 'small-caps' - return ans - - def font_key(family, style, weight): - key = 'normal' - if style == 'italic' and weight == 'normal': - key = 'italic' - elif style == 'normal' and weight == 'bold': - key = 'bold' - elif style == 'italic' and weight == 'bold': - key = 'bi' - return key - - - - - def font_size(val): - normal = 100 #10*pts - ans = Span.unit_convert(val, dpi, pts=True) - if ans: - if ans < 0: - ans += normal - if ans < 0: - ans = normal - else: - if "xx-small" in val: - ans = 40 - elif "x-small" in val: - ans = 60 - elif "small" in val: - ans = 80 - elif "xx-large" in val: - ans = 180 - elif "x-large" in val: - ans = 140 - elif "large" in val: - ans = 120 - if ans is not None: - ans += int(font_delta * 20) - ans = str(ans) - return ans - - t = dict() - family, weight, style, variant = 'serif', 'normal', 'normal', None - for key in d.keys(): - val = d[key].lower() - if key == 'font': - vals = val.split() - for val in vals: - family = font_family(val) - if family != 'serif': - break - for val in vals: - weight = font_weight(val) - if weight != 'normal': - break - for val in vals: - style = font_style(val) - if style != 'normal': - break - for val in vals: - sz = font_size(val) - if sz: - t['fontsize'] = sz - break - for val in vals: - variant = font_variant(val) - if variant: - t['fontvariant'] = variant - break - elif key in ['font-family', 'font-name']: - family = font_family(val) - elif key == "font-size": - ans = font_size(val) - if ans: - t['fontsize'] = ans - elif key == 'font-weight': - weight = font_weight(val) - elif key == 'font-style': - style = font_style(val) - elif key == 'font-variant': - variant = font_variant(val) - if variant: - t['fontvariant'] = variant - elif memory is not None: - report = True - if memory != None: - if key in memory: - report = False - else: - memory.append(key) - if report: - logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key]) - t['fontfacename'] = (family, font_key(family, style, weight)) - if t.has_key('fontsize'): - if int(t['fontsize']) > 120: - t['wordspace'] = 50 - t['baselineskip'] = int(t['fontsize']) + 20 - return t - - def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style, - normal_font_size=100): - src = ns.string if hasattr(ns, 'string') else ns - for pat, repl in Span.rules: - src = pat.sub(repl, src) - src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup - attrs = Span.translate_font_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory) - if 'fontsize' in attrs.keys(): - normal_font_size = int(attrs['fontsize']) - variant = attrs.pop('fontvariant', None) - if variant == 'small-caps': - dump = _Span(fontsize=normal_font_size-30) - temp = [] - for c in src: - if c.isupper(): - if temp: - dump.append(''.join(temp)) - temp = [] - dump.append(_Span(c, fontsize=normal_font_size)) - else: - temp.append(c.upper()) - src = dump - if temp: - src.append(''.join(temp)) - - family, key = attrs['fontfacename'] - if fonts[family].has_key(key): - attrs['fontfacename'] = fonts[family][key][1] - else: - attrs['fontfacename'] = fonts[family]['normal'][1] - if key in ['bold', 'bi']: - attrs['fontweight'] = 700 - if key in ['italic', 'bi']: - src = Italic(src) - if attrs['fontfacename'] == fonts['serif']['normal'][1]: - attrs.pop('fontfacename') - unneeded = [] - for key in attrs: - if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]): - unneeded.append(key) - for key in unneeded: - attrs.pop(key) - self.text_src = src - self.span_needed = bool(attrs) - _Span.__init__(self, text=src, **attrs) class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) + replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] + patterns = [ re.compile('&'+i+';') for i in replaced_entities ] + targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] + ENTITY_RULES = zip(patterns, targets) + MARKUP_MASSAGE = [ # Close tags @@ -559,7 +341,7 @@ class HTMLConverter(object): def create_link(self, children, tag): para = None for i in range(len(children)-1, -1, -1): - if isinstance(children[i], _Span): + if isinstance(children[i], Span): para = children[i] break if para is None: @@ -778,17 +560,51 @@ class HTMLConverter(object): if self.process_alignment(css) and collapse_whitespace: # Dont want leading blanks in a new paragraph src = src.lstrip() + def append_text(src): - span = Span(src, self.sanctify_css(css), self.memory, self.profile.dpi, - self.fonts, self.logger, self.font_delta, - self.current_block.textStyle.attrs) - if span.span_needed or force_span_use: - self.current_para.append(span) - else: - if hasattr(span.text_src, 'parent'): - span.text_src.parent.contents = [] - span.text_src.parent = None - self.current_para.append(span.text_src) + fp, key, variant = self.font_properties(css) + for pat, repl in self.__class__.ENTITY_RULES: + src = pat.sub(repl, src) + src = src.replace(u'\xa0', ' ')# nbsp is replaced with \xa0 by BeatifulSoup + normal_font_size = int(fp['fontsize']) + if variant == 'small-caps': + dump = Span(fontsize=normal_font_size-30) + temp = [] + for c in src: + if c.isupper(): + if temp: + dump.append(''.join(temp)) + temp = [] + dump.append(Span(c, fontsize=normal_font_size)) + else: + temp.append(c.upper()) + src = dump + if temp: + src.append(''.join(temp)) + + + if key in ['italic', 'bi']: + already_italic = False + for fonts in self.fonts.values(): + it = fonts['italic'][1] if fonts.has_key('italic') else '' + bi = fonts['bi'][1] if fonts.has_key('bi') else '' + if fp['fontfacename'] in (it, bi): + already_italic = True + break + if not already_italic: + src = Italic(src) + + unneeded = [] + for prop in fp: + if fp[prop] == self.current_block.textStyle.attrs[prop]: + unneeded.append(prop) + for prop in unneeded: + fp.pop(prop) + + elem = Span(text=src, **fp) if (fp or force_span_use) else src + self.current_para.append(elem) + + if collapse_whitespace: src = re.sub(r'\s{1,}', ' ', src) if len(self.previous_text) != len(self.previous_text.rstrip()): @@ -808,19 +624,6 @@ class HTMLConverter(object): self.current_para.append(CR()) self.previous_text = '\n' - def sanctify_css(self, css): - """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """ - css = copy.copy(css) - for key in css.keys(): - test = key.lower() - if test.startswith('margin') or test.startswith('text') or \ - 'padding' in test or 'border' in test or 'page-break' in test \ - or test.startswith('mso') or test.startswith('background')\ - or test.startswith('line') or test in ['color', 'display', \ - 'letter-spacing', 'position', 'white-space']: - css.pop(key) - return css - def end_current_para(self): ''' End current paragraph with a paragraph break after it. If the current @@ -997,17 +800,11 @@ class HTMLConverter(object): self.logger.debug('Forcing page break at %s', tagname) return end_page - def process_block(self, tag, tag_css, tkey): - ''' Ensure padding and text-indent properties are respected ''' - if tag_css.has_key('text-indent'): - indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True) - if not indent: - indent = 0 - if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent: - indent = self.minimum_indent - else: - indent = self.book.defaultTextStyle.attrs['parindent'] - + def block_properties(self, tag_css): + ans = {} + for key in ('topskip', 'footskip', 'sidemargin'): + ans[key] = self.book.defaultBlockStyle.attrs[key] + src = [None for i in range(4)] if tag_css.has_key('padding'): msrc = tag_css['padding'].split() @@ -1018,40 +815,211 @@ class HTMLConverter(object): if tag_css.has_key('padding-'+c): src[i] = tag_css['padding-'+c] i += 1 - top, right, bottom, left = src - top = Span.unit_convert(top, self.profile.dpi) if top is not None else 0 - bottom = Span.unit_convert(bottom, self.profile.dpi) if bottom is not None else 0 - left = Span.unit_convert(left, self.profile.dpi) if left is not None else 0 - fonts = Span.translate_font_attrs(tag_css, self.profile.dpi, self.fonts, - self.logger, self.font_delta, None) - fonts_changed = False - fonts.pop('fontvariant', None) - family, key = fonts['fontfacename'] + t = {} + t['topskip'], t['footskip'], t['sidemargin'] = src[0], src[2], src[3] + for key in ('topskip', 'footskip', 'sidemargin'): + if t[key] is not None: + ans[key] = self.unit_convert(t[key]) + + return ans + + def font_properties(self, css): + ''' + Convert the font propertiess in css to the Xylog equivalents. If the CSS + does not contain a particular font property, the default from self.book.defaultTextSytle + is used. + @return: dict, key, variant. The dict contains the Xlog equivalents. key indicates + the font type (i.e. bold, bi, normal) and variant is None or 'small-caps' + ''' + t = {} + for key in ('fontwidth', 'fontsize', 'wordspace', 'fontfacename', 'fontweight', 'baselineskip'): + t[key] = self.book.defaultTextStyle.attrs[key] + + def font_weight(val): + ans = 0 + m = re.search("([0-9]+)", val) + if m: + ans = int(m.group(1)) + elif val.find("bold") >= 0 or val.find("strong") >= 0: + ans = 700 + return 'bold' if ans >= 700 else 'normal' + + def font_style(val): + ans = 'normal' + if 'italic' in val or 'oblique' in val: + ans = 'italic' + return ans + + def font_family(val): + ans = 'serif' + if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0: + ans = 'mono' + elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"), + val.find("trebuchet"), val.find("sans")) >= 0: + ans = 'sans' + return ans + + def font_variant(val): + ans = None + if 'small-caps' in val.lower(): + ans = 'small-caps' + return ans + + def font_key(family, style, weight): + key = 'normal' + if style == 'italic' and weight == 'normal': + key = 'italic' + elif style == 'normal' and weight == 'bold': + key = 'bold' + elif style == 'italic' and weight == 'bold': + key = 'bi' + return key + + def font_size(val): + normal = 100 #10*pts + ans = self.unit_convert(val, pts=True) + if ans: + if ans < 0: + ans += normal + if ans < 0: + ans = normal + else: + if "xx-small" in val: + ans = 40 + elif "x-small" in val: + ans = 60 + elif "small" in val: + ans = 80 + elif "xx-large" in val: + ans = 180 + elif "x-large" in val: + ans = 140 + elif "large" in val: + ans = 120 + if ans is not None: + ans += int(self.font_delta * 20) + ans = str(ans) + return ans + + family, weight, style, variant = 'serif', 'normal', 'normal', None + for key in css.keys(): + val = css[key].lower() + if key == 'font': + vals = val.split() + for val in vals: + family = font_family(val) + if family != 'serif': + break + for val in vals: + weight = font_weight(val) + if weight != 'normal': + break + for val in vals: + style = font_style(val) + if style != 'normal': + break + for val in vals: + sz = font_size(val) + if sz: + t['fontsize'] = sz + break + for val in vals: + variant = font_variant(val) + if variant: + t['fontvariant'] = variant + break + elif key in ['font-family', 'font-name']: + family = font_family(val) + elif key == "font-size": + ans = font_size(val) + if ans: + t['fontsize'] = ans + elif key == 'font-weight': + weight = font_weight(val) + elif key == 'font-style': + style = font_style(val) + elif key == 'font-variant': + variant = font_variant(val) + + key = font_key(family, style, weight) if self.fonts[family].has_key(key): - fonts['fontfacename'] = self.fonts[family][key][1] + t['fontfacename'] = self.fonts[family][key][1] else: - fonts['fontfacename'] = self.fonts[family]['normal'][1] - for key in fonts.keys(): - if str(self.current_block.textStyle.attrs[key]) != str(fonts[key]): - fonts_changed = True - break - if fonts_changed or \ - indent != int(self.current_block.textStyle.attrs['parindent']) or \ - top != int(self.current_block.blockStyle.attrs['topskip']) or \ - bottom != int(self.current_block.blockStyle.attrs['footskip']) or \ - left != int(self.current_block.blockStyle.attrs['sidemargin']): - + t['fontfacename'] = self.fonts[family]['normal'][1] + if key in ['bold', 'bi']: + t['fontweight'] = 700 + + fs = int(t['fontsize']) + if fs > 120: + t['wordspace'] = int(fs/4.) + t['baselineskip'] = fs + 20 + return t, key, variant + + def unit_convert(self, val, pts=False): + ''' + Tries to convert html units in C{val} to pixels. + Assumes: 1em = 100% = 10pts + @param pts: If True return 10*pts instead of pixels. + @return: The number of pixels (an int) if successful. Otherwise, returns None. + ''' + dpi = self.profile.dpi + result = None + try: + result = int(val) + except ValueError: + pass + m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val) + if m is not None: + unit = float(m.group(1)) + if m.group(2) == '%': + normal = self.unit_convert('10pt') + result = int((unit/100.0)*normal) + elif m.group(2) == 'px': + result = int(unit) + elif m.group(2) == 'in': + result = int(unit * dpi) + elif m.group(2) == 'pt': + result = int(unit * dpi/72.) + elif m.group(2)== 'em': + result = int(unit * (dpi/72.) * 10) + elif m.group(2)== 'pc': + result = int(unit * (dpi/72.) * 12) + elif m.group(2)== 'mm': + result = int(unit * 0.04 * (dpi/72.)) + elif m.group(2)== 'cm': + result = int(unit * 0.4 * (dpi/72.)) + if pts: + if result is not None: + result = int((float(result)/dpi)*720) + return result + + def text_properties(self, tag_css): + indent = self.book.defaultTextStyle.attrs['parindent'] + if tag_css.has_key('text-indent'): + indent = self.unit_convert(str(tag_css['text-indent']), pts=True) + if not indent: + indent = 0 + if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent: + indent = self.minimum_indent + + fp = self.font_properties(tag_css)[0] + fp['parindent'] = indent + return fp + + + def process_block(self, tag, tag_css, tkey): + ''' Ensure padding and text-indent properties are respected ''' + text_properties = self.text_properties(tag_css) + block_properties = self.block_properties(tag_css) + + if properties_different(self.current_block.blockStyle.attrs, block_properties) or \ + properties_different(self.current_block.textStyle.attrs, text_properties): + ts = self.current_block.textStyle.copy() + ts.attrs.update(text_properties) + bs = self.current_block.blockStyle.copy() + bs.attrs.update(block_properties) self.current_block.append_to(self.current_page) - ts = self.book.create_text_style(**self.current_block.textStyle.attrs) - ts.attrs['parindent'] = indent - for key in ('fontfacename', 'fontsize', 'fontwidth', 'wordspace', 'baselineskip'): - ts.attrs[key] = self.book.defaultTextStyle.attrs[key] - for key in fonts: - ts.attrs[key] = fonts[key] - bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) - ba = bs.attrs - ba['topskip'], ba['footskip'], ba['sidemargin'] = top, bottom, left try: index = self.text_styles.index(ts) ts = self.text_styles[index] @@ -1260,7 +1228,7 @@ class HTMLConverter(object): break parent = parent.parent prepend = str(self.list_counter)+'. ' if in_ol else u'\u2022' + ' ' - self.current_para.append(_Span(prepend)) + self.current_para.append(Span(prepend)) self.process_children(tag, tag_css) if in_ol: self.list_counter += 1 @@ -1325,8 +1293,7 @@ class HTMLConverter(object): self.current_block.append(CR()) self.current_block.must_append = True return - if not self.in_table: - self.process_block(tag, tag_css, tkey) + self.process_block(tag, tag_css, tkey) if self.current_para.contents: self.current_block.append(self.current_para) if self.current_block.contents: diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py index 8915e91f6f..a3e05acd0e 100644 --- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py +++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py @@ -1191,6 +1191,11 @@ class BlockStyle(LrsStyle): def __init__(self, **overrides): LrsStyle.__init__(self, "BlockStyle", self.defaults, **overrides) + + def copy(self): + tb = BlockStyle() + tb.attrs = self.attrs.copy() + return tb @@ -2350,3 +2355,8 @@ class Font(LrsContainer): +def properties_different(attrs1, attrs2): + for key in attrs1.keys(): + if attrs2.has_key(key) and str(attrs1[key]) != str(attrs2[key]): + return True + return False \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py index 951b9173fa..9d411dbaf7 100644 --- a/src/libprs500/ebooks/lrf/web/profiles.py +++ b/src/libprs500/ebooks/lrf/web/profiles.py @@ -71,7 +71,7 @@ profiles = { (r'', lambda match : ''''''), ] ],