## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net ## This work is based on htmlbbeb created by esperanc. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ import os, re, sys, shutil, traceback, copy, glob from htmlentitydefs import name2codepoint from urllib import unquote from urlparse import urlparse from tempfile import mkdtemp from operator import itemgetter from math import ceil, floor try: from PIL import Image as PILImage except ImportError: import Image as PILImage from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \ Comment, Tag, NavigableString, Declaration, ProcessingInstruction from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span from libprs500.ebooks.lrf import Book, PRS500_PROFILE from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table from libprs500 import extract, filename_to_utf8 
from libprs500.ptempfile import PersistentTemporaryFile


class Span(_Span):
    """
    A pylrs Span created from a piece of HTML text and the CSS that applies
    to it. The CSS is translated into the keyword arguments passed to the
    pylrs Span constructor.
    """

    # Entities that are replaced by the corresponding unicode character.
    # 'amp' has to be the last one: the replacements are applied one after
    # the other, so replacing '&amp;' first would turn '&amp;lt;' into '<'
    # instead of the literal text '&lt;'.
    replaced_entities = ['lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo',
                         'nbsp', 'amp']
    patterns = [re.compile('&' + i + ';') for i in replaced_entities]
    targets = [unichr(name2codepoint[i]) for i in replaced_entities]
    # (compiled pattern, replacement character) pairs. Stored as a list so
    # that it can be iterated over more than once.
    rules = list(zip(patterns, targets))

    @staticmethod
    def unit_convert(val, dpi, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels.

        @param val: String that starts with a number followed by one of the
                    units %, em, px, mm, cm, in, pt or pc.
        @param dpi: Number of pixels per inch.
        @param ref: reference size in pixels for % units.
        @return: The number of pixels (an int) if successful. Otherwise,
                 returns None.
        Assumes: One em is 10pts
        """
        # The number must contain at least one digit. Otherwise values like
        # 'inherit' would match the unit 'in' with an empty number and
        # float('') would raise a ValueError.
        m = re.match(
            r"\s*(-?(?:[0-9]+\.?[0-9]*|\.[0-9]+))\s*(%|em|px|mm|cm|in|pt|pc)",
            val)
        if m is None:
            return None
        unit = float(m.group(1))
        kind = m.group(2)
        result = None
        if kind == '%':
            result = int(unit/100.0*ref)
        elif kind == 'px':
            result = int(unit)
        elif kind == 'in':
            result = int(unit * dpi)
        elif kind == 'pt':
            result = int(unit * dpi/72.)
        elif kind == 'em':
            result = int(unit * (dpi/72.) * 10)
        elif kind == 'pc':
            result = int(unit * (dpi/72.) * 12)
        elif kind == 'mm':
            # 25.4 mm per inch, dpi pixels per inch
            result = int(unit * dpi / 25.4)
        elif kind == 'cm':
            # 2.54 cm per inch, dpi pixels per inch
            result = int(unit * dpi / 2.54)
        return result

    @staticmethod
    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary

        @param d: Dictionary mapping CSS property names to their values
                  (strings).
        @param dpi: Number of pixels per inch, used to convert font sizes.
        @param fonts: Not used by this method.
        @param font_delta: Every font size found is changed by
                           C{int(font_delta * 20)}.
        @param memory: Optional list of the unhandled keys that have already
                       been reported. Keys found in it are not reported
                       again, new unhandled keys are appended to it.
        @return: Dictionary that always has the key C{fontfacename}, a tuple
                 (family, key) with family one of serif/sans/mono and key
                 one of normal/italic/bold/bi. It has C{fontsize} (a string)
                 if a font size was found and C{wordspace} if that size is
                 greater than 120.
        """
        def font_weight(val):
            # A numeric weight wins over the keywords
            ans = 0
            m = re.search("([0-9]+)", val)
            if m:
                ans = int(m.group(1))
            elif "bold" in val or "strong" in val:
                ans = 700
            return 'bold' if ans >= 700 else 'normal'

        def font_style(val):
            if 'italic' in val or 'oblique' in val:
                return 'italic'
            return 'normal'

        def font_family(val):
            for hint in ("courier", "mono", "fixed", "typewriter"):
                if hint in val:
                    return 'mono'
            for hint in ("arial", "helvetica", "verdana", "trebuchet", "sans"):
                if hint in val:
                    return 'sans'
            return 'serif'

        def font_key(family, style, weight):
            key = 'normal'
            if style == 'italic' and weight == 'normal':
                key = 'italic'
            elif style == 'normal' and weight == 'bold':
                key = 'bold'
            elif style == 'italic' and weight == 'bold':
                key = 'bi'
            return key

        def font_size(val):
            ans = None
            unit = Span.unit_convert(val, dpi, 14)
            if unit:
                # Assume a 10 pt font (14 pixels) has fontsize 100
                ans = int(unit * (72./dpi) * 10)
            else:
                # The longer keywords have to be tested first as
                # 'small' is contained in 'x-small' and so on.
                if "xx-small" in val:
                    ans = 40
                elif "x-small" in val:
                    ans = 60
                elif "small" in val:
                    ans = 80
                elif "xx-large" in val:
                    ans = 180
                elif "x-large" in val:
                    ans = 140
                elif "large" in val:
                    ans = 120
            if ans is not None:
                ans += int(font_delta * 20)
                ans = str(ans)
            return ans

        t = dict()
        family, weight, style = 'serif', 'normal', 'normal'
        for key, raw in d.items():
            val = raw.lower()
            if key == 'font':
                # Shorthand property: for every aspect use the first word
                # that yields something other than the default.
                vals = val.split()
                for val in vals:
                    family = font_family(val)
                    if family != 'serif':
                        break
                for val in vals:
                    weight = font_weight(val)
                    if weight != 'normal':
                        break
                for val in vals:
                    style = font_style(val)
                    if style != 'normal':
                        break
                for val in vals:
                    sz = font_size(val)
                    if sz:
                        t['fontsize'] = sz
                        break
            elif key in ['font-family', 'font-name']:
                family = font_family(val)
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                weight = font_weight(val)
            elif key == 'font-style':
                style = font_style(val)
            else:
                # Report every unhandled key only once if the caller
                # supplied a memory
                report = True
                if memory is not None:
                    if key in memory:
                        report = False
                    else:
                        memory.append(key)
                if report:
                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
        t['fontfacename'] = (family, font_key(family, style, weight))
        if 'fontsize' in t and int(t['fontsize']) > 120:
            t['wordspace'] = 50
        return t

    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0):
        """
        @param ns: The text of the span. Either an object with a C{string}
                   attribute or the text itself.
        @param css: Dictionary of CSS properties, see L{translate_attrs}.
        @param memory: Passed on to L{translate_attrs}.
        @param dpi: Passed on to L{translate_attrs}.
        @param fonts: Mapping indexed as C{fonts[family][key][1]} to get the
                      font face name, with family and key as returned by
                      L{translate_attrs}. C{fonts[family]['normal']} and
                      C{fonts['serif']['normal']} are always looked up.
        @param font_delta: Passed on to L{translate_attrs}.
        @raise ConversionError: If the text is empty.
        """
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta,
                                     memory=memory)
        family, key = attrs['fontfacename']
        if key in fonts[family]:
            attrs['fontfacename'] = fonts[family][key][1]
        else:
            # The requested variant is not available in this family: use
            # the normal face and ask for bold/italic explicitly.
            attrs['fontfacename'] = fonts[family]['normal'][1]
            if key in ['bold', 'bi']:
                attrs['fontweight'] = 700
            if key in ['italic', 'bi']:
                src = Italic(src)
        if 'fontsize' in attrs:
            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        # The normal serif face is not passed on explicitly
        if attrs['fontfacename'] == fonts['serif']['normal'][1]:
            attrs.pop('fontfacename')
        _Span.__init__(self, text=src, **attrs)
comments from