class Span(_Span):
    """
    A pylrs C{Span} built from a piece of HTML text and the CSS that applies
    to it. The text has a fixed set of HTML entities decoded and the CSS is
    translated (approximately) into the keyword attributes passed to the
    underlying C{_Span}.
    """

    # Entities decoded by __init__. 'amp' is deliberately LAST: the rules are
    # applied one after the other, so decoding '&amp;' first would turn
    # '&amp;lt;' into '&lt;' and then into '<' (double decoding).
    replaced_entities = ['lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo',
                         'nbsp', 'amp']
    patterns = [re.compile('&' + i + ';') for i in replaced_entities]
    targets = [unichr(name2codepoint[i]) for i in replaced_entities]
    rules = zip(patterns, targets)

    @staticmethod
    def unit_convert(val, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels.

        @param val: A string such as C{'12pt'}, C{'1.5em'} or C{'80%'}. Only
                    the start of the string is examined.
        @param ref: The reference value (in pixels) used for C{%} units.
        @return: The number of pixels (an int) if successful. Otherwise
                 (no leading number followed by a known unit), C{None}.

        Assumes: 1 pixel is 1/4 mm. One em is 10pts
        """
        # The number must contain at least one digit. Without that
        # requirement keywords that merely start with a unit name (e.g.
        # 'inherit' -> 'in') matched with an empty number and float('')
        # raised ValueError.
        m = re.match(
            r"\s*(-?(?:[0-9]+\.?[0-9]*|\.[0-9]+))\s*(%|em|px|mm|cm|in|pt|pc)",
            val)
        if m is None:
            return None

        unit = float(m.group(1))
        kind = m.group(2)
        result = None
        if kind == '%':
            result = int(unit/100.0*ref)
        elif kind == 'px':
            result = int(unit)
        elif kind == 'in':
            result = int(unit * 25.4 * 4)
        elif kind == 'pt':
            result = int(unit * 25.4 * 4 / 72)
        elif kind == 'em':
            result = int(unit * 25.4 * 4 / 72 * 10)
        elif kind == 'pc':
            result = int(unit * 25.4 * 4 / 72 * 12)
        elif kind == 'mm':
            result = int(unit * 4)
        elif kind == 'cm':
            result = int(unit * 10 * 4)
        return result

    @staticmethod
    def translate_attrs(d, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary

        @param d: Maps CSS property names to their (string) values.
        @param font_delta: Added, in steps of 20, to every computed font size.
        @param memory: Optional list of CSS keys that have already been
                       reported as unhandled. Keys reported by this call are
                       appended to it so that each key is reported only once.
                       If C{None}, every unhandled key is reported.
        @return: A new dictionary that may contain the keys C{fontfacename},
                 C{fontsize}, C{fontweight} and C{wordspace}; all values are
                 strings.
        """
        def font_weight(val):
            # A number anywhere in the value is used as is; otherwise the
            # keywords bold/strong map to the maximum weight.
            ans = None
            m = re.search(r"([0-9]+)", val)
            if m:
                ans = str(int(m.group(1)))
            elif "bold" in val or "strong" in val:
                ans = "1000"
            return ans

        def font_family(val):
            ans = None
            if max(val.find("courier"), val.find("mono"), val.find("fixed"),
                   val.find("typewriter")) >= 0:
                ans = "Courier10 BT Roman"
            elif max(val.find("arial"), val.find("helvetica"),
                     val.find("verdana"), val.find("trebuchet"),
                     val.find("sans")) >= 0:
                ans = "Swis721 BT Roman"
            return ans

        def font_size(val):
            ans = None
            unit = Span.unit_convert(val, 14)
            if unit:
                # Assume a 10 pt font (14 pixels) has fontsize 100
                ans = int(unit / 14.0 * 100)
            # Keyword sizes. The longer keywords are tested first because
            # e.g. 'small' is a substring of 'x-small'.
            elif "xx-small" in val:
                ans = 40
            elif "x-small" in val:
                ans = 60
            elif "small" in val:
                ans = 80
            elif "xx-large" in val:
                ans = 180
            elif "x-large" in val:
                ans = 140
            elif "large" in val:
                ans = 120
            if ans is not None:
                ans += font_delta * 20
                ans = str(ans)
            return ans

        t = dict()
        for key in d.keys():
            val = d[key].lower()
            if key == 'font':
                # Shorthand property: classify each whitespace separated
                # token. Tokens are visited in reverse so that, when several
                # tokens map to the same attribute, the one that appears
                # first in the declaration is the one that is kept.
                tokens = val.split()
                tokens.reverse()
                for sval in tokens:
                    ans = font_family(sval)
                    if ans:
                        t['fontfacename'] = ans
                    else:
                        ans = font_size(sval)
                        if ans:
                            t['fontsize'] = ans
                        else:
                            ans = font_weight(sval)
                            if ans:
                                t['fontweight'] = ans
            elif key in ['font-family', 'font-name']:
                ans = font_family(val)
                if ans:
                    t['fontfacename'] = ans
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                ans = font_weight(val)
                if ans:
                    t['fontweight'] = ans
                    # Heavier weights also get extra word spacing
                    if int(ans) > 140:
                        t['wordspace'] = '50'
            else:
                report = True
                if memory is not None:
                    if key in memory:
                        report = False
                    else:
                        memory.append(key)
                if report:
                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
        return t

    def __init__(self, ns, css, memory, font_delta=0):
        """
        @param ns: Either the text itself or an object with a C{string}
                   attribute holding the text.
        @param css: Dictionary of CSS properties that apply to the text.
                    NOTE: a C{font-style} entry, if present, is removed from
                    this dictionary.
        @param memory: Passed on to L{translate_attrs}.
        @param font_delta: Passed on to L{translate_attrs}.
        @raise ConversionError: If the text is empty after whitespace
                                collapsing and entity replacement.
        """
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
        # font-style is handled here (by wrapping the text) rather than in
        # translate_attrs, so it is removed before the css is translated.
        # NOTE(review): this mutates the caller's dictionary - confirm that
        # callers do not reuse it expecting font-style to still be present.
        if 'font-style' in css:
            fs = css.pop('font-style')
            if fs.lower() == 'italic':
                src = Italic(src)
        attrs = Span.translate_attrs(css, font_delta=font_delta, memory=memory)
        if 'fontsize' in attrs:
            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        _Span.__init__(self, text=src, **attrs)