## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import Book, PRS500_PROFILE
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
    """
    A pylrs Span whose attributes are derived from HTML/CSS properties.

    The constructor normalises the text (collapses runs of whitespace and
    replaces a fixed set of HTML entities), translates the CSS dictionary with
    L{translate_attrs} and hands the result to the underlying pylrs Span.
    """
    # Names of the HTML entities that are replaced by their unicode character
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    # (compiled pattern, replacement character) pairs applied by __init__
    rules = zip(patterns, targets)
    # A (possibly empty) number followed by a CSS unit, anchored by match()
    _UNIT_PAT = re.compile(r"\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)")

    @staticmethod
    def unit_convert(val, dpi, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels.
        @param val: A CSS length such as C{'12pt'}, C{'1.5em'} or C{'50%'}.
        @param dpi: Output resolution in dots per inch.
        @param ref: reference size in pixels for % units.
        @return: The number of pixels (an int) if successful. Otherwise, returns None.
        Assumes: One em is 10pts
        """
        m = Span._UNIT_PAT.match(val)
        if m is None:
            return None
        # The numeric part of the pattern may match '', '-' or '.', which
        # happens for keywords that merely start with a unit name (for
        # example 'inherit' or 'em'). Those are not lengths, so report
        # failure instead of letting float() raise.
        try:
            number = float(m.group(1))
        except ValueError:
            return None
        unit = m.group(2)
        result = None
        if unit == '%':
            result = int(number/100.0*ref)
        elif unit == 'px':
            result = int(number)
        elif unit == 'in':
            result = int(number * dpi)
        elif unit == 'pt':
            result = int(number * dpi/72.)
        elif unit == 'em':
            result = int(number * (dpi/72.) * 10)
        elif unit == 'pc':
            result = int(number * (dpi/72.) * 12)
        elif unit == 'mm':
            # 25.4 mm per inch
            result = int(number * dpi / 25.4)
        elif unit == 'cm':
            # 2.54 cm per inch
            result = int(number * dpi / 2.54)
        return result

    @staticmethod
    def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary
        @param d: Maps CSS property names to their (string) values.
        @param dpi: Resolution passed on to L{unit_convert}.
        @param fonts: Not used by this method.
        @param font_delta: Every computed font size is shifted by
                           C{int(font_delta * 20)}.
        @param memory: Optional list of property names that have already been
                       reported as unhandled. Names found in it are not
                       reported again; newly reported names are appended.
        @return: A dictionary that always contains C{'fontfacename'} as a
                 C{(family, key)} tuple and, depending on the input,
                 C{'fontsize'} (a string) and C{'wordspace'}.
        """
        def font_weight(val):
            # A numeric weight wins over the keywords
            ans = 0
            m = re.search("([0-9]+)", val)
            if m:
                ans = int(m.group(1))
            elif val.find("bold") >= 0 or val.find("strong") >= 0:
                ans = 700
            return 'bold' if ans >= 700 else 'normal'

        def font_style(val):
            if 'italic' in val or 'oblique' in val:
                return 'italic'
            return 'normal'

        def font_family(val):
            # Map a family name onto 'mono', 'sans' or (the default) 'serif'
            ans = 'serif'
            if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
                ans = 'mono'
            elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
                     val.find("trebuchet"), val.find("sans")) >= 0:
                ans = 'sans'
            return ans

        def font_key(family, style, weight):
            # Name of the font variant: 'normal', 'italic', 'bold' or 'bi'
            key = 'normal'
            if style == 'italic' and weight == 'normal':
                key = 'italic'
            elif style == 'normal' and weight == 'bold':
                key = 'bold'
            elif style == 'italic' and weight == 'bold':
                key = 'bi'
            return key

        def font_size(val):
            # Returns the font size as a string, or None if val is not a size
            ans = None
            unit = Span.unit_convert(val, dpi, 14)
            if unit:
                # Assume a 10 pt font (14 pixels) has fontsize 100
                ans = int(unit * (72./dpi) * 10)
            else:
                # The longer keywords are tested first because the shorter
                # ones are substrings of them
                if "xx-small" in val:
                    ans = 40
                elif "x-small" in val:
                    ans = 60
                elif "small" in val:
                    ans = 80
                elif "xx-large" in val:
                    ans = 180
                elif "x-large" in val:
                    ans = 140
                elif "large" in val:
                    ans = 120
            if ans is not None:
                ans += int(font_delta * 20)
                ans = str(ans)
            return ans

        t = dict()
        family, weight, style = 'serif', 'normal', 'normal'
        for key, raw in d.items():
            val = raw.lower()
            if key == 'font':
                # Shorthand property: probe every token for each aspect and
                # stop at the first token that yields a non-default value
                vals = val.split()
                for token in vals:
                    family = font_family(token)
                    if family != 'serif':
                        break
                for token in vals:
                    weight = font_weight(token)
                    if weight != 'normal':
                        break
                for token in vals:
                    style = font_style(token)
                    if style != 'normal':
                        break
                for token in vals:
                    sz = font_size(token)
                    if sz:
                        t['fontsize'] = sz
                        break
            elif key in ['font-family', 'font-name']:
                family = font_family(val)
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                weight = font_weight(val)
            elif key == 'font-style':
                style = font_style(val)
            else:
                # Report an unknown property, but only once per memory list
                report = True
                if memory is not None:
                    if key in memory:
                        report = False
                    else:
                        memory.append(key)
                if report:
                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, raw
        t['fontfacename'] = (family, font_key(family, style, weight))
        if 'fontsize' in t and int(t['fontsize']) > 120:
            t['wordspace'] = 50
        return t

    def __init__(self, ns, css, memory, dpi, fonts, font_delta=0):
        """
        @param ns: The text of the span, either a string or an object with a
                   C{string} attribute holding it.
        @param css: CSS properties, see L{translate_attrs}.
        @param memory: Passed to L{translate_attrs}.
        @param dpi: Passed to L{translate_attrs}.
        @param fonts: Indexed as C{fonts[family][key][1]} to obtain a font
                      face name; C{fonts[family]['normal']} and
                      C{fonts['serif']['normal']} must exist.
        @param font_delta: Passed to L{translate_attrs}.
        @raise ConversionError: If no text is left after normalisation.
        """
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
        attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
        family, key = attrs['fontfacename']
        if fonts[family].has_key(key):
            attrs['fontfacename'] = fonts[family][key][1]
        else:
            # No face registered for this variant: use the normal face and
            # request the weight / italics through span attributes instead
            attrs['fontfacename'] = fonts[family]['normal'][1]
            if key in ['bold', 'bi']:
                attrs['fontweight'] = 700
            if key in ['italic', 'bi']:
                src = Italic(src)
        if 'fontsize' in attrs:
            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        # The normal serif face is not passed on explicitly
        if attrs['fontfacename'] == fonts['serif']['normal'][1]:
            attrs.pop('fontfacename')
        _Span.__init__(self, text=src, **attrs)
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix elements
MARKUP_MASSAGE = [(re.compile(' '), lambda match : ' '), # Convert into a normal space as the default conversion converts it into \xa0 which is not a space in LRF
(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close tags
lambda match: match.group(1)+">"),
# Strip comments from