## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion routines
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
    """
    A pylrs C{Span} built from a piece of HTML text and the CSS properties
    that apply to it. The CSS is translated into approximate Xylog
    attributes by L{translate_attrs} before being handed to the base class.
    """
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    # Kept for backward compatibility with code that reads Span.rules.
    rules    = list(zip(patterns, targets))
    # Single-pass entity replacement. Applying the patterns in C{rules} one
    # after another decodes '&amp;lt;' twice (first to '&lt;', then to '<').
    entity_pat = re.compile('&(' + '|'.join(replaced_entities) + ');')
    entity_map = dict(zip(replaced_entities, targets))
    # A number (at least one digit, at most one sign) followed by a CSS unit.
    # Requiring a digit keeps keywords that merely start with a unit name
    # (e.g. 'inherit', 'initial') from matching with an empty number.
    unit_pat = re.compile(
        r"\s*([-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+))\s*(%|em|px|mm|cm|in|pt|pc)")

    @staticmethod
    def unit_convert(val, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels. C{ref} contains
        the reference value for relative units. Returns the number of pixels
        (an int) if successful. Otherwise, returns None.
        Assumes: 1 pixel is 1/4 mm. One em is 10pts

        @param val: A string such as C{'12px'}, C{'1.5em'} or C{'80%'}.
        @param ref: The value that C{100%} corresponds to.
        @return: The length in pixels truncated to an int, or None if C{val}
                 does not start with a number followed by a known unit.
        """
        m = Span.unit_pat.match(val)
        if m is None:
            return None
        number = float(m.group(1))
        unit = m.group(2)
        # The arithmetic below is deliberately left in its original form so
        # that truncation to int gives the same results as before.
        if unit == '%':
            return int(number/100.0*ref)
        if unit == 'px':
            return int(number)
        if unit == 'in':
            return int(number * 25.4 * 4)
        if unit == 'pt':
            return int(number * 25.4 * 4 / 72)
        if unit == 'em':
            return int(number * 25.4 * 4 / 72 * 10)
        if unit == 'pc':
            return int(number * 25.4 * 4 / 72 * 12)
        if unit == 'mm':
            return int(number * 4)
        # The pattern only admits the units handled above plus 'cm'.
        return int(number * 10 * 4)

    @staticmethod
    def translate_attrs(d, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary

        @param d: Maps CSS property names to their (string) values.
        @param font_delta: Added to every computed font size in steps of 20.
        @param memory: Optional list of property names that have already been
                       reported as unhandled. Unknown keys are appended to it
                       so that each one is reported on stderr only once. If
                       None, every unhandled key is reported.
        @return: A new dictionary that may contain the keys C{fontfacename},
                 C{fontsize}, C{fontweight} and C{wordspace}, all with string
                 values.
        """
        def font_weight(val):
            """First run of digits in C{val}, or '1000' for bold/strong, else None."""
            m = re.search(r"([0-9]+)", val)
            if m:
                return str(int(m.group(1)))
            if "bold" in val or "strong" in val:
                return "1000"
            return None

        def font_family(val):
            """Map a CSS font family onto one of the two known typefaces, else None."""
            for name in ("courier", "mono", "fixed", "typewriter"):
                if name in val:
                    return "Courier10 BT Roman"
            for name in ("arial", "helvetica", "verdana", "trebuchet", "sans"):
                if name in val:
                    return "Swis721 BT Roman"
            return None

        def font_size(val):
            """Font size as a string (a 10pt/14px font is '100'), else None."""
            ans = None
            unit = Span.unit_convert(val, 14)
            if unit:
                # Assume a 10 pt font (14 pixels) has fontsize 100
                ans = int(unit / 14.0 * 100)
            else:
                # Order matters: 'xx-small' contains 'x-small', which in turn
                # contains 'small' (and likewise for the large keywords).
                for keyword, size in (("xx-small", 40), ("x-small", 60),
                                      ("small", 80), ("xx-large", 180),
                                      ("x-large", 140), ("large", 120)):
                    if keyword in val:
                        ans = size
                        break
            if ans is not None:
                ans += font_delta * 20
                ans = str(ans)
            return ans

        t = dict()
        for key in d.keys():
            val = d[key].lower()
            if key == 'font':
                # Shorthand property: classify each whitespace separated
                # token, visiting them last to first as before.
                for sval in reversed(val.split()):
                    ans = font_family(sval)
                    if ans:
                        t['fontfacename'] = ans
                        continue
                    ans = font_size(sval)
                    if ans:
                        t['fontsize'] = ans
                        continue
                    ans = font_weight(sval)
                    if ans:
                        t['fontweight'] = ans
            elif key in ['font-family', 'font-name']:
                ans = font_family(val)
                if ans:
                    t['fontfacename'] = ans
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                ans = font_weight(val)
                if ans:
                    t['fontweight'] = ans
                    if int(ans) > 140:
                        t['wordspace'] = '50'
            else:
                report = True
                if memory is not None:
                    if key in memory:
                        report = False
                    else:
                        memory.append(key)
                if report:
                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
        return t

    def __init__(self, ns, css, memory, font_delta=0):
        """
        @param ns: The text of the span, either a string or an object with a
                   C{string} attribute holding the text.
        @param css: Dictionary of CSS properties for this text. Note that a
                    C{font-style} entry is removed from this dictionary.
        @param memory: Passed through to L{translate_attrs}.
        @param font_delta: Passed through to L{translate_attrs}.
        @raise ConversionError: If there is no text left to add.
        """
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        src = Span.entity_pat.sub(lambda m: Span.entity_map[m.group(1)], src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
        if 'font-style' in css:
            # Popped so that translate_attrs does not see the key.
            fs = css.pop('font-style')
            if fs.lower() == 'italic':
                src = Italic(src)
        attrs = Span.translate_attrs(css, font_delta=font_delta, memory=memory)
        if 'fontsize' in attrs:
            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        _Span.__init__(self, text=src, **attrs)
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix elements
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close tags
lambda match: match.group(1)+">"),
# Strip comments from