## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from math import ceil, floor
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError, Sup, Sub, properties_different
from libprs500.ebooks.lrf.pylrs.pylrs import Span
from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo']
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
ENTITY_RULES = zip(patterns, targets) + [(re.compile('''), "'")]
MARKUP_MASSAGE = [
# Close tags
(re.compile("(|", re.IGNORECASE),
lambda match: match.group(1)+">"),
# Strip comments from