## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net ## This work is based on htmlbbeb created by esperanc. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ import os, re, sys, copy, glob, logging from htmlentitydefs import name2codepoint from urllib import unquote from urlparse import urlparse from math import ceil, floor try: from PIL import Image as PILImage except ImportError: import Image as PILImage from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ NavigableString, Declaration, ProcessingInstruction from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, properties_different from libprs500.ebooks.lrf.pylrs.pylrs import Span from libprs500.ebooks.lrf import Book from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__ from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ebooks.metadata.opf import OPFReader class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo'] patterns = [ re.compile('&'+i+';') for i in replaced_entities ] targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] ENTITY_RULES = zip(patterns, targets) + [(re.compile('''), "'")] MARKUP_MASSAGE = [ # Close tags (re.compile("(|", re.IGNORECASE), lambda match: match.group(1)+">"), # Strip comments from