## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net ## This work is based on htmlbbeb created by esperanc. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ import os, re, sys, copy, glob, logging, tempfile from collections import deque from urllib import unquote from urlparse import urlparse from math import ceil, floor from functools import partial try: from PIL import Image as PILImage except ImportError: import Image as PILImage from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ NavigableString, Declaration, ProcessingInstruction from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, EmpLine from libprs500.ebooks.lrf.pylrs.pylrs import Span from libprs500.ebooks.lrf import Book, entity_to_unicode from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__, fit_image from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ebooks.metadata.opf import OPFReader from libprs500.devices.interface import Device from libprs500.ebooks.lrf.html.color_map import lrs_color from libprs500.ebooks.chardet import xml_to_unicode def update_css(ncss, ocss): for key in ncss.keys(): if ocss.has_key(key): ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] if not path: path = basepath elif not os.path.isabs(path): path = os.path.join(os.path.dirname(basepath), path) return os.path.normpath(path), fragment def strip_style_comments(match): src = match.group() while True: lindex = src.find('/*') if lindex < 0: break rindex = src.find('*/', lindex) if rindex < 0: src = src[:lindex] break src = src[:lindex] + src[rindex+2:] return src class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) MARKUP_MASSAGE = [ # Close tags (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), ] # Fix Baen markup BAEN = [ (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup PDFTOHTML = [ # Remove

tags (re.compile(r'', re.IGNORECASE), lambda match: ' '), # Remove page numbers (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Remove
and replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), (re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), ] # Fix Book Designer markup BOOK_DESIGNER = [ # HR (re.compile('

', re.IGNORECASE), lambda match : ' '), # Create header tags (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), # Blank lines (re.compile('<]*?>( ){4}', re.IGNORECASE), lambda match : '

'), ] def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattr__(self, attr) def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) CSS = { 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h3' : {"font-size" : "large", "font-weight":"bold", 'text-indent':'0pt'}, 'h4' : {"font-size" : "large", 'text-indent':'0pt'}, 'h5' : {"font-weight" : "bold", 'text-indent':'0pt'}, 'b' : {"font-weight" : "bold"}, 'strong' : {"font-weight" : "bold"}, 'i' : {"font-style" : "italic"}, 'cite' : {'font-style' : 'italic'}, 'em' : {"font-style" : "italic"}, 'small' : {'font-size' : 'small'}, 'pre' : {'font-family' : 'monospace', 'white-space': 'pre' }, 'code' : {'font-family' : 'monospace' }, 'tt' : {'font-family' : 'monospace'}, 'center' : {'text-align' : 'center'}, 'th' : {'font-size' : 'large', 'font-weight':'bold'}, 'big' : {'font-size' : 'large', 'font-weight':'bold'}, '.libprs500_dropcaps' : {'font-size': 'xx-large'}, 'u' : {'text-decoration': 'underline'}, 'sup' : {'vertical-align': 'super', 'font-size': '60%'}, 'sub' : {'vertical-align': 'sub', 'font-size': '60%'}, } def __init__(self, book, fonts, options, logger, paths): ''' Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. @param book: The LRF book @type book: L{libprs500.lrf.pylrs.Book} @param fonts: dict specifying the font families to use ''' # Defaults for various formatting tags object.__setattr__(self, 'options', options) self.logger = logger self.fonts = fonts #: dict specifying font families to use # Memory self.scaled_images = {} #: Temporary files with scaled version of images self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = []#: Keep track of already used textstyles self.block_styles = []#: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: and id elements self.links = deque() #: elements self.processed_files = [] self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.image_memory = [] self.id_counter = 0 self.unused_target_blocks = [] #: Used to remove extra TextBlocks self.link_level = 0 #: Current link level self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported self.tops = {} #: element representing the top of each HTML file in the LRF file self.previous_text = '' #: Used to figure out when to lstrip self.preserve_block_style = False #: Used so that

tags in

elements are handled properly # Styles self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 self.book = book #: The Book object representing a BBeB book self.override_css = {} self.override_pcss = {} if self._override_css is not None: if os.access(self._override_css, os.R_OK): src = open(self._override_css, 'rb').read() else: src = self._override_css match = self.PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) if ncss: update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) paths = [os.path.abspath(path) for path in paths] while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: continue try: self.add_file(path) except KeyboardInterrupt: raise except: if self.link_level == 0: # Die on errors in the first level raise for link in self.links: if link['path'] == path: self.links.remove(link) break self.logger.warn('Could not process '+path) if self.verbose: self.logger.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] for text, tb in self.extra_toc_entries: ascii_text = text.encode('ascii', 'ignore') self.book.addTocEntry(ascii_text, tb) if self.base_font_size > 0: self.logger.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) def is_baen(self, soup): return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) def preprocess(self, raw): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE) if not self.book_designer and self.is_book_designer(raw): self.book_designer = True self.logger.info('\tBook Designer file detected.') self.logger.info('\tParsing HTML...') if self.baen: nmassage.extend(HTMLConverter.BAEN) if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: nmassage.extend(HTMLConverter.BOOK_DESIGNER) try: soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) except ConversionError, err: if 'Failed to coerce to unicode' in str(err): raw = unicode(raw, 'utf8', 'replace') soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.XHTML_ENTITIES, markupMassage=nmassage) if not self.baen and self.is_baen(soup): self.baen = True self.logger.info('\tBaen file detected. Re-parsing...') return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') if t: self.book.set_title(self.get_text(t)) a = soup.find(id='BookAuthor') if a: self.book.set_author(self.get_text(a)) if self.verbose: tdir = tempfile.gettempdir() dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') dump.write(unicode(soup).encode('utf-8')) self.logger.info('Written preprocessed HTML to '+dump.name) dump.close() #print soup return soup def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() self.css.update(self.override_css) self.file_name = os.path.basename(path) self.logger.info('Processing %s', path if self.verbose else self.file_name) upath = path.encode('utf-8') if isinstance(path, unicode) else path if not os.path.exists(upath): upath = upath.replace('&', '%26') #convertlit replaces & with %26 in file names f = open(upath, 'rb') raw = f.read() if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files raw = raw.decode('utf-8', 'ignore') elif self.encoding is not None: raw = raw.decode(self.encoding, 'ignore') else: raw = xml_to_unicode(raw, self.verbose)[0] f.close() soup = self.preprocess(raw) self.logger.info('\tConverting to BBeB...') self.current_page = None self.current_para = None self.current_style = {} self.page_break_found = False self.target_prefix = path self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) def parse_css(self, style): """ Parse the contents of a