## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net ## This work is based on htmlbbeb created by esperanc. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ import os, re, sys, copy, glob, logging, tempfile from collections import deque from urllib import unquote from urlparse import urlparse from math import ceil, floor from functools import partial try: from PIL import Image as PILImage except ImportError: import Image as PILImage from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \ NavigableString, Declaration, ProcessingInstruction from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \ TextBlock, ImageBlock, JumpButton, CharButton, \ Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \ LrsError, Sup, Sub, EmpLine from libprs500.ebooks.lrf.pylrs.pylrs import Span from libprs500.ebooks.lrf import Book, entity_to_unicode from libprs500.ebooks.lrf import option_parser as lrf_option_parser from libprs500.ebooks import ConversionError from libprs500.ebooks.lrf.html.table import Table from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__, fit_image from libprs500.ptempfile 
import PersistentTemporaryFile from libprs500.ebooks.metadata.opf import OPFReader from libprs500.devices.interface import Device from libprs500.ebooks.lrf.html.color_map import lrs_color from libprs500.ebooks.chardet import xml_to_unicode def update_css(ncss, ocss): for key in ncss.keys(): if ocss.has_key(key): ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] def munge_paths(basepath, url): purl = urlparse(unquote(url),) path, fragment = purl[2], purl[5] if not path: path = basepath elif not os.path.isabs(path): path = os.path.join(os.path.dirname(basepath), path) return os.path.normpath(path), fragment def strip_style_comments(match): src = match.group() while True: lindex = src.find('/*') if lindex < 0: break rindex = src.find('*/', lindex) if rindex < 0: src = src[:lindex] break src = src[:lindex] + src[rindex+2:] return src class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) MARKUP_MASSAGE = [ # Close tags (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), ] # Fix Baen markup BAEN = [ (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'
\s*(\s*)\s*
', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup PDFTOHTML = [ # Remove
(re.compile(r' '),
(re.compile(r'(.*) tags in
', re.IGNORECASE),
lambda match : ' '),
# Create header tags
(re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?
', re.IGNORECASE),
lambda match : '%s
'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
lambda match : '%s
'%(match.group(1),)),
(re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL),
lambda match : '%s
'%(match.group(1),)),
# Blank lines
(re.compile(' elements are handled properly
# Styles
self.blockquote_style = book.create_block_style(sidemargin=60,
topskip=20, footskip=20)
self.unindented_style = book.create_text_style(parindent=0)
self.in_table = False
# List processing
self.list_level = 0
self.list_indent = 20
self.list_counter = 1
self.book = book #: The Book object representing a BBeB book
self.override_css = {}
self.override_pcss = {}
if self._override_css is not None:
if os.access(self._override_css, os.R_OK):
src = open(self._override_css, 'rb').read()
else:
src = self._override_css
match = self.PAGE_BREAK_PAT.search(src)
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
ncss, npcss = self.parse_css(src)
if ncss:
update_css(ncss, self.override_css)
if npcss:
update_css(npcss, self.override_pcss)
paths = [os.path.abspath(path) for path in paths]
while len(paths) > 0 and self.link_level <= self.link_levels:
for path in paths:
if path in self.processed_files:
continue
try:
self.add_file(path)
except KeyboardInterrupt:
raise
except:
if self.link_level == 0: # Die on errors in the first level
raise
for link in self.links:
if link['path'] == path:
self.links.remove(link)
break
self.logger.warn('Could not process '+path)
if self.verbose:
self.logger.exception(' ')
self.links = self.process_links()
self.link_level += 1
paths = [link['path'] for link in self.links]
for text, tb in self.extra_toc_entries:
ascii_text = text.encode('ascii', 'ignore')
self.book.addTocEntry(ascii_text, tb)
if self.base_font_size > 0:
self.logger.info('\tRationalizing font sizes...')
self.book.rationalize_font_sizes(self.base_font_size)
def is_baen(self, soup):
    """
    Return True if the parsed document declares Baen as its publisher.

    Detection is via a <meta name="Publisher"> element whose content
    attribute mentions "Baen" (case-insensitive).
    """
    publisher_pat = re.compile('Baen', re.IGNORECASE)
    # bool() is kept deliberately: BeautifulSoup 3 Tag truthiness depends
    # on the tag having contents, and callers rely on the original result.
    return bool(soup.find('meta', attrs={'name': 'Publisher',
                                         'content': publisher_pat}))
def is_book_designer(self, raw):
    """
    Return True if ``raw`` looks like HTML produced by Book Designer.

    Book Designer output marks the title with an <H2 id=BookTitle> element.
    NOTE(review): the regex literal in the original source was corrupted
    (an HTML-tag-stripping pass ate part of the pattern); this restores the
    pattern consistent with the BOOK_DESIGNER massage rules above, which
    key on the same ``id=BookTitle`` marker — confirm against upstream.
    """
    return bool(re.search('<H2[^><]*id=BookTitle', raw))
def preprocess(self, raw):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
if not self.book_designer and self.is_book_designer(raw):
self.book_designer = True
self.logger.info('\tBook Designer file detected.')
self.logger.info('\tParsing HTML...')
if self.baen:
nmassage.extend(HTMLConverter.BAEN)
if self.pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML)
if self.book_designer:
nmassage.extend(HTMLConverter.BOOK_DESIGNER)
try:
soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.XHTML_ENTITIES,
markupMassage=nmassage)
except ConversionError, err:
if 'Failed to coerce to unicode' in str(err):
raw = unicode(raw, 'utf8', 'replace')
soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.XHTML_ENTITIES,
markupMassage=nmassage)
if not self.baen and self.is_baen(soup):
self.baen = True
self.logger.info('\tBaen file detected. Re-parsing...')
return self.preprocess(raw)
if self.book_designer:
t = soup.find(id='BookTitle')
if t:
self.book.set_title(self.get_text(t))
a = soup.find(id='BookAuthor')
if a:
self.book.set_author(self.get_text(a))
if self.verbose:
tdir = tempfile.gettempdir()
dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb')
dump.write(unicode(soup).encode('utf-8'))
self.logger.info('Written preprocessed HTML to '+dump.name)
dump.close()
#print soup
return soup
def add_file(self, path):
    """
    Convert the HTML file at ``path`` to BBeB, recording the resulting
    top-level block in self.tops and marking the path as processed.

    Resets the per-file CSS (class defaults overlaid with any override
    CSS) and the per-file parser state before conversion.
    """
    self.css = HTMLConverter.CSS.copy()
    self.pseudo_css = self.override_pcss.copy()
    self.css.update(self.override_css)
    self.file_name = os.path.basename(path)
    self.logger.info('Processing %s', path if self.verbose else self.file_name)
    upath = path.encode('utf-8') if isinstance(path, unicode) else path
    if not os.path.exists(upath):
        # convertlit replaces & with %26 in file names
        upath = upath.replace('&', '%26')
    f = open(upath, 'rb')
    try:
        raw = f.read()
    finally:
        # Close immediately after reading; the original closed only after
        # decoding, leaking the handle if a decode step raised.
        f.close()
    if self.pdftohtml:
        # Bug in pdftohtml that causes it to output invalid UTF-8 files
        raw = raw.decode('utf-8', 'ignore')
    elif self.encoding is not None:
        raw = raw.decode(self.encoding, 'ignore')
    else:
        raw = xml_to_unicode(raw, self.verbose)[0]
    soup = self.preprocess(raw)
    self.logger.info('\tConverting to BBeB...')
    self.current_page = None
    self.current_para = None
    self.current_style = {}
    self.page_break_found = False
    self.target_prefix = path
    self.previous_text = '\n'
    self.tops[path] = self.parse_file(soup)
    self.processed_files.append(path)
def parse_css(self, style):
"""
Parse the contents of a