Added support for <img>, <link> and <style> tags to html2lrf

This commit is contained in:
Kovid Goyal 2007-04-19 03:13:02 +00:00
parent d69fad53f4
commit 22872ee668

View File

@ -20,14 +20,16 @@ Code to convert HTML ebooks into LRF ebooks.
import os, re, sys import os, re, sys
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from optparse import OptionParser from optparse import OptionParser
from urllib import urlopen
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, CR, Italic from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, \
CR, Italic, ImageStream, ImageBlock
from libprs500.lrf.pylrs.pylrs import Span as _Span from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError from libprs500.lrf import ConversionError
class Span(_Span): class Span(_Span):
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
patterns = [ re.compile('&'+i+';') for i in replaced_entities ] patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
rules = zip(patterns, targets) rules = zip(patterns, targets)
@ -283,8 +285,8 @@ class HTMLConvertor(object):
for key in css.keys(): for key in css.keys():
test = key.lower() test = key.lower()
if test.startswith('margin') or 'indent' in test or \ if test.startswith('margin') or 'indent' in test or \
'padding' in test or 'border' in test or test in \ 'padding' in test or 'border' in test or 'page-break' in test \
['color', 'display', 'text-decoration', 'letter-spacing']: or test in ['color', 'display', 'text-decoration', 'letter-spacing']:
css.pop(key) css.pop(key)
return css return css
@ -321,9 +323,29 @@ class HTMLConvertor(object):
return return
if tagname in ["title", "script", "meta"]: if tagname in ["title", "script", "meta"]:
pass pass
elif tagname == 'img':
if tag.has_key('src'):
if os.access(tag['src'], os.R_OK):
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
ib = ImageBlock(ImageStream(tag['src']))
self.current_page.append(ib)
self.current_block = TextBlock()
self.current_para = Paragraph()
elif tagname in ['style', 'link']: elif tagname in ['style', 'link']:
# TODO: Append CSS to self.css if tagname == 'style':
pass for c in tag.contents:
if isinstance(c,NavigableString):
self.css.update(self.parse_css(str(c)))
elif tag.has_key('type') and tag['type'] == "text/css" \
and tag.has_key('href'):
url = tag['href']
if url.startswith('http://'):
f = urlopen(url)
else:
f = open(url)
self.css.update(f.read())
f.close()
elif tagname == 'p': elif tagname == 'p':
css = self.tag_css(tag, parent_css=parent_css) css = self.tag_css(tag, parent_css=parent_css)
indent = css.pop('text-indent', '') indent = css.pop('text-indent', '')