calibre/src/libprs500/lrf/html/convert_from.py

410 lines
16 KiB
Python

## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
"""
import os, re, sys
from htmlentitydefs import name2codepoint
from optparse import OptionParser
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, CR, Italic
from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError
class Span(_Span):
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
rules = zip(patterns, targets)
@staticmethod
def unit_convert(val, ref=80):
"""
Tries to convert html units stored in C{val} to pixels. C{ref} contains
the reference value for relative units. Returns the number of pixels
(an int) if successful. Otherwise, returns None.
Assumes: 1 pixel is 1/4 mm. One em is 10pts
"""
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
if m is not None:
unit = float(m.group(1))
if m.group(2) == '%':
result = int(unit/100.0*ref)
elif m.group(2) == 'px':
result = int(unit)
elif m.group(2) == 'in':
result = int(unit * 25.4 * 4)
elif m.group(2) == 'pt':
result = int(unit * 25.4 * 4 / 72)
elif m.group(2)== 'em':
result = int(unit * 25.4 * 4 / 72 * 10)
elif m.group(2)== 'pc':
result = int(unit * 25.4 * 4 / 72 * 12)
elif m.group(2)== 'mm':
result = int(unit * 4)
elif m.group(2)== 'cm':
result = int(unit * 10 * 4)
else:
try:
result = int(val)
except ValueError:
return None
return result
@staticmethod
def translate_attrs(d):
"""
Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary
"""
t = dict()
for key in d.keys():
try:
val = d[key].lower()
except IndexError:
val = None
if key == "font-family":
if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
t["fontfacename"] = "Courier10 BT Roman"
elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
val.find("trebuchet"), val.find("sans")) >= 0:
t["fontfacename"] = "Swis721 BT Roman"
else:
t["fontfacename"] = "Dutch801 Rm BT Roman"
elif key == "font-size":
unit = Span.unit_convert(val, 14)
if unit is not None:
# Assume a 10 pt font (14 pixels) has fontsize 100
t["fontsize"] = str(int (unit / 14.0 * 100))
else:
if val.find("xx-small") >= 0:
t["fontsize"] = "40"
elif val.find("x-small") >= 0:
t["fontsize"] = "60"
elif val.find("small") >= 0:
t["fontsize"] = "80"
elif val.find("xx-large") >= 0:
t["fontsize"] = "180"
elif val.find("x-large") >= 0:
t["fontsize"] = "140"
elif val.find("large") >= 0:
t["fontsize"] = "120"
else:
t["fontsize"] = "100"
elif key == "font-weight":
m = re.match ("\s*([0-9]+)", val)
if m is not None:
#report (m.group(1))
t["fontweight"] = str(int(int(m.group(1))))
else:
if val.find("bold") >= 0 or val.find("strong") >= 0:
t["fontweight"] = "1000"
else:
t["fontweight"] = "400"
elif key.startswith("margin"):
if key == "margin":
u = []
for x in val.split(" "):
u.append(Span.unit_convert (x,200)*2)
if len(u)==1:
u = [u[0], u[0], u[0], u[0]]
elif len(u)==2:
u = [u[0], u[1], u[0], u[1]]
elif len(u)==3:
u = [u[0], u[1], u[2], u[1]]
elif key == "margin-top":
u = [Span.unit_convert(val, 200)*2, None, None, None]
elif key == "margin-right":
u = [None, Span.unit_convert(val, 200)*2, None, None]
elif key == "margin-bottom":
u = [None, None, Span.unit_convert(val, 200)*2, None]
else:
u = [None, None, None, Span.unit_convert(val, 200)*2]
if u[2] is not None:
t["parskip"] = str(u[2])
t["footskip"] = str(u[2])
if u[0] is not None:
t["topskip"] = str(u[0])
if u[1] is not None:
t["sidemargin"] = str(u[1])
elif key == "text-align" or key == "align":
if val in ["right", "foot"]:
t["align"] = "foot"
elif val == "center":
t["align"] = "center"
else:
t["align"] = "head"
else:
t[key] = d[key]
return t
def __init__(self, ns, css):
src = ns.string
src = re.sub('[\n\r]+', '', src)
for pat, repl in Span.rules:
src = pat.sub(repl, src)
if not src:
raise ConversionError('No point in adding an empty string')
if 'font-style' in css.keys():
fs = css.pop('font-style')
if fs.lower() == 'italic':
src = Italic(src)
attrs = Span.translate_attrs(css)
_Span.__init__(self, text=src, **attrs)
class HTMLConvertor(object):
selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
# Defaults for various formatting tags
css = dict(
h1 = {"font-size":"xx-large", "font-weight":"bold"},
h2 = {"font-size":"x-large", "font-weight":"bold"},
h3 = {"font-size":"large", "font-weight":"bold"},
h4 = {"font-size":"large"},
h5 = {"font-weight":"bold"},
b = {"font-weight":"bold"},
strong = {"font-weight":"bold"},
i = {"font-style":"italic"},
em = {"font-style":"italic"},
)
def __init__(self, book, soup, verbose=False):
self.book = book #: The Book object representing a BBeB book
self.soup = soup #: Parsed HTML soup
self.verbose = verbose
self.current_page = None
self.current_para = None
self.current_style = {}
self.parse_file(self.soup.html)
def parse_css(self, style):
"""
Parse the contents of a <style> tag or .css file.
@param style: C{str(style)} should be the CSS to parse.
@return: A dictionary with one entry per selector where the key is the
selector name and the value is a dictionary of properties
"""
sdict = dict()
for sel in re.findall(HTMLConvertor.selector_pat, style):
for key in sel[0].split(','):
key = key.strip().lower()
val = self.parse_style_properties(sel[1])
if key in sdict:
sdict[key].update(val)
else:
sdict[key] = val
return sdict
def parse_style_properties(self, props):
"""
Parses a style attribute. The code within a CSS selector block or in
the style attribute of an HTML element.
@return: A dictionary with one entry for each property where the key
is the property name and the value is the property value.
"""
prop = dict()
for s in props.split(';'):
l = s.split(':',1)
if len(l)==2:
key = str(l[0].strip()).lower()
val = l[1].strip()
prop [key] = val
return prop
def tag_css(self, tag, parent_css={}):
"""
Return a dictionary of style properties applicable to Tag tag.
"""
def merge_parent_css(prop, pcss):
temp = {}
for key in pcss.keys():
if key.lower().startswith('font'):
temp[key] = pcss[key]
prop.update(temp)
prop = dict()
if tag.has_key("align"):
prop["text-align"] = tag["align"]
if self.css.has_key(tag.name):
prop.update(self.css[tag.name])
if tag.has_key("class"):
cls = tag["class"].lower()
for classname in ["."+cls, tag.name+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
if parent_css:
merge_parent_css(prop, parent_css)
if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"]))
return prop
def parse_file(self, html):
if self.current_page:
self.book.append(self.current_page)
self.current_page = Page()
self.current_block = TextBlock()
self.current_para = Paragraph()
self.parse_tag(html, {})
if self.current_para:
self.current_block.append(self.current_para)
if self.current_block:
self.current_page.append(self.current_block)
if self.current_page:
self.book.append(self.current_page)
def end_page(self):
self.current_block.append(self.current_para)
self.current_para = Paragraph()
self.current_page.append(self.current_block)
self.current_block = TextBlock()
self.book.append(self.current_page)
self.current_page = Page()
def parse_tag(self, tag, parent_css):
def sanctify_css(css):
""" Make css safe for use in a SPAM Xylog tag """
for key in css.keys():
test = key.lower()
if test.startswith('margin') or 'indent' in test or \
'padding' in test or 'border' in test or test in \
['color', 'display', 'text-decoration', 'letter-spacing']:
css.pop(key)
return css
def add_text(tag, css):
try:
self.current_para.append(Span(tag, sanctify_css(css)))
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err
def process_text_tag(tag, pcss):
if 'page-break-before' in pcss.keys():
if pcss['page-break-before'].lower() != 'avoid':
self.end_page()
pcss.pop('page-break-before')
end_page = False
if 'page-break-after' in pcss.keys():
end_page = True
pcss.pop('page-break-after')
for c in tag.contents:
if isinstance(tag, NavigableString):
add_text(tag, pcss)
else:
self.parse_tag(c, pcss)
if end_page:
self.end_page()
try:
tagname = tag.name.lower()
except AttributeError:
add_text(tag, parent_css)
return
if tagname in ["title", "script", "meta"]:
pass
elif tagname in ['style', 'link']:
# TODO: Append CSS to self.css
pass
elif tagname == 'p':
css = self.tag_css(tag, parent_css=parent_css)
indent = css.pop('text-indent', '')
if indent:
# TODO: If indent is different from current textblock's parindent
# start a new TextBlock
pass
self.current_para.CR() # Put a paragraph end
self.current_block.append(self.current_para)
self.current_para = Paragraph()
process_text_tag(tag, css)
elif tagname in ['b', 'strong', 'i', 'em', 'span']:
css = self.tag_css(tag, parent_css=parent_css)
process_text_tag(tag, css)
elif tagname == 'font':
pass
elif tagname == 'link':
pass
elif tagname == 'style':
pass
elif tagname == 'br':
self.current_para.append(CR())
elif tagname == 'hr':
self.current_page.append(self.current_para)
self.current_block.append(self.current_page)
self.current_para = Paragraph()
self.current_page = Page()
else:
css = self.tag_css(tag, parent_css=parent_css)
for c in tag.contents:
if isinstance(c, Comment):
continue
elif isinstance(c, Tag):
self.parse_tag(c, css)
elif isinstance(c, NavigableString):
add_text(c, css)
def writeto(self, path):
if path.lower().endswith('lrs'):
self.book.renderLrs(path)
else:
self.book.renderLrf(path)
def process_file(path, options):
cwd = os.getcwd()
try:
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
soup = BeautifulSoup(open(path, 'r').read(), \
convertEntities=BeautifulSoup.HTML_ENTITIES)
book = Book(title=options.title, author=options.author, \
sourceencoding='utf8')
conv = HTMLConvertor(book, soup)
name = os.path.splitext(os.path.basename(path))[0]+'.lrf'
os.chdir(cwd)
conv.writeto(name)
finally:
os.chdir(cwd)
def main():
""" CLI for html -> lrf conversions """
parser = OptionParser(usage=\
"""usage: %prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf
"""\
)
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
options, args = parser.parse_args()
if len(args) != 1:
parser.print_help()
sys.exit(1)
src = args[0]
if options.title == None:
options.title = os.path.splitext(os.path.basename(src))[0]
process_file(src, options)
if __name__ == '__main__':
main()