calibre/src/libprs500/lrf/html/convert_from.py

##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
##    This work is based on htmlbbeb created by esperanc.
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
"""
import os, re, sys
from htmlentitydefs import name2codepoint
from optparse import OptionParser

from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, CR, Italic
from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError

class Span(_Span):
    """
    A span of text built from an HTML text node and the CSS properties
    that apply to it. The CSS is translated into approximate Xylog
    attributes before being handed to the underlying pylrs C{Span}.
    """
    # Entities that are replaced by their unicode characters in the text
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    rules = zip(patterns, targets)

    @staticmethod
    def unit_convert(val, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels. C{ref} contains
        the reference value for relative units. Returns the number of pixels
        (an int) if successful. Otherwise, returns None.
        Assumes: 1 pixel is 1/4 mm. One em is 10pts
        """
        m = re.match(r"\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
        if m is None:
            # No recognised unit: accept a bare integer as a pixel count
            try:
                return int(val)
            except ValueError:
                return None
        try:
            # The numeric group can match '', '.' or '--5', none of which
            # float() accepts; report those as unconvertible.
            unit = float(m.group(1))
        except ValueError:
            return None
        kind = m.group(2)
        if kind == '%':
            return int(unit/100.0*ref)
        if kind == 'px':
            return int(unit)
        if kind == 'in':
            return int(unit * 25.4 * 4)
        if kind == 'pt':
            return int(unit * 25.4 * 4 / 72)
        if kind == 'em':
            return int(unit * 25.4 * 4 / 72 * 10)
        if kind == 'pc':
            return int(unit * 25.4 * 4 / 72 * 12)
        if kind == 'mm':
            return int(unit * 4)
        # Only 'cm' remains
        return int(unit * 10 * 4)

    @staticmethod
    def _margin_size(val):
        """
        Convert one CSS margin length into the doubled pixel value used for
        the skip/margin attributes. Returns None when C{val} cannot be
        converted (for example C{auto}).
        """
        pixels = Span.unit_convert(val, 200)
        if pixels is None:
            return None
        return pixels * 2

    @staticmethod
    def translate_attrs(d):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary. C{d} itself is
        not modified. Values that are not strings are copied unchanged.
        """
        t = dict()
        for key, raw in d.items():
            try:
                val = raw.lower()
            except AttributeError:
                # Not a string, so it cannot be interpreted as CSS text
                t[key] = raw
                continue
            if key == "font-family":
                if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
                    t["fontfacename"] = "Courier10 BT Roman"
                elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
                         val.find("trebuchet"), val.find("sans")) >= 0:
                    t["fontfacename"] = "Swis721 BT Roman"
                else:
                    t["fontfacename"] = "Dutch801 Rm BT Roman"
            elif key == "font-size":
                unit = Span.unit_convert(val, 14)
                if unit is not None:
                    # Assume a 10 pt font (14 pixels) has fontsize 100
                    t["fontsize"] = str(int (unit / 14.0 * 100))
                else:
                    # Longer keywords are tested first because the shorter
                    # ones are substrings of them
                    if val.find("xx-small") >= 0:
                        t["fontsize"] = "40"
                    elif val.find("x-small") >= 0:
                        t["fontsize"] = "60"
                    elif val.find("small") >= 0:
                        t["fontsize"] = "80"
                    elif val.find("xx-large") >= 0:
                        t["fontsize"] = "180"
                    elif val.find("x-large") >= 0:
                        t["fontsize"] = "140"
                    elif val.find("large") >= 0:
                        t["fontsize"] = "120"
                    else:
                        t["fontsize"] = "100"
            elif key == "font-weight":
                m = re.match (r"\s*([0-9]+)", val)
                if m is not None:
                    t["fontweight"] = str(int(m.group(1)))
                else:
                    if val.find("bold") >= 0 or val.find("strong") >= 0:
                        t["fontweight"] = "1000"
                    else:
                        t["fontweight"] = "400"
            elif key.startswith("margin"):
                # u holds [top, right, bottom, left]; None means "not set"
                if key == "margin":
                    # split() copes with repeated whitespace and empty values
                    u = [Span._margin_size(x) for x in val.split()]
                    if len(u)==0:
                        u = [None, None, None, None]
                    elif len(u)==1:
                        u = [u[0], u[0], u[0], u[0]]
                    elif len(u)==2:
                        u = [u[0], u[1], u[0], u[1]]
                    elif len(u)==3:
                        u = [u[0], u[1], u[2], u[1]]
                elif key == "margin-top":
                    u = [Span._margin_size(val), None, None, None]
                elif key == "margin-right":
                    u = [None, Span._margin_size(val), None, None]
                elif key == "margin-bottom":
                    u = [None, None, Span._margin_size(val), None]
                else:
                    u = [None, None, None, Span._margin_size(val)]
                # The left margin (u[3]) has no attribute it is mapped to
                if u[2] is not None:
                    t["parskip"] = str(u[2])
                    t["footskip"] = str(u[2])
                if u[0] is not None:
                    t["topskip"] = str(u[0])
                if u[1] is not None:
                    t["sidemargin"] = str(u[1])
            elif key == "text-align" or key == "align":
                if val in ["right", "foot"]:
                    t["align"] = "foot"
                elif val == "center":
                    t["align"] = "center"
                else:
                    t["align"] = "head"
            else:
                t[key] = raw
        return t

    def __init__(self, ns, css):
        """
        @param ns: The text node; its C{string} attribute supplies the text.
        @param css: Dictionary of CSS properties to apply. It is not modified.
        @raise ConversionError: If the text is empty.
        """
        src = ns.string
        # A line break is ordinary whitespace in HTML; deleting it outright
        # would glue together the words on either side of it
        src = re.sub('[\n\r]+', ' ', src)
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string')
        # Work on a copy: the caller reuses the same dictionary for sibling
        # nodes, so removing font-style from it would drop italics for every
        # node after the first
        css = dict(css)
        fs = css.pop('font-style', None)
        if fs is not None and fs.lower() == 'italic':
            src = Italic(src)
        attrs = Span.translate_attrs(css)
        _Span.__init__(self, text=src, **attrs)


class HTMLConvertor(object):
    """
    Walks a parsed HTML document and fills a BBeB book with pages, text
    blocks and paragraphs. The conversion runs as soon as the object is
    created; call L{writeto} afterwards to save the result.
    """
    selector_pat = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    # Defaults for various formatting tags
    css = dict(
            h1     = {"font-size":"xx-large", "font-weight":"bold"},
            h2     = {"font-size":"x-large", "font-weight":"bold"},
            h3     = {"font-size":"large", "font-weight":"bold"},
            h4     = {"font-size":"large"},
            h5     = {"font-weight":"bold"},
            b      = {"font-weight":"bold"},
            strong = {"font-weight":"bold"},
            i      = {"font-style":"italic"},
            em     = {"font-style":"italic"},
            )

    def __init__(self, book, soup, verbose=False):
        """
        @param book: The Book object the converted content is appended to.
        @param soup: The parsed HTML document.
        @param verbose: If True, conversion errors for individual text
                        nodes are reported on stderr.
        """
        self.book = book #: The Book object representing a BBeB book
        self.soup = soup #: Parsed HTML soup
        self.verbose = verbose
        self.current_page = None
        self.current_block = None
        self.current_para = None
        self.current_style = {}
        root = self.soup.html
        if root is None:
            # No <html> element (e.g. a fragment): convert the whole soup
            root = self.soup
        self.parse_file(root)

    def parse_css(self, style):
        """
        Parse the contents of a <style> tag or .css file.
        @param style: A string containing the CSS to parse.
        @return: A dictionary with one entry per selector where the key is the
        selector name and the value is a dictionary of properties
        """
        sdict = dict()
        for selectors, body in re.findall(HTMLConvertor.selector_pat, style):
            for key in selectors.split(','):
                key = key.strip().lower()
                if not key:
                    # A stray comma ("h1, {...}") must not create a '' selector
                    continue
                # Parsed afresh for every selector so no two share a dict
                val = self.parse_style_properties(body)
                if key in sdict:
                    sdict[key].update(val)
                else:
                    sdict[key] = val
        return sdict

    def parse_style_properties(self, props):
        """
        Parses a style attribute. The code within a CSS selector block or in
        the style attribute of an HTML element.
        @return: A dictionary with one entry for each property where the key
                 is the property name and the value is the property value.
        """
        prop = dict()
        for s in props.split(';'):
            l = s.split(':',1)
            # Declarations without a colon are silently ignored
            if len(l)==2:
                key = str(l[0].strip()).lower()
                val = l[1].strip()
                prop [key] = val
        return prop

    def tag_css(self, tag, parent_css=None):
        """
        Return a dictionary of style properties applicable to Tag tag.
        Properties are applied from lowest to highest priority: font
        properties inherited from C{parent_css}, the align attribute, the
        defaults for the tag name, class rules and the style attribute.
        """
        def merge_parent_css(prop, pcss):
            # Only font properties are inherited from the parent
            temp = {}
            for key in pcss.keys():
                if key.lower().startswith('font'):
                    temp[key] = pcss[key]
            prop.update(temp)

        prop = dict()
        # Inherited values go in first so that the tag's own rules win
        if parent_css:
            merge_parent_css(prop, parent_css)
        if tag.has_key("align"):
            prop["text-align"] = tag["align"]
        if tag.name in self.css:
            prop.update(self.css[tag.name])
        if tag.has_key("class"):
            cls = tag["class"].lower()
            for classname in ["."+cls, tag.name+"."+cls]:
                if classname in self.css:
                    prop.update(self.css[classname])
        if tag.has_key("style"):
            prop.update(self.parse_style_properties(tag["style"]))
        return prop

    def parse_file(self, html):
        """
        Convert the tree rooted at C{html}, starting on a fresh page, and
        append the resulting page to the book.
        """
        if self.current_page:
            self.book.append(self.current_page)
        self.current_page = Page()
        self.current_block = TextBlock()
        self.current_para = Paragraph()
        self.parse_tag(html, {})
        if self.current_para:
            self.current_block.append(self.current_para)
        if self.current_block:
            self.current_page.append(self.current_block)
        if self.current_page:
            self.book.append(self.current_page)

    def end_page(self):
        """
        Close the current paragraph, text block and page, append the page
        to the book and start new empty ones.
        """
        self.current_block.append(self.current_para)
        self.current_para = Paragraph()
        self.current_page.append(self.current_block)
        self.current_block = TextBlock()
        self.book.append(self.current_page)
        self.current_page = Page()


    def parse_tag(self, tag, parent_css):
        """
        Convert C{tag} and, recursively, its children.
        @param tag: A Tag or a text node.
        @param parent_css: Style properties of the enclosing element.
        """
        def sanctify_css(css):
            """ Return a copy of css that is safe for use in a SPAN Xylog tag """
            unsafe = ['color', 'display', 'text-decoration', 'letter-spacing']
            safe = {}
            for key in css.keys():
                test = key.lower()
                if test.startswith('margin') or 'indent' in test or \
                   'padding' in test or 'border' in test or test in unsafe:
                    continue
                safe[key] = css[key]
            # A copy is returned because css is shared with sibling nodes
            return safe

        def add_text(tag, css):
            try:
                self.current_para.append(Span(tag, sanctify_css(css)))
            except ConversionError:
                if self.verbose:
                    # sys.exc_info() is used because it is spelled the same
                    # way on every Python version
                    sys.stderr.write('%s\n' % (sys.exc_info()[1],))


        def process_text_tag(tag, pcss):
            if 'page-break-before' in pcss:
                if pcss.pop('page-break-before').lower() != 'avoid':
                    self.end_page()
            end_page = False
            if 'page-break-after' in pcss:
                # 'avoid' must not force a break, as for page-break-before
                end_page = pcss.pop('page-break-after').lower() != 'avoid'
            for c in tag.contents:
                # Comment is tested first: it is a kind of NavigableString
                if isinstance(c, Comment):
                    continue
                elif isinstance(c, NavigableString):
                    add_text(c, pcss)
                else:
                    self.parse_tag(c, pcss)
            if end_page:
                self.end_page()

        try:
            tagname = tag.name.lower()
        except AttributeError:
            # No name attribute: treat it as a text node
            add_text(tag, parent_css)
            return
        if tagname in ["title", "script", "meta"]:
            pass
        elif tagname in ['style', 'link']:
            # TODO: Append CSS to self.css
            pass
        elif tagname == 'p':
            css = self.tag_css(tag, parent_css=parent_css)
            indent = css.pop('text-indent', '')
            if indent:
                # TODO: If indent is different from current textblock's parindent
                # start a new TextBlock
                pass
            self.current_para.CR() # Put a paragraph end
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            process_text_tag(tag, css)
        elif tagname in ['b', 'strong', 'i', 'em', 'span']:
            css = self.tag_css(tag, parent_css=parent_css)
            process_text_tag(tag, css)
        elif tagname == 'font':
            pass
        elif tagname == 'br':
            self.current_para.append(CR())
        elif tagname == 'hr':
            # A rule starts a new page; end_page() nests paragraph, block
            # and page in the right order and adds the page to the book
            self.end_page()
        else:
            css = self.tag_css(tag, parent_css=parent_css)
            for c in tag.contents:
                if isinstance(c, Comment):
                    continue
                elif isinstance(c, Tag):
                    self.parse_tag(c, css)
                elif isinstance(c, NavigableString):
                    add_text(c, css)

    def writeto(self, path):
        """
        Write the book to C{path}: as LRS if the name ends in C{lrs}
        (case-insensitive), as LRF otherwise.
        """
        if path.lower().endswith('lrs'):
            self.book.renderLrs(path)
        else:
            self.book.renderLrf(path)


def process_file(path, options):
    """
    Convert the HTML file at C{path} to an LRF file. The output is named
    after the input file with its extension replaced by C{.lrf} and is
    written to the directory that was current when the function was called.
    @param path: Path to the HTML file.
    @param options: Object with C{title} and C{author} attributes.
    """
    cwd = os.getcwd()
    try:
        path = os.path.abspath(path)
        # The conversion runs from the source file's directory
        os.chdir(os.path.dirname(path))
        # try/finally instead of a with statement so that the file is
        # closed on old interpreters as well
        src = open(path, 'r')
        try:
            raw = src.read()
        finally:
            src.close()
        soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
        book = Book(title=options.title, author=options.author, \
                    sourceencoding='utf8')
        conv = HTMLConvertor(book, soup)
        name = os.path.splitext(os.path.basename(path))[0]+'.lrf'
        # Go back before writing so the output lands in the original directory
        os.chdir(cwd)
        conv.writeto(name)
    finally:
        # Restore the working directory even if the conversion failed
        os.chdir(cwd)

def main():
    """
    CLI for html -> lrf conversions. Expects exactly one positional
    argument, the file to convert; otherwise prints the help text and
    exits with status 1. The title defaults to the input file's base name.
    """
    parser = OptionParser(usage=\
        """usage: %prog [options] mybook.html

        %prog converts mybook.html to mybook.lrf
        """\
        )
    parser.add_option("-t", "--title", action="store", type="string", \
                    dest="title", help="Set the title")
    parser.add_option("-a", "--author", action="store", type="string", \
                    dest="author", help="Set the author", default='Unknown')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        sys.exit(1)
    src = args[0]
    if options.title is None:
        options.title = os.path.splitext(os.path.basename(src))[0]
    process_file(src, options)


# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()