calibre/src/libprs500/ebooks/lrf/html/convert_from.py

##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
##    This work is based on htmlbbeb created by esperanc.
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.

I am indebted to esperanc for the initial CSS->Xylog Style conversion routines
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
    from PIL import Image as PILImage
except ImportError:
    import Image as PILImage

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
                Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
                Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile

class Span(_Span):
    """
    A pylrs C{Span} built from a piece of HTML text and the CSS properties
    that apply to it.

    The class attributes drive the entity replacement done in L{__init__}:
    every C{&name;} whose name is listed in C{replaced_entities} is replaced
    by the corresponding unicode character.
    """
    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
    patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
    targets  = [ unichr(name2codepoint[i]) for i in replaced_entities ]
    rules = zip(patterns, targets)


    @staticmethod
    def unit_convert(val, ref=80):
        """
        Tries to convert html units stored in C{val} to pixels. C{ref} contains
        the reference value for relative units. Returns the number of pixels
        (an int) if successful. Otherwise, returns None.
        Assumes: 1 pixel is 1/4 mm. One em is 10pts

        A value that starts with a unit name but carries no parseable number
        (for example the CSS keyword C{inherit}, which starts with C{in})
        is treated as unconvertible and yields None.
        """
        m = re.match(r"\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
        if m is None:
            return None
        try:
            unit = float(m.group(1))
        except ValueError:
            # The numeric group may be empty, a lone '.' or have several
            # minus signs; none of these is a length.
            return None
        kind = m.group(2)
        result = None
        if kind == '%':
            result = int(unit/100.0*ref)
        elif kind == 'px':
            result = int(unit)
        elif kind == 'in':
            result = int(unit * 25.4 * 4)
        elif kind == 'pt':
            result = int(unit * 25.4 * 4 / 72)
        elif kind == 'em':
            result = int(unit * 25.4 * 4 / 72 * 10)
        elif kind == 'pc':
            result = int(unit * 25.4 * 4 / 72 * 12)
        elif kind == 'mm':
            result = int(unit * 4)
        elif kind == 'cm':
            result = int(unit * 10 * 4)
        return result

    @staticmethod
    def translate_attrs(d, font_delta=0, memory=None):
        """
        Receives a dictionary of html attributes and styles and returns
        approximate Xylog equivalents in a new dictionary

        @param d: CSS property name -> value (values are lower-cased here)
        @param font_delta: Added to every computed font size in steps of 20
        @param memory: Optional list of property names already reported as
                       unhandled; names found in it are not reported again and
                       newly reported names are appended to it.
        @return: Dictionary with any of the keys C{fontfacename}, C{fontsize},
                 C{fontweight} and C{wordspace}, all with string values.
        """
        # Checked in this order so that e.g. "xx-small" is not mistaken for
        # "x-small" or "small".
        size_keywords = (("xx-small", 40), ("x-small", 60), ("small", 80),
                         ("xx-large", 180), ("x-large", 140), ("large", 120))

        def font_weight(val):
            ans = None
            m = re.search("([0-9]+)", val)
            if m:
                ans = str(int(m.group(1)))
            elif val.find("bold") >= 0 or val.find("strong") >= 0:
                ans = "1000"
            return ans

        def font_family(val):
            ans = None
            if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
                ans = "Courier10 BT Roman"
            elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
                 val.find("trebuchet"), val.find("sans")) >= 0:
                ans = "Swis721 BT Roman"
            return ans

        def font_size(val):
            ans = None
            unit = Span.unit_convert(val, 14)
            if unit:
                # Assume a 10 pt font (14 pixels) has fontsize 100
                ans = int (unit / 14.0 * 100)
            else:
                for keyword, size in size_keywords:
                    if keyword in val:
                        ans = size
                        break
            if ans is not None:
                ans += font_delta * 20
                ans = str(ans)
            return ans

        t = dict()
        for key in d.keys():
            val = d[key].lower()
            if key == 'font':
                # Shorthand property: classify every whitespace separated
                # token, walking from the last token to the first.
                val = val.split()
                val.reverse()
                for sval in val:
                    ans = font_family(sval)
                    if ans:
                        t['fontfacename'] = ans
                    else:
                        ans = font_size(sval)
                        if ans:
                            t['fontsize'] = ans
                        else:
                            ans = font_weight(sval)
                            if ans:
                                t['fontweight'] = ans
            elif key in ['font-family', 'font-name']:
                ans = font_family(val)
                if ans:
                    t['fontfacename'] = ans
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                ans = font_weight(val)
                if ans:
                    t['fontweight'] = ans
                    if int(ans) > 140:
                        t['wordspace'] = '50'
            else:
                report = True
                if memory is not None:
                    if key in memory:
                        report = False
                    else:
                        memory.append(key)
                if report:
                    print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
        return t

    def __init__(self, ns, css, memory, font_delta=0):
        """
        @param ns: The text, either a string or an object with a C{string}
                   attribute holding the text.
        @param css: CSS properties for the text. A C{font-style} entry is
                    removed from this dictionary.
        @param memory: Passed on to L{translate_attrs}.
        @param font_delta: Passed on to L{translate_attrs}.
        @raise ConversionError: If there is no text left to add.
        """
        src = ns.string if hasattr(ns, 'string') else ns
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
        if not src:
            raise ConversionError('No point in adding an empty string to a Span')
        if 'font-style' in css:
            fs = css.pop('font-style')
            if fs.lower() == 'italic':
                src = Italic(src)
        attrs = Span.translate_attrs(css, font_delta=font_delta, memory=memory)
        if 'fontsize' in attrs:
            attrs['baselineskip'] = int(attrs['fontsize']) + 20
        _Span.__init__(self, text=src, **attrs)


class HTMLConverter(object):
    """
    Converts a single HTML file and adds the result to an LRF book. The
    constants below are the regular expressions and node types shared by all
    converter instances.
    """
    # Matches "selector-list { declarations }" in a style sheet.
    # Group 1 is the (comma separated) selector list, group 2 the declarations.
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    # Matches a page-break-after/page-break-before declaration; group 1 is its value.
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    # Parse tree node types that carry no content to convert.
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
    # Fix <a /> elements
    # Each entry is a (compiled regex, replacement function) pair; __init__
    # appends these to BeautifulSoup's own MARKUP_MASSAGE list.
    # NOTE(review): the '.*' in the first pattern is greedy, so on a line that
    # holds several tags it can match from the first <a ...> up to the last
    # '/>' on that line -- confirm this is acceptable for real input.
    MARKUP_MASSAGE   = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
                         lambda match: match.group(1)+"></a>"),
                         # Strip comments from <style> tags. This is needed as
                         # sometimes there are unterminated comments
                        (re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE),
                         lambda match: match.group().replace('<!--', '').replace('-->', '')),
                         ]
    # Fix Baen markup
    # First pair: delete empty <a id="pN" name="pN"></a> anchors.
    # Second pair: drop "page-break-before: value", keeping the character
    # that terminated the declaration.
    BAEN_SANCTIFY = [(re.compile(r'<\s*[Aa]\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*<\/[Aa]>'),
                      lambda match: ''),
                      (re.compile(r'page-break-before:\s*\w+([\s;\}])'),
                       lambda match: match.group(1)) ]

    class Link(object):
        """Pairs a link tag with the paragraph object recorded for it."""
        def __init__(self, para, tag):
            self.para = para
            self.tag = tag

    # Shared by all instances: maps the absolute path of each converted file
    # to the HTMLConverter that converted it.
    processed_files = {} #: Files that have been processed

    def __init__(self, book, path,
                 font_delta=0, verbose=False, cover=None,
                 max_link_levels=sys.maxint, link_level=0,
                 is_root=True, baen=False, chapter_detection=True,
                 chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
                 link_exclude=re.compile('$'),
                 page_break=re.compile('h[12]', re.IGNORECASE),
                 profile=PRS500_PROFILE,
                 disable_autorotation=False):
        '''
        Convert HTML file at C{path} and add it to C{book}. After creating
        the object, you must call L{self.process_links} on it to create the links and
        then L{self.writeto} to output the LRF/S file.

        @param book: The LRF book
        @type book:  L{libprs500.lrf.pylrs.Book}
        @param path: path to the HTML file to process
        @type path:  C{str}
        @param width: Width of the device on which the LRF file is to be read
        @type width: C{int}
        @param height: Height of the device on which the LRF file is to be read
        @type height: C{int}
        @param font_delta: The amount in pts by which all fonts should be changed
        @type font_delta: C{int}
        @param verbose: Whether processing should be verbose or not
        @type verbose: C{bool}
        @param cover: Path to an image to use as the cover of this book
        @type cover: C{str}
        @param max_link_levels: Number of link levels to process recursively
        @type max_link_levels: C{int}
        @param link_level: Current link level
        @type link_level: C{int}
        @param is_root: True iff this object is converting the root HTML file
        @type is_root: C{bool}
        @param chapter_detection: Insert page breaks before what looks like
        the start of a chapter
        @type chapter_detection: C{bool}
        @param chapter_regex: The compiled regular expression used to search for chapter titles
        @param link_exclude: Compiled regex. Matching hrefs are ignored.
        @param page_break: Compiled regex. Page breaks are inserted before matching
                           tags if no page-breaks are found and no chapter headings
                           are detected.
        @param profile: Defines the geometry of the display device
        @param disable_autorotation: Don't autorotate very wide images
        '''
        # Defaults for various formatting tags
        self.css = dict(
            h1     = {"font-size"   :"xx-large", "font-weight":"bold", 'text-indent':'0pt'},
            h2     = {"font-size"   :"x-large", "font-weight":"bold", 'text-indent':'0pt'},
            h3     = {"font-size"   :"large", "font-weight":"bold", 'text-indent':'0pt'},
            h4     = {"font-size"   :"large", 'text-indent':'0pt'},
            h5     = {"font-weight" :"bold", 'text-indent':'0pt'},
            b      = {"font-weight" :"bold"},
            strong = {"font-weight" :"bold"},
            i      = {"font-style"  :"italic"},
            em     = {"font-style"  :"italic"},
            small  = {'font-size'   :'small'},
            pre    = {'font-family' :'monospace' },
            tt     = {'font-family' :'monospace'},
            center = {'text-align'  : 'center'}
            )
        self.profile     = profile #: Defines the geometry of the display device
        self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
        self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
        self.link_exclude = link_exclude #: Ignore matching hrefs
        self.scaled_images = {}   #: Temporary files with scaled version of images
        self.rotated_images = {}  #: Temporary files with rotated version of images
        self.max_link_levels = max_link_levels #: Number of link levels to process recursively
        self.link_level  = link_level  #: Current link level
        self.disable_autorotation = disable_autorotation
        self.blockquote_style = book.create_block_style(sidemargin=60,
                                                        topskip=20, footskip=20)
        self.unindented_style = book.create_text_style(parindent=0)
        self.page_break       = page_break #: Regex controlling forced page-break behavior
        self.text_styles      = []#: Keep track of already used textstyles
        self.block_styles     = []#: Keep track of already used blockstyles
        self.images  = {}         #: Images referenced in the HTML document
        self.targets = {}         #: <a name=...> elements
        self.links   = []         #: <a href=...> elements
        self.files   = {}         #: links that point to other files
        self.links_processed = False #: Whether links_processed has been called on this object
        self.font_delta = font_delta
        # Set by table processing code so that any <a name> within the table
        # point to the previous element
        self.anchor_to_previous = None
        self.cover = cover
        self.in_table = False
        self.memory = []          #: Used to ensure that duplicate CSS unhandled erros are not reported
        self.in_ol = False #: Flag indicating we're in an <ol> element
        self.book = book #: The Book object representing a BBeB book
        self.is_root = is_root           #: Are we converting the root HTML file
        self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
        path = os.path.abspath(path)
        os.chdir(os.path.dirname(path))
        self.file_name = os.path.basename(path)
        print "Processing", self.file_name
        print '\tParsing HTML...',
        sys.stdout.flush()
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
        self.baen = baen
        if baen:
            nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
        self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)
        print 'done\n\tConverting to BBeB...',
        sys.stdout.flush()
        self.verbose = verbose
        self.current_page = None
        self.current_para = None
        self.current_style = {}
        self.page_break_found = False
        match = self.PAGE_BREAK_PAT.search(unicode(self.soup))
        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
            self.page_break_found = True
        self.parse_file()
        HTMLConverter.processed_files[path] = self
        print 'done'

    def parse_css(self, style):
        """
        Parse the contents of a <style> tag or .css file.
        @param style: C{str(style)} should be the CSS to parse.
        @return: A dictionary with one entry per selector where the key is the
        selector name and the value is a dictionary of properties
        """
        sdict = dict()
        style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
        for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
            for key in sel[0].split(','):
                key = key.strip().lower()
                val = self.parse_style_properties(sel[1])
                if key in sdict:
                    sdict[key].update(val)
                else:
                    sdict[key] = val
        return sdict

    def parse_style_properties(self, props):
        """
        Split a block of CSS declarations (the body of a selector block or the
        value of an HTML style attribute) into individual properties.
        @return: A dictionary mapping each lower-cased property name to its
                 value. Pieces without a colon are skipped.
        """
        parsed = {}
        for declaration in props.split(';'):
            name, colon, value = declaration.partition(':')
            if not colon:
                # Not of the form "name: value"
                continue
            parsed[str(name.strip()).lower()] = value.strip()
        return parsed

    def tag_css(self, tag, parent_css={}):
        """
        Return a dictionary of style properties applicable to Tag tag.
        """
        def merge_parent_css(prop, pcss):
            temp = {}
            for key in pcss.keys():
                chk = key.lower()
                # float should not be inherited according to the CSS spec
                # however we need to as we don't do alignment at a block level.
                # float is removed by the process_alignment function.
                if chk.startswith('font') or chk == 'text-align' or \
                chk == 'float':
                    temp[key] = pcss[key]
            prop.update(temp)

        prop = dict()
        if tag.has_key("align"):
            prop["text-align"] = tag["align"]
        if self.css.has_key(tag.name):
            prop.update(self.css[tag.name])
        if tag.has_key("class"):
            cls = tag["class"].lower()
            for classname in ["."+cls, tag.name+"."+cls]:
                if self.css.has_key(classname):
                    prop.update(self.css[classname])
        if parent_css:
            merge_parent_css(prop, parent_css)
        if tag.has_key("style"):
            prop.update(self.parse_style_properties(tag["style"]))
        return prop

    def parse_file(self):
        """
        Convert the parsed document in C{self.soup}, appending the result to
        C{self.book}, and set C{self.top} to a block that represents the start
        of this file.

        @raise ConversionError: If no block with a parent can be found to
                                serve as C{self.top}.
        """
        def get_valid_block(page):
            # First item on the page of one of the listed types, or None
            for item in page.contents:
                if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
                    return item
        # Last page of the book before anything from this file is added
        previous = self.book.last_page()
        self.current_page = self.book.create_page()
        self.current_block = self.book.create_text_block()
        self.current_para = Paragraph()
        if self.cover:
            self.add_image_page(self.cover)
        # Taken after the cover has been added, since add_image_page replaces
        # self.current_block via end_page
        self.top = self.current_block

        self.process_children(self.soup, {})

        # Flush whatever is still pending: paragraph -> block -> page -> book
        if self.current_para and self.current_block:
            self.current_para.append_to(self.current_block)
        if self.current_block and self.current_page:
            self.current_block.append_to(self.current_page)
        if self.current_page and self.current_page.has_text():
            self.book.append(self.current_page)

        # The block recorded as top has no parent, so look for another block
        # to stand in for the start of this file.
        if not self.top.parent:
            if not previous:
                # The book had no pages before this file: use the first
                # suitable block on the book's first page.
                try:
                    previous = self.book.pages()[0]
                except IndexError:
                    raise ConversionError, self.file_name + ' does not seem to have any content'
                self.top = get_valid_block(previous)
                if not self.top or not self.top.parent:
                    raise ConversionError, self.file_name + ' does not seem to have any content'
                return

            # Otherwise use the first suitable block on the pages that follow
            # the page that was last before this file was converted.
            found = False
            for page in self.book.pages():
                if page == previous:
                    found = True
                    continue
                if found:
                    self.top = get_valid_block(page)
                    if not self.top:
                        continue
                    break

            if not self.top or not self.top.parent:
                raise ConversionError, 'Could not parse ' + self.file_name


    def get_text(self, tag):
        """
        Return the text contained in C{tag} and its descendants.

        Tags styled with C{display: none} or C{visibility: hidden} yield an
        empty string. Comments, declarations and processing instructions are
        skipped. The text ends at the first <img> that has an alt attribute:
        its alt text is appended and the result is returned immediately.
        """
        css = self.tag_css(tag)
        if (css.has_key('display') and css['display'].lower() == 'none') or \
           (css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
            return ''
        text = ''
        for c in tag.contents:
            if isinstance(c, HTMLConverter.IGNORED_TAGS):
                # Skip the node rather than throwing away the text gathered
                # so far; process_children treats these nodes the same way.
                continue
            if isinstance(c, NavigableString):
                text += str(c)
            elif isinstance(c, Tag):
                if c.name.lower() == 'img' and c.has_key('alt'):
                    text += c['alt']
                    return text
                text += self.get_text(c)
        return text

    def process_links(self):
        def add_toc_entry(text, target):
            # TextBlocks in Canvases have a None parent or an Objects Parent
            if target.parent != None and \
               hasattr(target.parent, 'objId'):
                self.book.addTocEntry(ascii_text, tb)
            elif self.verbose:
                print "Cannot add link", ascii_text, "to TOC"


        def get_target_block(fragment, targets):
            '''Return the correct block for the <a name> element'''
            bs = targets[fragment]
            if not isinstance(bs, BlockSpace):
                return bs
            ans, found, page = None, False, bs.parent
            for item in page.contents:
                if found:
                    if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
                        ans = item
                        break
                if item == bs:
                    found = True
                    continue

            if not ans:
                for i in range(len(page.contents)-1, -1, -1):
                    if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
                        ans = page.contents[i]
                        break

            if not ans:
                ntb = self.book.create_text_block()
                ntb.Paragraph(' ')
                page.append(ntb)
                ans = ntb

            if found:
                targets[fragment] =  ans
                page.contents.remove(bs)
            return ans

        cwd = os.getcwd()
        for link in self.links:
            para, tag = link.para, link.tag
            text = self.get_text(tag)
            # Needed for TOC entries due to bug in LRF
            ascii_text = text.decode('utf8', 'replace').encode('ascii', 'replace')
            if not text:
                text = 'Link'
                img = tag.find('img')
                if img:
                    try:
                        text = img['alt']
                    except KeyError:
                        pass
            purl = urlparse(link.tag['href'])
            if purl[1]: # Not a link to a file on the local filesystem
                continue
            path, fragment = unquote(purl[2]), purl[5]
            if not path or os.path.basename(path) == self.file_name:
                if fragment in self.targets.keys():
                    tb = get_target_block(fragment, self.targets)
                    if self.is_root:
                        add_toc_entry(ascii_text, tb)
                    sys.stdout.flush()
                    jb = JumpButton(tb)
                    self.book.append(jb)
                    cb = CharButton(jb, text=text)
                    para.contents = []
                    para.append(cb)
            elif self.link_level < self.max_link_levels:
                try: # os.access raises Exceptions in path has null bytes
                    if not os.access(path.encode('utf8', 'replace'), os.R_OK):
                        raise Exception()
                except Exception:
                    if self.verbose:
                        print "Skipping", link
                    continue
                path = os.path.abspath(path)
                if not path in HTMLConverter.processed_files.keys():
                    try:
                        self.files[path] = HTMLConverter(self.book, path,
                                     profile=self.profile,
                                     font_delta=self.font_delta, verbose=self.verbose,
                                     link_level=self.link_level+1,
                                     max_link_levels=self.max_link_levels,
                                     is_root = False, baen=self.baen,
                                     chapter_detection=self.chapter_detection,
                                     chapter_regex=self.chapter_regex,
                                     link_exclude=self.link_exclude,
                                     page_break=self.page_break,
                                     disable_autorotation=self.disable_autorotation)
                        HTMLConverter.processed_files[path] = self.files[path]
                    except Exception:
                        print >>sys.stderr, 'Unable to process', path
                        if self.verbose:
                            traceback.print_exc()
                        continue
                    finally:
                        os.chdir(cwd)
                else:
                    self.files[path] = HTMLConverter.processed_files[path]
                conv = self.files[path]
                if fragment in conv.targets.keys():
                    tb = get_target_block(fragment, conv.targets)
                else:
                    tb = conv.top
                if self.is_root:
                    add_toc_entry(ascii_text, tb)
                jb = JumpButton(tb)
                self.book.append(jb)
                cb = CharButton(jb, text=text)
                para.contents = []
                para.append(cb)

        self.links_processed = True

        for path in self.files.keys():
            if self.files[path].links_processed:
                continue
            try:
                os.chdir(os.path.dirname(path))
                self.files[path].process_links()
            finally:
                os.chdir(cwd)

    def end_page(self):
        """
        Finish the page being built so that whatever is added next starts on
        a fresh page. The pending paragraph and block are flushed and replaced
        by new ones; the page itself is only added to the book (and replaced)
        if it has text.
        """
        block, page = self.current_block, self.current_page
        self.current_para.append_to(block)
        self.current_para = Paragraph()
        block.append_to(page)
        self.current_block = self.book.create_text_block()
        if not page.has_text():
            return
        self.book.append(page)
        self.current_page = self.book.create_page()


    def add_image_page(self, path):
        """
        Append a page without margins, sized to the device screen, that shows
        only the image at C{path}. Does nothing if the file is not readable.
        """
        if not os.access(path, os.R_OK):
            return
        self.end_page()
        geometry = dict(evensidemargin=0, oddsidemargin=0, topmargin=0,
                        headheight=0, headsep=0, footspace=0, footheight=0,
                        textwidth=self.profile.screen_width,
                        textheight=self.profile.screen_height)
        image_page = self.book.create_page(**geometry)
        if path not in self.images:
            # Every image file gets exactly one stream
            self.images[path] = ImageStream(path)
        image_page.append(ImageBlock(self.images[path]))
        self.book.append(image_page)

    def process_children(self, ptag, pcss):
        """
        Convert each child node of C{ptag}: tags go to L{parse_tag}, strings
        to L{add_text}, ignored node types are skipped. C{pcss} is passed on
        unchanged.
        """
        for child in ptag.contents:
            # Must be tested first, before the NavigableString check below
            if isinstance(child, HTMLConverter.IGNORED_TAGS):
                continue
            if isinstance(child, Tag):
                self.parse_tag(child, pcss)
                continue
            if isinstance(child, NavigableString):
                self.add_text(child, pcss)

    def process_alignment(self, css):
        '''
        Create a new TextBlock only if necessary as indicated by css
        @type css: dict
        '''
        align = 'head'
        if css.has_key('text-align'):
            val = css['text-align'].lower()
            if val in ["right", "foot"]:
                align = "foot"
            elif val == "center":
                align = "center"
        if css.has_key('float'):
            val = css['float'].lower()
            if val == 'left':
                align = 'head'
            if val == 'right':
                align = 'foot'
            css.pop('float')
        if align != self.current_block.textStyle.attrs['align']:
            self.current_para.append_to(self.current_block)
            self.current_block.append_to(self.current_page)
            ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
            ts.attrs['align'] = align
            try:
                index = self.text_styles.index(ts)
                ts = self.text_styles[index]
            except ValueError:
                self.text_styles.append(ts)
            self.current_block = self.book.create_text_block(
                                blockStyle=self.current_block.blockStyle,
                                textStyle=ts)
            self.current_para = Paragraph()

    def add_text(self, tag, css):
        '''
        Add text to the current paragraph taking CSS into account.
        @param tag: Either a BeautifulSoup tag or a string
        @param css:
        @type css:
        '''
        src = tag.string if hasattr(tag, 'string') else tag
        if self.lstrip_toggle:
            src = src.lstrip()
            self.lstrip_toggle = False
        if not src.strip():
            self.current_para.append(' ')
        else:
            self.process_alignment(css)
            try:
                self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
                                              font_delta=self.font_delta))
            except ConversionError, err:
                if self.verbose:
                    print >>sys.stderr, err

    def sanctify_css(self, css):
        """
        Return a copy of C{css} without the properties that must not be
        passed on to a Span. C{css} itself is left untouched.
        """
        banned_prefixes = ('margin', 'text', 'mso', 'background', 'line')
        banned_fragments = ('padding', 'border', 'page-break')
        banned_names = ('color', 'display', 'letter-spacing', 'font-variant',
                        'position')

        def is_unsafe(name):
            lowered = name.lower()
            if lowered in banned_names:
                return True
            for prefix in banned_prefixes:
                if lowered.startswith(prefix):
                    return True
            for fragment in banned_fragments:
                if fragment in lowered:
                    return True
            return False

        cleaned = copy.copy(css)
        for name in [key for key in cleaned.keys() if is_unsafe(key)]:
            cleaned.pop(name)
        return cleaned

    def end_current_para(self):
        '''
        End current paragraph with a paragraph break after it. If the current
        paragraph has no non whitespace text in it do nothing.
        '''
        if not self.current_para.has_text():
            return
        if self.current_para.contents:
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
        if self.current_block.contents and \
            not isinstance(self.current_block.contents[-1], CR):
            self.current_block.append(CR())

    def end_current_block(self):
        '''
        Flush the current paragraph and block to the current page and
        continue with a new paragraph in a new block that uses the same text
        and block styles.
        '''
        finished = self.current_block
        self.current_para.append_to(finished)
        finished.append_to(self.current_page)
        self.current_para = Paragraph()
        self.current_block = self.book.create_text_block(
                                textStyle=finished.textStyle,
                                blockStyle=finished.blockStyle)

    def process_image(self, path, tag_css, width=None, height=None):
        '''
        Add the image stored at path to the book, rotating and/or scaling it
        first when it does not fit into the text area of the page.

        An image that is wider than the text area and wider than it is tall
        is rotated by 90 degrees (unless autorotation is disabled). An image
        that is still larger than the text area is scaled down, keeping its
        aspect ratio. Depending on its final size the image is then placed
        inline in the current paragraph, in a text block of its own, or on a
        new page.

        path          -- path of the image file
        tag_css       -- CSS of the element, used to set the alignment
        width, height -- size to use for the image; when either of them is
                         None the size of the image file is used instead
        '''
        if path in self.rotated_images:
            path = self.rotated_images[path].name
        if path in self.scaled_images:
            path = self.scaled_images[path].name

        im = PILImage.open(path)

        if width is None or height is None:
            width, height = im.size

        def scale_image(width, height):
            # Save a resized JPEG copy and remember it under the path of the
            # image it was created from (the value of path at call time).
            pt = PersistentTemporaryFile(suffix='.jpeg')
            im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
            pt.close()
            self.scaled_images[path] = pt
            return pt.name

        pheight = int(self.current_page.pageStyle.attrs['textheight'])
        pwidth  = int(self.current_page.pageStyle.attrs['textwidth'])

        if not self.disable_autorotation and width > pwidth and width > height:
            pt = PersistentTemporaryFile(suffix='.jpeg')
            im = im.rotate(90)
            im.convert('RGB').save(pt, 'JPEG')
            path = pt.name
            pt.close()
            # Note that the entry is stored under the name of the rotated
            # copy itself, because path has already been reassigned.
            self.rotated_images[path] = pt
            width, height = im.size


        # Shrink the image until it is strictly smaller than the text area.
        # After either branch has run, the other one can no longer trigger.
        if height > pheight:
            corrf = pheight/(1.*height)
            width, height = floor(corrf*width), pheight-1
            if width > pwidth:
                corrf = (pwidth)/(1.*width)
                width, height = pwidth-1, floor(corrf*height)
            path = scale_image(width, height)
        if width > pwidth:
            corrf = pwidth/(1.*width)
            width, height = pwidth-1, floor(corrf*height)
            if height > pheight:
                corrf = (pheight)/(1.*height)
                width, height = floor(corrf*width), pheight-1
            path = scale_image(width, height)
        width, height = int(width), int(height)

        if path not in self.images:
            self.images[path] = ImageStream(path)

        im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)
        # NOTE(review): presumably converts pixels into the units Plot
        # expects; confirm against pylrs.
        factor = 720./self.profile.dpi

        self.process_alignment(tag_css)

        if max(width, height) <= min(pwidth, pheight)/5.:
            # Small image: keep it inline in the current paragraph.
            self.current_para.append(Plot(im, xsize=ceil(width*factor),
                                          ysize=ceil(height*factor)))
        elif height <= int(floor((2/3.)*pheight)):
            # Medium image: give it a paragraph and text block of its own and
            # continue afterwards in a new block with the previous styles.
            pb = self.current_block
            self.end_current_para()
            self.process_alignment(tag_css)
            self.current_para.append(Plot(im, xsize=width*factor,
                                          ysize=height*factor))
            self.current_block.append(self.current_para)
            self.current_page.append(self.current_block)
            self.current_block = self.book.create_text_block(
                                            textStyle=pb.textStyle,
                                            blockStyle=pb.blockStyle)
            self.current_para = Paragraph()
        else:
            # Large image: place it horizontally centred on a new page.
            self.end_page()
            self.current_page.append(Canvas(width=pwidth,
                                            height=height))
            left = int(floor((pwidth - width)/2.))
            # The block must have the size of the (scaled) image. Using the
            # page size here would not match the canvas height or the
            # centring offset computed from the image width.
            self.current_page.contents[-1].put_object(
                            ImageBlock(self.images[path], xsize=width,
                                       ysize=height, x1=width, y1=height,
                                       blockwidth=width, blockheight=height),
                            left, 0)

    def parse_tag(self, tag, parent_css):
        try:
            tagname = tag.name.lower()
        except AttributeError:
            if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
                self.add_text(tag, parent_css)
            return
        tag_css = self.tag_css(tag, parent_css=parent_css)
        try: # Skip element if its display attribute is set to none
            if tag_css['display'].lower() == 'none' or \
               tag_css['visibility'].lower() == 'hidden':
                return
        except KeyError:
            pass
        if 'page-break-before' in tag_css.keys():
            if tag_css['page-break-before'].lower() != 'avoid':
                self.end_page()
            tag_css.pop('page-break-before')
        end_page = False
        if 'page-break-after' in tag_css.keys() and \
           tag_css['page-break-after'].lower() != 'avoid':
            end_page = True
            tag_css.pop('page-break-after')
        if not self.page_break_found and self.page_break.match(tagname):
            if len(self.current_page.contents) > 3:
                self.end_page()
                if self.verbose:
                    print 'Forcing page break at', tagname
        if tagname in ["title", "script", "meta", 'del', 'frameset']:
            pass
        elif tagname == 'a' and self.max_link_levels >= 0:
            if tag.has_key('name'):
                if self.anchor_to_previous:
                    self.process_children(tag, tag_css)
                    for c in self.anchor_to_previous.contents:
                        if isinstance(c, (TextBlock, ImageBlock)):
                            self.targets[tag['name']] = c
                            return
                    tb = self.book.create_text_block()
                    tb.Paragraph(" ")
                    self.anchor_to_previous.append(tb)
                    self.targets[tag['name']] = tb
                    return
                previous = self.current_block
                self.process_children(tag, tag_css)
                target = None
                if self.current_block == previous:
                    self.current_para.append_to(self.current_block)
                    self.current_para = Paragraph()
                    if self.current_block.has_text():
                        target = self.current_block
                    else:
                        target = BlockSpace()
                        self.current_page.append(target)
                else:
                    found = False
                    for item in self.current_page.contents:
                        if item == previous:
                            found = True
                            continue
                        if found:
                            target = item
                            break
                    if target and not isinstance(target, (TextBlock, ImageBlock)):
                        if isinstance(target, RuledLine):
                            target = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                         blockStyle=self.current_block.blockStyle)
                            target.Paragraph(' ')
                            self.current_page.append(target)
                        else:
                            target = BlockSpace()
                            self.current_page.append(target)
                    if target == None:
                        if self.current_block.has_text():
                            target = self.current_block
                        else:
                            target = BlockSpace()
                            self.current_page.append(target)
                self.targets[tag['name']] = target
            elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
                purl = urlparse(tag['href'])
                path = unquote(purl[2])
                if path and os.path.splitext(path)[1][1:].lower() in \
                    ['png', 'jpg', 'bmp', 'jpeg']:
                    self.process_image(path, tag_css)
                else:
                    text = self.get_text(tag)
                    if not text:
                        text = "Link"
                    self.add_text(text, tag_css)
                    self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
        elif tagname == 'img':
            if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
                path = os.path.abspath(unquote(tag['src']))
                width, height = None, None
                try:
                    width = int(tag['width'])
                    height = int(tag['height'])
                except:
                    pass
                self.process_image(path, tag_css, width, height)

            else:
                print >>sys.stderr, "Failed to process:", tag
        elif tagname in ['style', 'link']:
            def update_css(ncss):
                for key in ncss.keys():
                    if self.css.has_key(key):
                        self.css[key].update(ncss[key])
                    else:
                        self.css[key] = ncss[key]
            ncss = {}
            if tagname == 'style':
                for c in tag.contents:
                    if isinstance(c, NavigableString):
                        ncss.update(self.parse_css(str(c)))
            elif tag.has_key('type') and tag['type'] == "text/css" \
                    and tag.has_key('href'):
                purl = urlparse(tag['href'])
                path = unquote(purl[2])
                try:
                    f = open(path, 'rb')
                    src = f.read()
                    f.close()
                    match = self.PAGE_BREAK_PAT.search(src)
                    if match and not re.match('avoid', match.group(1), re.IGNORECASE):
                        self.page_break_found = True
                    ncss = self.parse_css(src)
                except IOError:
                    pass
            if ncss:
                update_css(ncss)
        elif tagname == 'pre':
            self.end_current_para()
            self.current_block.append_to(self.current_page)
            attrs = Span.translate_attrs(tag_css, self.font_delta, self.memory)
            ts = self.book.create_text_style(**self.unindented_style.attrs)
            ts.attrs.update(attrs)
            self.current_block = self.book.create_text_block(
                                    blockStyle=self.current_block.blockStyle,
                                    textStyle=ts)
            src = ''.join([str(i) for i in tag.contents])
            lines = src.split('\n')
            for line in lines:
                try:
                    self.current_para.append(Span(line, tag_css, self.memory))
                    self.current_para.CR()
                except ConversionError:
                    pass
            self.end_current_block()
        elif tagname in ['ul', 'ol']:
            self.in_ol = 1 if tagname == 'ol' else 0
            self.end_current_block()
            self.current_block = self.book.create_text_block(
                                        blockStyle=self.current_block.blockStyle,
                                        textStyle=self.unindented_style)
            self.process_children(tag, tag_css)
            self.in_ol = 0
            self.end_current_block()
        elif tagname == 'li':
            prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
            if self.current_para.has_text():
                self.current_para.append(CR())
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            self.current_para.append(Space(xsize=100))
            self.current_para.append(prepend)
            self.process_children(tag, tag_css)
            if self.in_ol:
                self.in_ol += 1
        elif tagname == 'blockquote':
            self.current_para.append_to(self.current_block)
            self.current_block.append_to(self.current_page)
            pb = self.current_block
            self.current_para = Paragraph()
            ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
            ts.attrs['parindent'] = 0
            try:
                index = self.text_styles.index(ts)
                ts = self.text_styles[index]
            except ValueError:
                self.text_styles.append(ts)
            bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
            bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
            60, 20, 20
            try:
                index = self.block_styles.index(bs)
                bs = self.block_styles[index]
            except ValueError:
                self.block_styles.append(bs)
            self.current_block = self.book.create_text_block(
                                    blockStyle=bs, textStyle=ts)
            self.process_children(tag, tag_css)
            self.current_para.append_to(self.current_block)
            self.current_block.append_to(self.current_page)
            self.current_para = Paragraph()
            self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
                                                             blockStyle=pb.blockStyle)
        elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if self.chapter_detection and tagname.startswith('h'):
                src = self.get_text(tag)
                if self.chapter_regex.search(src):
                    if self.verbose:
                        print 'Detected chapter', src
                    self.end_page()
                    self.page_break_found = True
            self.end_current_para()
            self.lstrip_toggle = True
            if tag_css.has_key('text-indent'):
                indent = Span.unit_convert(tag_css['text-indent'])
                if not indent:
                    indent=0
            else:
                indent = self.book.defaultTextStyle.attrs['parindent']
            if indent != self.current_block.textStyle.attrs['parindent']:
                self.current_block.append_to(self.current_page)
                ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
                ts.attrs['parindent'] = indent
                try:
                    index = self.text_styles.index(ts)
                    ts = self.text_styles[index]
                except ValueError:
                    self.text_styles.append(ts)
                self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle,
                                                                 textStyle=ts)
            self.process_children(tag, tag_css)
            self.end_current_para()
            if tagname.startswith('h'):
                self.current_block.append(CR())
        elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt']:
            self.process_children(tag, tag_css)
        elif tagname == 'font':
            if tag.has_key('face'):
                tag_css['font-family'] = tag['face']
            self.process_children(tag, tag_css)
        elif tagname in ['br']:
            self.current_para.append(CR())
        elif tagname in ['hr', 'tr']: # tr needed for nested tables
            self.end_current_para()
            self.current_block.append(CR())
            self.end_current_block()
            self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
        elif tagname == 'td': # Needed for nested tables
            self.current_para.append(" ")
            self.process_children(tag, tag_css)
        elif tagname == 'table' and not self.in_table:
            tag_css = self.tag_css(tag) # Table should not inherit CSS
            self.process_table(tag, tag_css)
        else:
            self.process_children(tag, tag_css)
        if end_page:
                self.end_page()

    def process_table(self, tag, tag_css):
        '''
        Lay out the table element tag and add it to the current page as a
        sequence of canvases.

        Table.blocks yields (block, xpos, ypos, delta) tuples. A tuple with
        a false block starts a new canvas of height ypos plus the padding;
        every other tuple places its block on the most recent canvas.
        '''
        def textwidth():
            return int(self.current_page.pageStyle.attrs['textwidth'])

        self.end_current_block()
        pad = 10
        table = Table(self, tag, tag_css, rowpad=pad, colpad=pad)
        rows = []
        for block, xpos, ypos, delta in table.blocks(textwidth()):
            if block:
                rows[-1].put_object(block, xpos + int(delta/2.), 0)
                continue
            rows.append(Canvas(textwidth(), ypos+pad,
                               blockrule='block-fixed'))

        for row in rows:
            self.current_page.append(row)
        self.end_current_block()


    def writeto(self, path, lrs=False):
        '''
        Render the book to path: as LRS when lrs is true, as LRF otherwise.
        '''
        if lrs:
            self.book.renderLrs(path)
        else:
            self.book.renderLrf(path)

    def cleanup(self):
        '''
        Call __del__ on every temporary file recorded in scaled_images and
        rotated_images (scaled images first).
        '''
        scaled = self.scaled_images.values()
        rotated = self.rotated_images.values()
        for group in (scaled, rotated):
            for temp_file in group:
                temp_file.__del__()

def process_file(path, options):
    cwd = os.getcwd()
    dirpath = None
    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    try:
        dirpath, path = get_path(path)
        cpath, tpath = '', ''
        isbn = try_opf(path, options)
        if not options.cover and isbn:
            for item in isbn:
                matches = glob.glob(re.sub('-', '', item[1])+'.*')
                for match in matches:
                    if match.lower().endswith('.jpeg') or match.lower().endswith('.jpg') or \
                    match.lower().endswith('.gif') or match.lower().endswith('.png'):
                        options.cover = match
                        break
        if options.cover:
            options.cover = os.path.abspath(os.path.expanduser(options.cover))
            cpath = options.cover
            if os.access(options.cover, os.R_OK):
                from libprs500.devices.prs500.driver import PRS500
                im = PILImage.open(os.path.join(cwd, cpath))
                cim = im.resize((options.profile.screen_width,
                                 options.profile.screen_height),
                                PILImage.BICUBIC)
                cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                cf.close()
                cim.save(cf.name)
                cpath = cf.name
                th = PRS500.THUMBNAIL_HEIGHT
                tim = im.resize((int(0.75*th), th), PILImage.ANTIALIAS)
                tf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                tf.close()
                tim.save(tf.name)
                tpath = tf.name
            else:
                raise ConversionError, 'Cannot read from: %s'% (options.cover,)


        if not options.title:
            options.title = default_title
        title = (options.title, options.title_sort)
        author = (options.author, options.author_sort)
        args = dict(font_delta=options.font_delta, title=title, \
                    author=author, sourceencoding='utf8',\
                    freetext=options.freetext, category=options.category,
                    publisher=options.publisher,
                    booksetting=BookSetting(dpi=10*options.profile.dpi,
                                            screenheight=options.profile.screen_height,
                                            screenwidth=options.profile.screen_width))
        if tpath:
            args['thumbnail'] = tpath
        header = None
        if options.header:
            header = Paragraph()
            header.append(Bold(options.title))
            header.append(' by ')
            header.append(Italic(options.author+"  "))
        book = Book(options, header=header, **args)
        le = re.compile(options.link_exclude) if options.link_exclude else \
             re.compile('$')
        pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
             re.compile('$')
        conv = HTMLConverter(book, path, profile=options.profile,
                             font_delta=options.font_delta,
                             cover=cpath, max_link_levels=options.link_levels,
                             verbose=options.verbose, baen=options.baen,
                             chapter_detection=options.chapter_detection,
                             chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
                             link_exclude=re.compile(le), page_break=pb,
                             disable_autorotation=options.disable_autorotation)
        conv.process_links()
        oname = options.output
        if not oname:
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
        print 'Output written to', oname
        conv.cleanup()
    finally:
        os.chdir(cwd)
        if dirpath:
            shutil.rmtree(dirpath, True)

def try_opf(path, options):
    '''
    Read metadata from the first *.opf file found in the current working
    directory and use it to fill in options that still have their default
    value (title, author, author_sort, publisher, category).

    path is not used.

    Returns a list of (scheme, text) tuples, one for each dc:identifier
    element, or None when there is no OPF file or its metadata could not be
    processed. The text of an identifier may be None.
    '''
    try:
        opf = glob.glob('*.opf')[0]
    except IndexError:
        return None
    f = open(opf)
    try:
        soup = BeautifulStoneSoup(f.read())
    finally:
        f.close()
    try:
        title = soup.package.metadata.find('dc:title')
        if title and not options.title:
            options.title = title.string
        creators = soup.package.metadata.findAll('dc:creator')
        if options.author == 'Unknown':
            for author in creators:
                role = author.get('role')
                if not role:
                    role = author.get('opf:role')
                if role == 'aut':
                    options.author = author.string
                    fa = author.get('file-as')
                    if fa:
                        options.author_sort = fa
        if options.publisher == 'Unknown':
            publisher = soup.package.metadata.find('dc:publisher')
            if publisher:
                options.publisher = publisher.string
        if not options.category.strip():
            category = soup.package.metadata.find('dc:type')
            if category:
                options.category = category.string
        isbn = []
        for item in soup.package.metadata.findAll('dc:identifier'):
            scheme = item.get('scheme')
            if not scheme:
                scheme = item.get('opf:scheme')
            isbn.append((scheme, item.string))
        return isbn
    except Exception:
        # Metadata is optional: report the problem in verbose mode and
        # carry on without it.
        err = sys.exc_info()[1]
        if options.verbose:
            print >>sys.stderr, 'Failed to process opf file', err
        return None


def parse_options(argv=None, cli=True):
    """
    CLI for html -> lrf conversions

    argv -- list of command line arguments (without the program name);
            sys.argv[1:] is used when it is None
    cli  -- if true, the help is printed before the error is raised when
            the number of positional arguments is wrong

    Returns the tuple (options, args, parser). Raises ConversionError unless
    exactly one positional argument was given.
    """
    # Only fall back to the arguments of the process when the caller passed
    # nothing at all; an explicitly passed empty list is parsed as such.
    if argv is None:
        argv = sys.argv[1:]
    parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]

         %prog converts mybook.html to mybook.lrf""")
    laf = parser.add_option_group('LOOK AND FEEL')
    laf.add_option('--cover', action='store', dest='cover', default=None, \
                      help='Path to file containing image to be used as cover')
    laf.add_option('--font-delta', action='store', type='int', default=0, \
                      help="""Increase the font size by 2 * FONT_DELTA pts and """
                      '''the line spacing by FONT_DELTA pts. '''
                      """If FONT_DELTA is negative, the font size is decreased.""",
                      dest='font_delta')
    laf.add_option('--disable-autorotation', action='store_true', default=False,
                   help='Disable autorotation of images.', dest='disable_autorotation')
    link = parser.add_option_group('LINK PROCESSING OPTIONS')
    link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
                      dest='link_levels',
                      help=r'''The maximum number of levels to recursively process '''
                              '''links. A value of 0 means that links are not followed. '''
                              '''A negative value means that <a> tags are ignored.''')
    link.add_option('--link-exclude', dest='link_exclude', default='$',
                      help='''A regular expression. <a> tags whose href '''
                      '''matches will be ignored. Defaults to %default''')
    chapter = parser.add_option_group('CHAPTER OPTIONS')
    chapter.add_option('--disable-chapter-detection', action='store_false',
                      default=True, dest='chapter_detection',
                      help='''Prevent html2lrf from automatically inserting page breaks'''
                      ''' before what it thinks are chapters.''')
    chapter.add_option('--chapter-regex', dest='chapter_regex',
                      default='chapter|book|appendix',
                      help='''The regular expression used to detect chapter titles.'''
                      ''' It is searched for in heading tags. Defaults to %default''')
    chapter.add_option('--page-break-before', dest='page_break', default='h[12]',
                      help='''If html2lrf does not find any page breaks in the '''
                      '''html file and cannot detect chapter headings, it will '''
                      '''automatically insert page-breaks before the tags whose '''
                      '''names match this regular expression. Defaults to %default. '''
                      '''You can disable it by setting the regexp to "$". '''
                      '''The purpose of this option is to try to ensure that '''
                      '''there are no really long pages as this degrades the page '''
                      '''turn performance of the LRF. Thus this option is ignored '''
                      '''if the current page has only a few elements.''')
    prepro = parser.add_option_group('PREPROCESSING OPTIONS')
    prepro.add_option('--baen', action='store_true', default=False, dest='baen',
                      help='''Preprocess Baen HTML files to improve generated LRF.''')
    options, args = parser.parse_args(args=argv)
    if len(args) != 1:
        if cli:
            parser.print_help()
        raise ConversionError('no filename specified')
    if options.output:
        options.output = os.path.abspath(os.path.expanduser(options.output))
    return options, args, parser


def main():
    '''
    Entry point of the command line program: parse the command line and
    convert the file named on it.

    Exits with status 1 if the command line cannot be processed. Exits
    requested by the option parser itself (for example for --help) keep
    their own exit status.
    '''
    try:
        options, args, parser = parse_options()
        src = args[0]
        if options.verbose:
            import warnings
            warnings.defaultaction = 'error'
    except (SystemExit, KeyboardInterrupt):
        # Do not turn a deliberate exit or an interrupt into exit status 1.
        raise
    except Exception:
        sys.exit(1)
    process_file(src, options)

def console_query(dirpath, candidate, docs):
    '''
    Ask the user on the console which of the documents in docs to convert.

    dirpath   -- not used
    candidate -- index of the suggested document; it is marked with '>' in
                 the listing and returned when the user just presses enter
    docs      -- sequence of tuples whose first element is the name shown

    Returns the index of the chosen document. If docs has exactly one
    entry, 0 is returned without asking. Exits the program when the input
    ends or the user interrupts the prompt.
    '''
    if len(docs) == 1:
        return 0
    try:
        # Imported for its side effect only: line editing in raw_input
        import readline
    except ImportError:
        pass
    for i, doc in enumerate(docs):
        prefix = '>' if i == candidate else ''
        print('%s%d.\t%s' % (prefix, i, doc[0]))
    print('')
    count = len(docs)
    while True:
        try:
            choice = raw_input('Choose file to convert (0-'+str(count-1) + \
                               '). Current choice is ['+ str(candidate) + ']:')
            if not choice:
                return candidate
            choice = int(choice)
        except (EOFError, KeyboardInterrupt):
            # The tuple is required: "except EOFError, KeyboardInterrupt"
            # would catch only EOFError and bind it to that name.
            sys.exit()
        except ValueError:
            # Not a number, ask again
            continue
        if 0 <= choice < count:
            return choice


def get_path(path, query=console_query):
    '''
    Find the HTML document to convert.

    If path has an HTML extension it is used directly. Otherwise path is
    extracted into a new temporary directory and one of the HTML documents
    found there is chosen by calling query(dirpath, candidate, docs), where
    docs is a list of (name, directory, size) tuples sorted by size and
    candidate is the index of a document with 'toc' in its name or, if
    there is none, of the largest document.

    Returns (dirpath, path_to_document). dirpath is None when no temporary
    directory was created; otherwise the caller has to remove it.

    Raises ConversionError if no HTML document is found. The temporary
    directory is removed when this function fails.
    '''
    html_exts = ('html', 'xhtml', 'htm', 'xhtm')
    path = os.path.abspath(os.path.expanduser(path))
    ext = os.path.splitext(path)[1][1:].lower()
    if ext in html_exts:
        return None, path
    dirpath = mkdtemp('','html2lrf')
    try:
        extract(path, dirpath)
        candidate, docs = None, []
        for root, dirs, files in os.walk(dirpath):
            for name in files:
                ext = os.path.splitext(name)[1][1:].lower()
                if ext not in html_exts:
                    continue
                docs.append((name, root, os.stat(os.path.join(root, name)).st_size))
                if 'toc' in name.lower():
                    candidate = name
        if not docs:
            raise ConversionError('No suitable files found in archive')
        docs.sort(key=itemgetter(2))
        if candidate:
            # Turn the name of the candidate into its index in docs
            for i, doc in enumerate(docs):
                if doc[0] == candidate:
                    candidate = i
                    break
        else:
            candidate = len(docs) - 1
        candidate = query(dirpath, candidate, docs)
        return dirpath, os.path.join(docs[candidate][1], docs[candidate][0])
    except:
        # The caller never learns about dirpath when this function fails
        # (this includes the user aborting the query), so remove it here.
        shutil.rmtree(dirpath, True)
        raise


# Run the command line converter when this file is executed as a script.
if __name__ == '__main__':
    main()