calibre/src/libprs500/ebooks/lrf/html/convert_from.py

##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
##    This work is based on htmlbbeb created by esperanc.
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.

I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, copy, glob, logging, tempfile
from collections import deque
from urllib import unquote
from urlparse import urlparse
from math import ceil, floor
from functools import partial

try:
    from PIL import Image as PILImage
except ImportError:
    import Image as PILImage

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
                            NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
                TextBlock, ImageBlock, JumpButton, CharButton, \
                Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
                LrsError, Sup, Sub, EmpLine
from libprs500.ebooks.lrf.pylrs.pylrs import Span
from libprs500.ebooks.lrf import Book, entity_to_unicode
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import filename_to_utf8,  setup_cli_handlers, __appname__, fit_image
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader
from libprs500.devices.interface import Device
from libprs500.ebooks.lrf.html.color_map import lrs_color

def update_css(ncss, ocss):
    """
    Merge the CSS rules in C{ncss} into C{ocss}, modifying C{ocss} in place.

    Both arguments map a selector name to a dictionary of properties. For
    a selector present in both, the properties from C{ncss} take precedence.

    The per-selector dictionaries stored in C{ocss} are always fresh copies.
    This way a dictionary that C{ocss} shares with some other mapping (for
    example one obtained through a shallow copy) is never modified, and
    later changes to C{ocss} never leak back into C{ncss}.

    @param ncss: The new rules
    @type ncss:  dict
    @param ocss: The rules to be updated
    @type ocss:  dict
    """
    for key in ncss.keys():
        if key in ocss:
            merged = dict(ocss[key])
        else:
            merged = {}
        merged.update(ncss[key])
        ocss[key] = merged

def munge_paths(basepath, url):
    """
    Resolve C{url} relative to the file C{basepath}.

    The URL is split into its components first and only then are the path
    and fragment unquoted. Unquoting the whole URL before parsing would turn
    an escaped delimiter (e.g. C{%23} for C{#}) into a real one and corrupt
    the split.

    @param basepath: Path of the file that contains the link
    @param url: The (possibly quoted) link target
    @return: C{(path, fragment)} where path is normalized. An empty URL path
             resolves to C{basepath}; a relative one is joined to the
             directory of C{basepath}.
    """
    purl = urlparse(url)
    path, fragment = unquote(purl[2]), unquote(purl[5])
    if not path:
        path = basepath
    elif not os.path.isabs(path):
        path = os.path.join(os.path.dirname(basepath), path)
    return os.path.normpath(path), fragment

class HTMLConverter(object):
    # Matches one CSS rule. Group 1 is the selector text in front of the
    # braces (possibly a comma separated list), group 2 is the text between
    # the braces.
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    # Matches a page-break-after/page-break-before declaration. Group 1 is
    # the value of the declaration.
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    # BeautifulSoup node types that are skipped when walking the document
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)


    MARKUP_MASSAGE   = [
                        # Close <a /> tags
                        (re.compile(r'<a(\s[^>]*)?/>', re.IGNORECASE),
                         lambda match: '<a'+match.group(1)+'></a>'),
                         # Strip comments from <style> tags. This is needed as
                         # sometimes there are unterminated comments
                        (re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE),
                         lambda match: match.group().replace('<!--', '').replace('-->', '')),
                         # remove <p> tags from within <a> tags
                        (re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE),
                         lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())),
                        # Workaround bug in BeautifulSoup &nbsp; handling
                        (re.compile(u'&nbsp;|&#160;|&#xa0;|\xa0', re.IGNORECASE), lambda match : u'\uffff'),
                        # Replace entities
                        (re.compile(ur'&(\S+?);'), partial(entity_to_unicode,
                                                           exceptions=['lt', 'gt', 'amp'])),
                        ]
    # Fix Baen markup
    # Each entry is a (compiled pattern, replacement function) pair.
    BAEN = [
                     # Drop page-break-before declarations, keeping only the
                     # character that terminated them
                     (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE),
                      lambda match: match.group(1)),
                     # Unwrap <p> elements that contain nothing but an
                     # <a id...> anchor
                     (re.compile(r'<p>\s*(<a id.*?>\s*</a>)\s*</p>', re.IGNORECASE),
                      lambda match: match.group(1)),
                     # Remove empty anchors whose id and name both have the
                     # form p<digits>
                     (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
                      lambda match: ''),
                     ]
    # Fix pdftohtml markup
    # Each entry is a (compiled pattern, replacement function) pair.
    PDFTOHTML  = [
                  # Replace <hr> tags with a span that forces a page break
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<span style="page-break-after:always"> </span>'),
                  # Remove page numbers (digits immediately followed by <br>)
                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
                  # Remove <br> and replace <br><br> with <p>
                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
                  # A single <br> is kept only if the text before it starts
                  # with a tag or is shorter than 40 characters, otherwise
                  # the <br> is dropped and only the text is kept
                  (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
                   lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                                else match.group(1)),
                  # Remove hyphenation (a hyphen directly followed by a newline)
                  (re.compile(r'-\n\r?'), lambda match: ''),

                  ]

    # Fix Book Designer markup
    # Each entry is a (compiled pattern, replacement function) pair.
    BOOK_DESIGNER = [
                     # HR
                     (re.compile(r'<hr>', re.IGNORECASE),
                      lambda match : '<span style="page-break-after:always"> </span>'),
                     # Create header tags. Group 2 is the value of the align
                     # attribute (if it was captured) and group 3 is the text
                     # of the heading. The heading text has to be captured
                     # explicitly, otherwise group 3 does not exist and the
                     # replacement function raises an IndexError.
                     (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile(r'<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
                     (re.compile(r'<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     # Blank lines
                     (re.compile(r'<div[^><]*?>(&nbsp;){4}</div>', re.IGNORECASE),
                      lambda match : '<p></p>'),
                     ]

    def __hasattr__(self, attr):
        if hasattr(self.options, attr):
            return True
        return object.__hasattr__(self, attr)

    def __getattr__(self, attr):
        if hasattr(self.options, attr):
            return getattr(self.options, attr)
        return object.__getattr__(self, attr)

    def __setattr__(self, attr, val):
        if hasattr(self.options, attr):
            setattr(self.options, attr, val)
        else:
            object.__setattr__(self, attr, val)

    # Default style rules. Maps a selector (a tag name, or a class selector
    # starting with a dot) to a dictionary of CSS properties. Every file
    # starts from a copy of this table (see add_file).
    CSS = {
           'h1'     : {"font-size"   : "xx-large", "font-weight":"bold", 'text-indent':'0pt'},
           'h2'     : {"font-size"   : "x-large", "font-weight":"bold", 'text-indent':'0pt'},
           'h3'     : {"font-size"   : "large", "font-weight":"bold", 'text-indent':'0pt'},
           'h4'     : {"font-size"   : "large", 'text-indent':'0pt'},
           'h5'     : {"font-weight" : "bold", 'text-indent':'0pt'},
           'b'      : {"font-weight" : "bold"},
           'strong' : {"font-weight" : "bold"},
           'i'      : {"font-style"  : "italic"},
           'cite'   : {'font-style'  : 'italic'},
           'em'     : {"font-style"  : "italic"},
           'small'  : {'font-size'   : 'small'},
           'pre'    : {'font-family' : 'monospace', 'white-space': 'pre' },
           'code'   : {'font-family' : 'monospace' },
           'tt'     : {'font-family' : 'monospace'},
           'center' : {'text-align'  : 'center'},
           'th'     : {'font-size'   : 'large', 'font-weight':'bold'},
           'big'    : {'font-size'   : 'large', 'font-weight':'bold'},
           '.libprs500_dropcaps' : {'font-size': 'xx-large'},
           'u'      : {'text-decoration': 'underline'},
           }

    def __init__(self, book, fonts, options, logger, paths):
        '''
        Convert HTML files at C{paths} and add to C{book}. After creating
        the object, you must call L{self.writeto} to output the LRF/S file.

        @param book: The LRF book
        @type book:  L{libprs500.lrf.pylrs.Book}
        @param fonts: dict specifying the font families to use
        '''
        # Defaults for various formatting tags
        object.__setattr__(self, 'options', options)
        self.logger = logger
        self.fonts = fonts #: dict specifying font families to use
        # Memory
        self.scaled_images = {}    #: Temporary files with scaled version of images
        self.rotated_images = {}  #: Temporary files with rotated version of images
        self.text_styles      = []#: Keep track of already used textstyles
        self.block_styles     = []#: Keep track of already used blockstyles
        self.images  = {}      #: Images referenced in the HTML document
        self.targets = {}      #: <a name=...> and id elements
        self.links   = deque() #: <a href=...> elements
        self.processed_files = []
        self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
        self.image_memory = []
        self.id_counter = 0
        self.unused_target_blocks = [] #: Used to remove extra TextBlocks
        self.link_level  = 0    #: Current link level
        self.memory = []        #: Used to ensure that duplicate CSS unhandled erros are not reported
        self.tops = {}          #: element representing the top of each HTML file in the LRF file
        self.previous_text = '' #: Used to figure out when to lstrip
        self.preserve_block_style = False #: Used so that <p> tags in <blockquote> elements are handled properly
        # Styles
        self.blockquote_style = book.create_block_style(sidemargin=60,
                                                        topskip=20, footskip=20)
        self.unindented_style = book.create_text_style(parindent=0)


        self.in_table = False
        # List processing
        self.list_level = 0
        self.list_indent = 20
        self.list_counter = 1

        self.book = book                #: The Book object representing a BBeB book

        self.override_css = {}
        self.override_pcss = {}

        if self._override_css is not None:
            if os.access(self._override_css, os.R_OK):
                src = open(self._override_css, 'rb').read()
            else:
                src = self._override_css
            match = self.PAGE_BREAK_PAT.search(src)
            if match and not re.match('avoid', match.group(1), re.IGNORECASE):
                self.page_break_found = True
            ncss, npcss = self.parse_css(src)
            if ncss:
                update_css(ncss, self.override_css)
            if npcss:
                update_css(npcss, self.override_pcss)

        paths = [os.path.abspath(path) for path in paths]


        while len(paths) > 0 and self.link_level <= self.link_levels:
            for path in paths:
                if path in self.processed_files:
                    continue
                try:
                    self.add_file(path)
                except KeyboardInterrupt:
                    raise
                except:
                    if self.link_level == 0: # Die on errors in the first level
                        raise
                    for link in self.links:
                        if link['path'] == path:
                            self.links.remove(link)
                            break
                    self.logger.warn('Could not process '+path)
                    if self.verbose:
                        self.logger.exception(' ')
            self.links = self.process_links()
            self.link_level += 1
            paths = [link['path'] for link in self.links]


        for text, tb in self.extra_toc_entries:
            ascii_text = text.encode('ascii', 'ignore')
            self.book.addTocEntry(ascii_text, tb)

        if self.base_font_size > 0:
            self.logger.info('\tRationalizing font sizes...')
            self.book.rationalize_font_sizes(self.base_font_size)

    def is_baen(self, soup):
        '''
        Return True if C{soup} contains a <meta name="Publisher"> element
        whose content mentions Baen (ignoring case).
        '''
        publisher = re.compile('Baen', re.IGNORECASE)
        meta = soup.find('meta', attrs={'name':'Publisher', 'content':publisher})
        if meta:
            return True
        return False

    def is_book_designer(self, raw):
        '''
        Return True if the markup C{raw} contains an <H2> tag with
        id=BookTitle. The search is case sensitive.
        '''
        marker = re.search('<H2[^><]*id=BookTitle', raw)
        return marker is not None

    def preprocess(self, raw):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(HTMLConverter.MARKUP_MASSAGE)

        if not self.book_designer and self.is_book_designer(raw):
            self.book_designer = True
            self.logger.info('\tBook Designer file detected.')

        self.logger.info('\tParsing HTML...')

        if self.baen:
            nmassage.extend(HTMLConverter.BAEN)

        if self.pdftohtml:
            nmassage.extend(HTMLConverter.PDFTOHTML)
        if self.book_designer:
            nmassage.extend(HTMLConverter.BOOK_DESIGNER)
        try:
            soup = BeautifulSoup(raw,
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)
        except ConversionError, err:
            if 'Failed to coerce to unicode' in str(err):
                raw = unicode(raw, 'utf8', 'replace')
                soup = BeautifulSoup(raw,
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)

        if not self.baen and self.is_baen(soup):
            self.baen = True
            self.logger.info('\tBaen file detected. Re-parsing...')
            return self.preprocess(raw)
        if self.book_designer:
            t = soup.find(id='BookTitle')
            if t:
                self.book.set_title(self.get_text(t))
            a = soup.find(id='BookAuthor')
            if a:
                self.book.set_author(self.get_text(a))
        if self.verbose:
            tdir = tempfile.gettempdir()
            dump = open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb')
            dump.write(unicode(soup).encode('utf-8'))
            self.logger.info('Written preprocessed HTML to '+dump.name)
            dump.close()

        #print soup
        return soup

    def add_file(self, path):
        '''
        Convert the HTML file at C{path} and add the result to the book.
        The file is recorded in C{self.processed_files} and its top most
        element in C{self.tops}.

        @param path: Path to the HTML file
        '''
        # Deep copies: the values are dictionaries of properties as well. A
        # shallow copy would share them with the class level defaults and
        # with the override rules, so an in place update of a selector
        # would carry over to every file processed afterwards.
        self.css = copy.deepcopy(HTMLConverter.CSS)
        self.pseudo_css = copy.deepcopy(self.override_pcss)
        self.css.update(copy.deepcopy(self.override_css))

        self.file_name = os.path.basename(path)
        self.logger.info('Processing %s', path if self.verbose else self.file_name)
        upath = path.encode('utf-8') if isinstance(path, unicode) else path
        html_file = open(upath, 'rb')
        try:
            raw = html_file.read()
        finally:
            # Do not leak the file handle
            html_file.close()
        soup = self.preprocess(raw)
        self.logger.info('\tConverting to BBeB...')
        # Reset the per file state
        self.current_page = None
        self.current_para = None
        self.current_style = {}
        self.page_break_found = False
        self.target_prefix = path
        self.previous_text = '\n'
        self.tops[path] = self.parse_file(soup)
        self.processed_files.append(path)

    def parse_css(self, style):
        """
        Parse the contents of a <style> tag or .css file.
        @param style: C{str(style)} should be the CSS to parse.
        @return: A dictionary with one entry per selector where the key is the
        selector name and the value is a dictionary of properties
        """
        sdict, pdict = {}, {}
        style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
        for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
            for key in sel[0].split(','):
                val = self.parse_style_properties(sel[1])
                key = key.strip().lower()
                if ':' in key:
                    key, sep, pseudo = key.partition(':')
                    if key in pdict:
                        if pseudo in pdict[key]:
                            pdict[key][pseudo].update(val)
                        else:
                            pdict[key][pseudo] = val
                    else:
                        pdict[key] = {pseudo:val}
                else:
                    if key in sdict:
                        sdict[key].update(val)
                    else:
                        sdict[key] = val
        return sdict, pdict

    def parse_style_properties(self, props):
        """
        Parse a list of CSS declarations, i.e. the text found inside a CSS
        selector block or in the style attribute of an HTML element.
        @param props: Semicolon separated C{name: value} declarations
        @return: A dictionary mapping each (lower cased) property name to its
                value. Declarations without a colon are ignored.
        """
        parsed = {}
        for declaration in props.split(';'):
            name, colon, value = declaration.partition(':')
            if not colon:
                continue
            parsed[str(name.strip()).lower()] = value.strip()
        return parsed

    def tag_css(self, tag, parent_css={}):
        """
        Return a dictionary of style properties applicable to Tag tag.
        """
        def merge_parent_css(prop, pcss):
            temp = {}
            for key in pcss.keys():
                chk = key.lower()
                # float should not be inherited according to the CSS spec
                # however we need to as we don't do alignment at a block level.
                # float is removed by the process_alignment function.
                if chk.startswith('font') or chk == 'text-align' or \
                chk == 'float' or chk == 'white-space' or chk == 'color' or \
                chk == 'line-height':
                    temp[key] = pcss[key]
            prop.update(temp)

        prop, pprop = {}, {}
        tagname = tag.name.lower()
        if parent_css:
            merge_parent_css(prop, parent_css)
        if tag.has_key("align"):
            al = tag['align'].lower()
            if al in ('left', 'right', 'center', 'justify'):
                prop["text-align"] = al
        if self.css.has_key(tagname):
            prop.update(self.css[tagname])
        if self.pseudo_css.has_key(tagname):
            pprop.update(self.pseudo_css[tagname])
        if tag.has_key("class"):
            cls = tag["class"].lower()
            for cls in cls.split():
                for classname in ["."+cls, tagname+"."+cls]:
                    if self.css.has_key(classname):
                        prop.update(self.css[classname])
                    if self.pseudo_css.has_key(classname):
                        pprop.update(self.pseudo_css[classname])
        if tag.has_key("style"):
            prop.update(self.parse_style_properties(tag["style"]))
        return prop, pprop

    def parse_file(self, soup):
        '''
        Convert the parsed document C{soup} and append the resulting pages
        to the book.

        @param soup: The parsed HTML document
        @return: The block that represents the top of the file
        @raise ConversionError: If no block that can serve as the top of the
                                file is found
        '''
        def get_valid_block(page):
            # Return the first Canvas, TextBlock, ImageBlock or RuledLine in
            # page, skipping TextBlocks that have no contents. Returns None
            # if there is no such item.
            for item in page.contents:
                if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
                    if isinstance(item, TextBlock) and not item.contents:
                        continue
                    return item
        # Remember the last page of the book as it is before this file is added
        previous = self.book.last_page()
        self.current_page = self.book.create_page()
        self.current_block = self.book.create_text_block()
        self.current_para = Paragraph()
        if self.cover:
            self.add_image_page(self.cover)
            # Cleared so that the cover is added only once
            self.cover = None
        top = self.current_block

        self.process_children(soup, {}, {})

        # Flush whatever is still pending into the book
        if self.current_para and self.current_block:
            self.current_para.append_to(self.current_block)
        if self.current_block and self.current_page:
            self.current_block.append_to(self.current_page)
        if self.current_page and self.current_page.has_text():
            self.book.append(self.current_page)

        # The block created above is unusable as the top of the file if it
        # has no parent or no contents. Look for a replacement.
        if not top.parent or not top.contents:
            if not previous:
                # There was no page before this file: use the first valid
                # block of the first page of the book
                try:
                    previous = self.book.pages()[0]
                except IndexError:
                    raise ConversionError, self.file_name + ' does not seem to have any content'
                top = get_valid_block(previous)
                if not top or not top.parent:
                    raise ConversionError, self.file_name + ' does not seem to have any content'
                return top

            # Otherwise use the first valid block found in the pages that
            # come after previous
            found = False
            for page in self.book.pages():
                if page == previous:
                    found = True
                    continue
                if found:
                    top = get_valid_block(page)
                    if not top:
                        continue
                    break

            if not top or not top.parent:
                raise ConversionError, 'Could not parse ' + self.file_name
        return top

    def create_link(self, children, tag):
        '''
        Build the description of the link represented by the <a> tag C{tag}.

        @param children: The elements created for the contents of C{tag}. The
                         last Span or EmpLine among them carries the link.
        @param tag: The <a> tag, it must have an href attribute
        @return: A dictionary with the keys para, text, path, fragment and
                 'in toc'
        @raise ConversionError: If C{children} has no Span or EmpLine
        '''
        para = None
        for candidate in reversed(children):
            if isinstance(candidate, (Span, EmpLine)):
                para = candidate
                break
        if para is None:
            raise ConversionError('Failed to parse link %s %s'%(tag, children))

        text = self.get_text(tag, 1000)
        if not text:
            # Fall back to the alt text of an image inside the link, or to
            # a generic label
            text = 'Link'
            img = tag.find('img')
            if img:
                try:
                    text = img['alt']
                except KeyError:
                    pass

        path, fragment = munge_paths(self.target_prefix, tag['href'])
        in_toc = self.link_level == 0 and not self.use_spine
        link = {'para':para, 'text':text, 'path':os.path.abspath(path),
                'fragment':fragment, 'in toc': in_toc}
        return link


    def get_text(self, tag, limit=None):
        '''
        Return the text contained in C{tag} and its descendants.

        Tags that are styled as not displayed or hidden yield an empty
        string. The text ends at the first <img> child that has an alt
        attribute: its alt text is appended and the result returned.

        @param tag: The tag to extract the text from
        @param limit: If not None, stop gathering text once more than this
                      many characters have been collected. It is checked
                      before each direct child of C{tag} and is not passed
                      on when recursing.
        @return: The text as a unicode string
        '''
        css = self.tag_css(tag)[0]
        if ('display' in css and css['display'].lower() == 'none') or \
           ('visibility' in css and css['visibility'].lower() == 'hidden'):
            return ''
        text = u''
        for c in tag.contents:
            if limit is not None and len(text) > limit:
                break
            if isinstance(c, HTMLConverter.IGNORED_TAGS):
                # Skip comments, declarations and processing instructions.
                # They must not throw away the text gathered so far. This
                # check has to come before the NavigableString check.
                continue
            if isinstance(c, NavigableString):
                text += unicode(c)
            elif isinstance(c, Tag):
                if c.name.lower() == 'img' and c.has_key('alt'):
                    text += c['alt']
                    return text
                text += self.get_text(c)
        return text

    def process_links(self):
        '''
        Turn every pending link whose target file has been processed into a
        button that jumps to the target.

        @return: A deque with the links whose target file has not been
                 processed (yet)
        '''
        def add_toc_entry(text, target):
            # TextBlocks in Canvases have a None parent or an Objects Parent
            if target.parent != None and \
               hasattr(target.parent, 'objId'):
                self.book.addTocEntry(text, target)
            else:
                self.logger.debug("Cannot add link %s to TOC", text)


        def get_target_block(fragment, targets):
            '''Return the correct block for the <a name> element'''
            bs = targets[fragment]
            if not isinstance(bs, BlockSpace):
                return bs
            # The target is a BlockSpace: use the first proper block that
            # follows it on its page instead
            ans, found, page = None, False, bs.parent
            for item in page.contents:
                if found:
                    if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
                        ans = item
                        break
                if item == bs:
                    found = True
                    continue

            if not ans:
                # Nothing follows, use the last proper block of the page
                for i in range(len(page.contents)-1, -1, -1):
                    if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
                        ans = page.contents[i]
                        break

            if not ans:
                # The page has no proper block at all, create one
                ntb = self.book.create_text_block()
                ntb.Paragraph(' ')
                page.append(ntb)
                ans = ntb

            if found:
                # Remember the replacement and drop the BlockSpace
                targets[fragment] =  ans
                page.contents.remove(bs)
            return ans

        outside_links = deque()
        while len(self.links) > 0:
            link = self.links.popleft()
            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
            ascii_text = text.encode('ascii', 'ignore') # Needed for TOC entries due to bug in SONY LRF renderer

            if path in self.processed_files:
                if path+fragment in self.targets:
                    tb = get_target_block(path+fragment, self.targets)
                else:
                    # Unknown fragment, link to the top of the file
                    tb = self.tops[path]
                if link['in toc']:
                    add_toc_entry(ascii_text, tb)

                jb = JumpButton(tb)
                self.book.append(jb)
                cb = CharButton(jb, text=text)
                # Replace the contents of the link element with the button
                para.contents = []
                para.append(cb)
                try:
                    self.unused_target_blocks.remove(tb)
                except ValueError:
                    pass
            else:
                outside_links.append(link)

        return outside_links

    def create_toc(self, toc):
        for (path, fragment, txt) in toc:
            ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
            if not fragment and path in self.tops:
                self.book.addTocEntry(ascii_text, self.tops[path])
            else:
                url = path+fragment
                if url in self.targets:
                    self.book.addTocEntry(ascii_text, self.targets[url])


    def end_page(self):
        """
        Finish the current page so that whatever is added next starts on a
        new page. The current paragraph and text block are always replaced
        by fresh ones; a new page is started only if the current page has
        text.
        """
        para, block, page = self.current_para, self.current_block, self.current_page

        para.append_to(block)
        self.current_para = Paragraph()

        block.append_to(page)
        self.current_block = self.book.create_text_block()

        if page.has_text():
            self.book.append(page)
            self.current_page = self.book.create_page()


    def add_image_page(self, path):
        '''
        Append a page without margins that shows the image at C{path},
        centered. Nothing is done if the file cannot be read.

        @param path: Path to the image file
        '''
        if not os.access(path, os.R_OK):
            return
        self.end_page()

        pwidth = self.profile.screen_width
        pheight = self.profile.screen_height - self.profile.fudge
        page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
                                     topmargin=0, textwidth=pwidth,
                                     headheight=0, headsep=0, footspace=0,
                                     footheight=0,
                                     textheight=pheight)

        # Reuse the stream if the image has been used before
        if path not in self.images:
            self.images[path] = ImageStream(path)
        width, height = PILImage.open(path).size

        canvas = Canvas(pwidth, pheight)
        ib = ImageBlock(self.images[path], x1=width,
                        y1=height, xsize=width, ysize=height,
                        blockwidth=width, blockheight=height)
        xoffset = int((pwidth-width)/2.)
        yoffset = int((pheight-height)/2.)
        canvas.put_object(ib, xoffset, yoffset)
        page.append(canvas)
        self.book.append(page)
    def process_children(self, ptag, pcss, ppcss={}):
        """ Process the children of ptag """
        # Need to make a copy of contents as when
        # extract is called on a child, it will
        # mess up the iteration.
        for c in copy.copy(ptag.contents):
            if isinstance(c, HTMLConverter.IGNORED_TAGS):
                continue
            elif isinstance(c, Tag):
                self.parse_tag(c, pcss)
            elif isinstance(c, NavigableString):
                self.add_text(c, pcss, ppcss)
        if not self.in_table:
            try:
                if self.minimize_memory_usage:
                    ptag.extract()
            except AttributeError:
                print ptag, type(ptag)

    def get_alignment(self, css):
        '''
        Return the alignment ('head', 'center' or 'foot') described by the
        text-align and float properties in C{css}. float takes precedence
        and is removed from C{css}.

        @type css: dict
        '''
        by_text_align = {'right': 'foot', 'foot': 'foot', 'center': 'center'}
        by_float = {'left': 'head', 'right': 'foot'}

        align = 'head'
        if 'text-align' in css:
            align = by_text_align.get(css['text-align'].lower(), 'head')
        if 'float' in css:
            side = css['float'].lower()
            align = by_float.get(side, align)
            del css['float']
        return align

    def process_alignment(self, css):
        '''
        Start a new TextBlock if the alignment requested by css differs
        from the alignment of the current block.
        @type css: dict
        @return: True if a new TextBlock was started, False otherwise
        '''
        align = self.get_alignment(css)
        block = self.current_block
        if align == block.textStyle.attrs['align']:
            return False

        # Finish the current block
        self.current_para.append_to(block)
        block.append_to(self.current_page)

        # Same text style as before except for the alignment. An equal
        # style that is already known is reused.
        ts = self.book.create_text_style(**block.textStyle.attrs)
        ts.attrs['align'] = align
        try:
            ts = self.text_styles[self.text_styles.index(ts)]
        except ValueError:
            self.text_styles.append(ts)

        self.current_block = self.book.create_text_block(
                            blockStyle=block.blockStyle,
                            textStyle=ts)
        self.current_para = Paragraph()
        return True

    def add_text(self, tag, css, pseudo_css, force_span_use=False):
        '''
        Add text to the current paragraph taking CSS into account.
        @param tag: Either a BeautifulSoup tag or a string
        @param css: A dict
        @param pseudo_css: A dict of pseudo class/element rules. A
                           'first-letter' rule is removed from it once it has
                           been applied.
        @param force_span_use: If True the text is wrapped in a Span even if
                               it needs no special attributes
        '''
        src = tag.string if hasattr(tag, 'string') else tag
        if len(src) > 32767:
            # Add very long strings in chunks of at most 32767 characters
            pos = 0
            while pos < len(src):
                self.add_text(src[pos:pos+32767], css, pseudo_css, force_span_use)
                pos += 32767
            return
        src = src.replace('\r\n', '\n').replace('\r', '\n')

        if 'first-letter' in pseudo_css and len(src) > 1:
            src = src.lstrip()
            # src can be empty now if it consisted of whitespace only. In
            # that case the rule is kept for the text that follows.
            if src:
                f = src[0]
                consumed = 1
                # A leading quotation mark is styled together with the
                # letter that follows it. The unicode quotes are U+201E,
                # U+2018, U+201D and U+2019 (8222, 8216, 8221 and 8217 in
                # decimal).
                if f in ("'", '"', u'\u201e', u'\u2018', u'\u201d', u'\u2019'):
                    if len(src) >= 2:
                        consumed = 2
                        f = src[:2]
                src = src[consumed:]
                ncss = css.copy()
                ncss.update(pseudo_css.pop('first-letter'))
                self.add_text(f, ncss, {}, force_span_use)


        collapse_whitespace = 'white-space' not in css or css['white-space'] != 'pre'
        if self.process_alignment(css) and collapse_whitespace:
            # Dont want leading blanks in a new paragraph
            src = src.lstrip()

        def append_text(src):
            fp, key, variant = self.font_properties(css)
            src = src.replace(u'\uffff', ' ') # &nbsp; becomes u'\uffff'
            normal_font_size = int(fp['fontsize'])
            if variant == 'small-caps':
                # Upper case characters keep the normal font size, all other
                # characters are converted to upper case and shown in a
                # smaller size
                dump = Span(fontsize=normal_font_size-30)
                temp = []
                for c in src:
                    if c.isupper():
                        if temp:
                            dump.append(''.join(temp))
                            temp = []
                        dump.append(Span(c, fontsize=normal_font_size))
                    else:
                        temp.append(c.upper())
                src = dump
                if temp:
                    src.append(''.join(temp))


            if key in ['italic', 'bi']:
                # No need for an Italic element if the font face in use is
                # one of the registered italic faces
                already_italic = False
                for fonts in self.fonts.values():
                    it = fonts['italic'][1] if fonts.has_key('italic') else ''
                    bi = fonts['bi'][1] if fonts.has_key('bi') else ''
                    if fp['fontfacename'] in (it, bi):
                        already_italic = True
                        break
                if not already_italic:
                    src = Italic(src)

            # Drop the font properties that are the same as in the text
            # style of the current block
            unneeded = []
            for prop in fp:
                if fp[prop] == self.current_block.textStyle.attrs[prop]:
                    unneeded.append(prop)
            for prop in unneeded:
                fp.pop(prop)
            attrs = {}
            if 'color' in css:
                attrs['textcolor'] = lrs_color(css['color'])
            attrs.update(fp)
            elem = Span(text=src, **attrs) if (attrs or force_span_use) else src
            if 'text-decoration' in css:
                dec = css['text-decoration'].lower()
                linepos = 'after' if dec == 'underline' else 'before' if dec == 'overline' else None
                if linepos is not None:
                    elem = EmpLine(elem, lineposition=linepos)
            self.current_para.append(elem)


        if collapse_whitespace:
            src = re.sub(r'\s{1,}', ' ', src)
            # Strip leading whitespace if the previous text ended in whitespace
            if len(self.previous_text) != len(self.previous_text.rstrip()):
                src = src.lstrip()
            if len(src):
                self.previous_text = src
                append_text(src)
        else:
            # Preformatted text: add it line by line
            srcs = src.split('\n')
            for src in srcs:
                if src:
                    append_text(src)
                    if len(srcs) > 1:
                        self.line_break()

    def line_break(self):
        '''
        Append a line break to the current paragraph and record a newline as
        the most recently added text.
        '''
        newline = CR()
        self.current_para.append(newline)
        self.previous_text = '\n'

    def end_current_para(self):
        '''
        Finish the current paragraph: add it to the current block if it has
        any contents, follow it with a paragraph break and start a fresh
        paragraph.
        '''
        block, para = self.current_block, self.current_para
        if para.contents:
            block.append(para)
        block.append(CR())
        self.current_para = Paragraph()

    def end_current_block(self):
        '''
        End current TextBlock. Create new TextBlock with the same styles.
        '''
        if self.current_para.contents:
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
        if self.current_block.contents or self.current_block.must_append:
            self.current_page.append(self.current_block)
            self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                         blockStyle=self.current_block.blockStyle)


    def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
        def detect_encoding(im):
            fmt = im.format
            if fmt == 'JPG':
                fmt = 'JPEG'
            return fmt
        original_path = path
        if self.rotated_images.has_key(path):
            path = self.rotated_images[path].name
        if self.scaled_images.has_key(path):
            path = self.scaled_images[path].name

        try:
            im = PILImage.open(path)
        except IOError, err:
            self.logger.warning('Unable to process image: %s\n%s', original_path, err)
            return
        encoding = detect_encoding(im)

        if width == None or height == None:
            width, height = im.size

        factor = 720./self.profile.dpi

        def scale_image(width, height):
            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
            self.image_memory.append(pt) # Neccessary, trust me ;-)
            try:
                im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
                pt.close()
                self.scaled_images[path] = pt
                return pt.name
            except (IOError, SystemError), err: # PIL chokes on interlaced PNG images as well a some GIF images
                self.logger.warning('Unable to process image %s. Error: %s'%(path, err))
                return None

        pheight = int(self.current_page.pageStyle.attrs['textheight'])
        pwidth  = int(self.current_page.pageStyle.attrs['textwidth'])

        if dropcaps:
            scale = False
            if width > 0.75*pwidth:
                width = int(0.75*pwidth)
                scale = True
            if height > 0.75*pheight:
                height = int(0.75*pheight)
                scale = True
            if scale:
                path = scale_image(width, height)
            if not self.images.has_key(path):
                self.images[path] = ImageStream(path)
            im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)
            line_height = (int(self.current_block.textStyle.attrs['baselineskip']) +
                            int(self.current_block.textStyle.attrs['linespace']))//10
            line_height *= self.profile.dpi/72.
            lines = int(ceil(float(height)/line_height))
            dc = DropCaps(lines)
            dc.append(Plot(im, xsize=ceil(width*factor), ysize=ceil(height*factor)))
            self.current_para.append(dc)
            return

        if self.autorotation and width > pwidth and width > height:
            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
            try:
                im = im.rotate(90)
                im.save(pt, encoding)
                path = pt.name
                self.rotated_images[path] = pt
                width, height = im.size
            except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
                self.logger.debug('Unable to process interlaced PNG %s', original_path)
            finally:
                pt.close()

        scaled, width, height = fit_image(width, height, pwidth, pheight)
        if scaled:
            path = scale_image(width, height)

        if not path:
            return

        if not self.images.has_key(path):
            try:
                self.images[path] = ImageStream(path, encoding=encoding)
            except LrsError, err:
                self.logger.warning('Could not process image: %s\n%s', original_path, err)
                return

        im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)

        self.process_alignment(tag_css)

        if max(width, height) <= min(pwidth, pheight)/5.:
            self.current_para.append(Plot(im, xsize=ceil(width*factor),
                                          ysize=ceil(height*factor)))
        elif height <= int(floor((2/3.)*pheight)):
            pb = self.current_block
            self.end_current_para()
            self.process_alignment(tag_css)
            self.current_para.append(Plot(im, xsize=width*factor,
                                          ysize=height*factor))
            self.current_block.append(self.current_para)
            self.current_page.append(self.current_block)
            self.current_block = self.book.create_text_block(
                                            textStyle=pb.textStyle,
                                            blockStyle=pb.blockStyle)
            self.current_para = Paragraph()
        else:
            self.end_page()
            self.current_page.append(Canvas(width=pwidth,
                                            height=height))
            left = int(floor((pwidth - width)/2.))
            self.current_page.contents[-1].put_object(
                            ImageBlock(self.images[path], xsize=width,
                                       ysize=height, x1=width, y1=height,
                                       blockwidth=width, blockheight=height),
                            left, 0)

    def process_page_breaks(self, tag, tagname, tag_css):
        if 'page-break-before' in tag_css.keys():
            if tag_css['page-break-before'].lower() != 'avoid':
                self.end_page()
            tag_css.pop('page-break-before')
        end_page = False
        if 'page-break-after' in tag_css.keys() and \
           tag_css['page-break-after'].lower() != 'avoid':
            end_page = True
            tag_css.pop('page-break-after')
        if (self.force_page_break_attr[0].match(tagname) and \
           tag.has_key(self.force_page_break_attr[1]) and \
           self.force_page_break_attr[2].match(tag[self.force_page_break_attr[1]])) or \
           self.force_page_break.match(tagname):
            self.end_page()
            self.page_break_found = True
        if not self.page_break_found and self.page_break.match(tagname):
            if len(self.current_page.contents) > 3:
                self.end_page()
                self.logger.debug('Forcing page break at %s', tagname)
        return end_page

    def block_properties(self, tag_css):
        ans = {}
        for key in ('topskip', 'footskip', 'sidemargin'):
            ans[key] = self.book.defaultBlockStyle.attrs[key]

        src = [None for i in range(4)]
        if tag_css.has_key('padding'):
            msrc = tag_css['padding'].split()
            for i in range(len(msrc)):
                src[i] = msrc[i]
        i = 0
        for c in ('top', 'right', 'bottom', 'left'):
            if tag_css.has_key('padding-'+c):
                src[i] = tag_css['padding-'+c]
            i += 1

        t = {}
        t['topskip'], t['footskip'], t['sidemargin'] = src[0], src[2], src[3]
        for key in ('topskip', 'footskip', 'sidemargin'):
            if t[key] is not None:
                val = self.unit_convert(t[key])
                if val is not None:
                    ans[key] = val

        return ans

    def font_properties(self, css):
        '''
        Convert the font propertiess in css to the Xylog equivalents. If the CSS
        does not contain a particular font property, the default from self.book.defaultTextSytle
        is used.
        @return: dict, key, variant. The dict contains the Xlog equivalents. key indicates
          the font type (i.e. bold, bi, normal) and variant is None or 'small-caps'
        '''
        t = {}
        for key in ('fontwidth', 'fontsize', 'wordspace', 'fontfacename', 'fontweight', 'baselineskip'):
            t[key] = self.book.defaultTextStyle.attrs[key]

        def font_weight(val):
            ans = 0
            m = re.search("([0-9]+)", val)
            if m:
                ans = int(m.group(1))
            elif val.find("bold") >= 0 or val.find("strong") >= 0:
                ans = 700
            return 'bold' if ans >= 700 else 'normal'

        def font_style(val):
            ans = 'normal'
            if 'italic' in val or 'oblique' in val:
                ans = 'italic'
            return ans

        def font_family(val):
            ans = 'serif'
            if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
                ans = 'mono'
            elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
                 val.find("trebuchet"), val.find("sans")) >= 0:
                ans = 'sans'
            return ans

        def font_variant(val):
            ans = None
            if 'small-caps' in val.lower():
                ans = 'small-caps'
            return ans

        def font_key(family, style, weight):
            key = 'normal'
            if style == 'italic' and weight == 'normal':
                key = 'italic'
            elif style == 'normal' and weight == 'bold':
                key = 'bold'
            elif style == 'italic' and weight == 'bold':
                key = 'bi'
            return key

        def font_size(val):
            normal = 100 #10*pts
            ans = self.unit_convert(val, pts=True)
            if ans:
                if ans < 0:
                    ans += normal
                    if ans < 0:
                        ans = normal
            else:
                if "xx-small" in val:
                    ans = 40
                elif "x-small" in val:
                    ans = 60
                elif "small" in val:
                    ans = 80
                elif "xx-large" in val:
                    ans = 180
                elif "x-large" in val:
                    ans = 140
                elif "large" in val:
                    ans = 120
            if ans is not None:
                ans += int(self.font_delta * 20)
                ans = str(ans)
            return ans

        family, weight, style, variant = 'serif', 'normal', 'normal', None
        for key in css.keys():
            val = css[key].lower()
            if key == 'font':
                vals = val.split()
                for val in vals:
                    family = font_family(val)
                    if family != 'serif':
                        break
                for val in vals:
                    weight = font_weight(val)
                    if weight != 'normal':
                        break
                for val in vals:
                    style = font_style(val)
                    if style != 'normal':
                        break
                for val in vals:
                    sz = font_size(val)
                    if sz:
                        t['fontsize'] = sz
                        break
                for val in vals:
                    variant = font_variant(val)
                    if variant:
                        t['fontvariant'] = variant
                        break
            elif key in ['font-family', 'font-name']:
                family = font_family(val)
            elif key == "font-size":
                ans = font_size(val)
                if ans:
                    t['fontsize'] = ans
            elif key == 'font-weight':
                weight = font_weight(val)
            elif key == 'font-style':
                style = font_style(val)
            elif key == 'font-variant':
                variant = font_variant(val)

        key = font_key(family, style, weight)
        if self.fonts[family].has_key(key):
            t['fontfacename'] = self.fonts[family][key][1]
        else:
            t['fontfacename'] = self.fonts[family]['normal'][1]
        if key in ['bold', 'bi']:
                t['fontweight'] = 700

        fs = int(t['fontsize'])
        if fs > 120:
            t['wordspace'] = int(fs/4.)
        t['baselineskip'] = fs + 20
        return t, key, variant

    def unit_convert(self, val, pts=False, base_length='10pt'):
        '''
        Tries to convert html units in C{val} to pixels.
        Assumes: 1em = 100% = 10pts
        @param pts: If True return 10*pts instead of pixels.
        @return: The number of pixels (an int) if successful. Otherwise, returns None.
        '''
        dpi = self.profile.dpi
        result = None
        try:
            result = int(val)
        except ValueError:
            pass
        m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
        if m is not None:
            unit = float(m.group(1))
            if m.group(2) == '%':
                normal = self.unit_convert(base_length)
                result = int((unit/100.0)*normal)
            elif m.group(2) == 'px':
                result =  int(unit)
            elif m.group(2) == 'in':
                result =  int(unit * dpi)
            elif m.group(2) == 'pt':
                result = int(unit * dpi/72.)
            elif m.group(2)== 'em':
                result = int(unit * (dpi/72.) * 10)
            elif m.group(2)== 'pc':
                result =  int(unit * (dpi/72.) * 12)
            elif m.group(2)== 'mm':
                result =  int(unit * 0.04 * (dpi/72.))
            elif m.group(2)== 'cm':
                result =  int(unit * 0.4 * (dpi/72.))
        if pts:
            if result is not None:
                result = int((float(result)/dpi)*720)
        return result

    def text_properties(self, tag_css):
        indent = self.book.defaultTextStyle.attrs['parindent']
        if tag_css.has_key('text-indent'):
            indent = self.unit_convert(str(tag_css['text-indent']), pts=True)
            if not indent:
                indent = 0
            if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent:
                indent = self.minimum_indent

        fp = self.font_properties(tag_css)[0]
        fp['parindent'] = indent

        if tag_css.has_key('line-height'):
            bls, ls = int(self.current_block.textStyle.attrs['baselineskip']), \
                      int(self.current_block.textStyle.attrs['linespace'])
            try: # See if line-height is a unitless number
                val = int(float(tag_css['line-height'].strip()) * (bls+ls))
                fp['linespace'] = val
            except ValueError:
                val = self.unit_convert(tag_css['line-height'], pts=True, base_length='1pt')
            if val is not None:
                val -= bls
                if val >= 0:
                    fp['linespace'] = val

        return fp


    def process_block(self, tag, tag_css):
        '''
        Ensure padding and text-indent properties are respected.

        If the text properties, block properties or alignment required by
        C{tag_css} differ from those of the current block, the current block
        is added to the current page and a new block with the required styles
        is started.

        @return: True if a new block was started, False otherwise
        '''
        text_properties = self.text_properties(tag_css)
        block_properties = self.block_properties(tag_css)
        align = self.get_alignment(tag_css)

        # Fill in the defaults for everything the CSS did not specify
        for props, defaults in ((block_properties, self.book.defaultBlockStyle.attrs),
                                (text_properties, self.book.defaultTextStyle.attrs)):
            for name in defaults.keys():
                if name not in props:
                    props[name] = defaults[name]

        def differs(current, wanted):
            # True if any attribute of current has another value in wanted
            for name in current.keys():
                if current[name] != wanted[name]:
                    return True
            return False

        def shared(style, known):
            # Re-use an equal style that was created earlier, if there is one
            try:
                return known[known.index(style)]
            except ValueError:
                known.append(style)
                return style

        unchanged = not (
            differs(self.current_block.blockStyle.attrs, block_properties) or
            differs(self.current_block.textStyle.attrs, text_properties) or
            align != self.current_block.textStyle.attrs['align'])
        if unchanged:
            return False

        ts = self.current_block.textStyle.copy()
        ts.attrs.update(text_properties)
        ts.attrs['align'] = align
        bs = self.current_block.blockStyle.copy()
        if not self.preserve_block_style:
            bs.attrs.update(block_properties)
        self.current_block.append_to(self.current_page)
        ts = shared(ts, self.text_styles)
        bs = shared(bs, self.block_styles)
        self.current_block = self.book.create_text_block(blockStyle=bs,
                                                         textStyle=ts)
        return True

    def process_anchor(self, tag, tag_css, tag_pseudo_css):
        '''
        Process the children of an anchor tag and register a link target
        under its name (or id, if it has no name).

        The target is the current block if processing the children did not
        start a new one. Otherwise it is the item that follows the previous
        block on the current page, replaced by a newly appended item if it is
        neither a TextBlock nor an ImageBlock.
        '''
        if self.in_table: # Anchors in tables are handled separately
            self.process_children(tag, tag_css, tag_pseudo_css)
            return

        if tag.has_key('name'):
            name = tag['name'].replace('#', '')
        else:
            name = tag['id'].replace('#', '')
        previous = self.current_block
        self.process_children(tag, tag_css, tag_pseudo_css)
        target = None

        if self.current_block == previous:
            # Make sure the block is added to the page even if it stays empty
            self.current_block.must_append = True
            target = self.current_block
        else:
            # Find the first item on the page that comes after previous
            seen_previous = False
            for item in self.current_page.contents:
                if item == previous:
                    seen_previous = True
                elif seen_previous:
                    target = item
                    break
            if target and not isinstance(target, (TextBlock, ImageBlock)):
                if isinstance(target, RuledLine):
                    target = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                 blockStyle=self.current_block.blockStyle)
                    target.Paragraph(' ')
                else:
                    target = BlockSpace()
                self.current_page.append(target)
            if target == None:
                if self.current_block.has_text():
                    target = self.current_block
                else:
                    target = BlockSpace()
                    self.current_page.append(target)
        self.targets[self.target_prefix+name] = target

    def parse_tag(self, tag, parent_css):
        try:
            tagname = tag.name.lower()
        except AttributeError:
            if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
                self.add_text(tag, parent_css, {})
            return
        tag_css, tag_pseudo_css = self.tag_css(tag, parent_css=parent_css)
        try: # Skip element if its display attribute is set to none
            if tag_css['display'].lower() == 'none' or \
               tag_css['visibility'].lower() == 'hidden':
                return
        except KeyError:
            pass
        end_page = self.process_page_breaks(tag, tagname, tag_css)

        if tagname in ["title", "script", "meta", 'del', 'frameset']:
            pass
        elif tagname == 'a' and self.link_levels >= 0:
            if tag.has_key('href') and not self.link_exclude.match(tag['href']):
                path = munge_paths(self.target_prefix, tag['href'])[0]
                ext = os.path.splitext(path)[1]
                if ext: ext = ext[1:].lower()
                if os.access(path, os.R_OK) and os.path.isfile(path):
                    if ext in ['png', 'jpg', 'bmp', 'jpeg']:
                        self.process_image(path, tag_css)
                    else:
                        text = self.get_text(tag, limit=1000)
                        if not text.strip():
                            text = "Link"
                        self.add_text(text, tag_css, {}, force_span_use=True)
                        self.links.append(self.create_link(self.current_para.contents, tag))
                        if tag.has_key('id') or tag.has_key('name'):
                            key = 'name' if tag.has_key('name') else 'id'
                            self.targets[self.target_prefix+tag[key]] = self.current_block
                            self.current_block.must_append = True
                else:
                    self.logger.warn('Could not follow link to '+tag['href'])
                    self.process_children(tag, tag_css, tag_pseudo_css)
            elif tag.has_key('name') or tag.has_key('id'):
                self.process_anchor(tag, tag_css, tag_pseudo_css)
        elif tagname == 'img':
            if tag.has_key('src'):
                path = munge_paths(self.target_prefix, tag['src'])[0]
                if os.access(path, os.R_OK) and os.path.isfile(path):
                    width, height = None, None
                    try:
                        width = int(tag['width'])
                        height = int(tag['height'])
                    except:
                        pass
                    dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
                    self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
                elif not urlparse(tag['src'])[0]:
                    self.logger.warn('Could not find image: '+tag['src'])
            else:
                self.logger.debug("Failed to process: %s", str(tag))
        elif tagname in ['style', 'link']:
            ncss, npcss = {}, {}
            if tagname == 'style':
                for c in tag.contents:
                    if isinstance(c, NavigableString):
                        css, pcss = self.parse_css(str(c))
                        ncss.update(css)
                        npcss.update(pcss)
            elif tag.has_key('type') and tag['type'] == "text/css" \
                    and tag.has_key('href'):
                path = munge_paths(self.target_prefix, tag['href'])[0]
                try:
                    f = open(path, 'rb')
                    src = f.read()
                    f.close()
                    match = self.PAGE_BREAK_PAT.search(src)
                    if match and not re.match('avoid', match.group(1), re.IGNORECASE):
                        self.page_break_found = True
                    ncss, npcss = self.parse_css(src)
                except IOError:
                    self.logger.warn('Could not read stylesheet: '+tag['href'])
            if ncss:
                update_css(ncss, self.css)
                self.css.update(self.override_css)
            if npcss:
                update_css(npcss, self.pseudo_css)
                self.pseudo_css.update(self.override_pcss)
        elif tagname == 'pre':
            self.end_current_para()
            self.end_current_block()
            self.current_block = self.book.create_text_block()
            ts = self.current_block.textStyle.copy()
            self.current_block.textStyle = ts
            self.current_block.textStyle.attrs['parindent'] = '0'
            if tag.contents:
                c = tag.contents[0]
                if isinstance(c, NavigableString):
                    c = unicode(c).replace('\r\n', '\n').replace('\r', '\n')
                    if c.startswith('\n'):
                        c = c[1:]
                        tag.contents[0] = NavigableString(c)
                        tag.contents[0].setup(tag)
            self.process_children(tag, tag_css, tag_pseudo_css)
            self.end_current_block()
        elif tagname in ['ul', 'ol', 'dl']:
            self.list_level += 1
            if tagname == 'ol':
                old_counter = self.list_counter
                self.list_counter = 1
            prev_bs = self.current_block.blockStyle
            self.end_current_block()
            attrs = self.current_block.blockStyle.attrs
            attrs = attrs.copy()
            attrs['sidemargin'] = self.list_indent*self.list_level
            bs = self.book.create_block_style(**attrs)
            self.current_block = self.book.create_text_block(
                                        blockStyle=bs,
                                        textStyle=self.unindented_style)
            self.process_children(tag, tag_css, tag_pseudo_css)
            self.end_current_block()
            self.current_block.blockStyle = prev_bs
            self.list_level -= 1
            if tagname == 'ol':
                self.list_counter = old_counter
        elif tagname in ['li', 'dt', 'dd']:
            margin = self.list_indent*self.list_level
            if tagname == 'dd':
                margin += 80
            if int(self.current_block.blockStyle.attrs['sidemargin']) != margin:
                self.end_current_block()
                attrs = self.current_block.blockStyle.attrs
                attrs = attrs.copy()
                attrs['sidemargin'] = margin
                attrs['blockwidth'] = int(attrs['blockwidth']) + margin
                bs = self.book.create_block_style(**attrs)
                self.current_block = self.book.create_text_block(
                                        blockStyle=bs,
                                        textStyle=self.unindented_style)

            if self.current_para.has_text():
                self.line_break()
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            self.previous_text = '\n'
            if tagname == 'li':
                in_ol, parent = True, tag.parent
                while parent:
                    if parent.name and parent.name.lower() in ['ul', 'ol']:
                        in_ol = parent.name.lower() == 'ol'
                        break
                    parent = parent.parent
                prepend = str(self.list_counter)+'. ' if in_ol else u'\u2022' + ' '
                self.current_para.append(Span(prepend))
                self.process_children(tag, tag_css, tag_pseudo_css)
                if in_ol:
                    self.list_counter += 1
            else:
                self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'blockquote':
            self.current_para.append_to(self.current_block)
            self.current_block.append_to(self.current_page)
            pb = self.current_block
            self.current_para = Paragraph()
            ts = self.book.create_text_style()
            ts.attrs['parindent'] = 0
            try:
                index = self.text_styles.index(ts)
                ts = self.text_styles[index]
            except ValueError:
                self.text_styles.append(ts)
            bs = self.book.create_block_style()
            bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
            60, 20, 20
            try:
                index = self.block_styles.index(bs)
                bs = self.block_styles[index]
            except ValueError:
                self.block_styles.append(bs)
            self.current_block = self.book.create_text_block(
                                    blockStyle=bs, textStyle=ts)
            self.previous_text = '\n'
            self.preserve_block_style = True
            self.process_children(tag, tag_css, tag_pseudo_css)
            self.preserve_block_style = False
            self.current_para.append_to(self.current_block)
            self.current_block.append_to(self.current_page)
            self.current_para = Paragraph()
            self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
                                                             blockStyle=pb.blockStyle)
        elif tagname in ['sub', 'sup']:
            text = self.get_text(tag)
            elem = Sub if tagname == 'sub' else Sup
            self.current_para.append(elem(text))

        elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            new_block = self.process_block(tag, tag_css)

            if (self.anchor_ids and tag.has_key('id')) or \
               (self.book_designer and tag.has_key('class') and tag['class']=='title'):
                if not tag.has_key('id'):
                    tag['id'] = 'libprs500_id_'+str(self.id_counter)
                    self.id_counter += 1

                tkey = self.target_prefix+tag['id']
                if not new_block:
                    self.end_current_block()
                self.current_block.must_append = True
                self.targets[tkey] = self.current_block
                if (self.book_designer and tag.has_key('class') and tag['class']=='title'):
                    self.extra_toc_entries.append((self.get_text(tag, 100), self.current_block))

            src = self.get_text(tag, limit=1000)

            if not self.disable_chapter_detection and tagname.startswith('h'):
                if self.chapter_regex.search(src):
                    self.logger.debug('Detected chapter %s', src)
                    self.end_page()
                    self.page_break_found = True

            if self.current_para.has_text():
                self.current_para.append_to(self.current_block)
            self.current_para = Paragraph()

            self.previous_text = '\n'

            if not tag.contents:
                self.current_block.append(CR())
                return

            if self.current_block.contents:
                self.current_block.append(CR())

            self.process_children(tag, tag_css, tag_pseudo_css)

            if self.current_para.contents :
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            if tagname.startswith('h') or self.blank_after_para:
                self.current_block.append(CR())
        elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'font':
            if tag.has_key('face'):
                tag_css['font-family'] = tag['face']
            if tag.has_key('color'):
                tag_css['color'] = tag['color']
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname in ['br']:
            self.line_break()
            self.previous_text = '\n'
        elif tagname in ['hr', 'tr']: # tr needed for nested tables
            self.end_current_block()
            if tagname == 'hr':
                self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
            self.previous_text = '\n'
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'td': # Needed for nested tables
            if not self.in_table:
                self.current_para.append(' ')
                self.previous_text = ' '
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'table' and not self.ignore_tables and not self.in_table:
            tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
            try:
                self.process_table(tag, tag_css)
            except Exception, err:
                self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
                self.logger.debug('', exc_info=True)
                self.logger.debug('Bad table:\n%s', str(tag)[:300])
                self.in_table = False
                self.process_children(tag, tag_css, tag_pseudo_css)
            finally:
                if self.minimize_memory_usage:
                    tag.extract()
        else:
            self.process_children(tag, tag_css, tag_pseudo_css)
        if end_page:
                self.end_page()

    def process_table(self, tag, tag_css):
        '''
        Render a table as a sequence of canvases on the current page.

        Table.blocks yields either a falsy block, which starts a new canvas
        of height ypos plus the row padding, or a block that is placed on the
        most recently started canvas.

        Raises an Exception if a canvas would have to be taller than the
        text height of the page.
        '''
        self.end_current_block()
        self.current_block = self.book.create_text_block()
        rowpad = 10
        table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
        canvases = []
        ps = self.current_page.pageStyle.attrs
        layout = table.blocks(int(ps['textwidth']), int(ps['textheight']))
        for block, xpos, ypos, delta, targets in layout:
            if block:
                canvases[-1].put_object(block, xpos + int(delta/2.), ypos)
                continue
            if ypos > int(ps['textheight']):
                raise Exception('Table has cell that is too large')
            canvas = Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad,
                            blockrule='block-fixed')
            canvases.append(canvas)
            # Anchors reported for this row point at the new canvas
            for name in targets:
                self.targets[self.target_prefix+name] = canvases[-1]

        for canvas in canvases:
            self.current_page.append(canvas)
        self.end_current_block()


    def remove_unused_target_blocks(self):
        for block in self.unused_target_blocks:
            block.parent.contents.remove(block)
            block.parent = None

    def writeto(self, path, lrs=False):
        self.remove_unused_target_blocks()
        self.book.renderLrs(path) if lrs else self.book.renderLrf(path)

    def cleanup(self):
        for _file in self.scaled_images.values() + self.rotated_images.values():
            _file.__del__()

def _expand_header_format(fmt, title, author):
    """
    Expand the %t (title) and %a (author) placeholders in a header format.

    %%t and %%a are escapes that yield a literal %t / %a. The expansion is
    done in a single pass with a callable replacement, so the title and
    author are inserted verbatim: backslashes in them are not interpreted
    as regular expression escapes and placeholders that happen to occur
    inside them are not expanded a second time.
    """
    values = {'t': title, 'a': author}

    def expand(match):
        escaped, placeholder = match.groups()
        if escaped is not None:
            return '%' + escaped
        return values[placeholder]

    return re.sub(r'%%([at])|%([at])', expand, fmt)


def process_file(path, options, logger=None):
    """
    Convert the HTML file at `path` into an LRF (or LRS) book.

    `options` is the options object produced by the option parser. It is
    modified in place: metadata missing from it is filled in from an OPF
    file (see try_opf), the cover is replaced by a scaled temporary JPEG,
    and the regular expression options are replaced by compiled patterns.

    @param path: Path to the HTML file. URLs are rejected.
    @param options: Conversion options.
    @param logger: Logger to use. When None, the 'html2lrf' logger is set
                   up for command line use.
    @return: Absolute path of the file that was written.
    @raise ConversionError: If `path` is an http(s) URL or the cover file
                            cannot be read.
    """
    if re.match('http://|https://', path):
        raise ConversionError('You have to save the website %s as an html file first and then run html2lrf on it.'%(path,))
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('html2lrf')
        setup_cli_handlers(logger, level)
    path = os.path.abspath(path)
    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    dirpath = os.path.dirname(path)

    tpath = ''
    try_opf(path, options, logger)
    if options.cover:
        options.cover = os.path.expanduser(options.cover)
        if not os.path.isabs(options.cover):
            # Relative cover paths are resolved against the HTML file.
            options.cover = os.path.join(dirpath, options.cover)
        if not os.access(options.cover, os.R_OK):
            raise ConversionError('Cannot read from: %s'% (options.cover,))

        th = Device.THUMBNAIL_HEIGHT
        im = PILImage.open(options.cover)
        pwidth, pheight = options.profile.screen_width, \
                          options.profile.screen_height - options.profile.fudge
        width, height = im.size
        if width < pwidth:
            # Widen narrow covers to the screen width, keeping the aspect ratio.
            corrf = float(pwidth)/width
            width, height = pwidth, int(corrf*height)

        # NOTE(review): if fit_image reports scaled=False for the widened
        # size, the cover is saved at its original size -- confirm intended.
        scaled, width, height = fit_image(width, height, pwidth, pheight)
        cim = im.resize((width, height), PILImage.BICUBIC) if scaled else im
        # The cover is saved as JPEG, which cannot store palette or alpha
        # images (e.g. GIF/PNG covers), so always convert to RGB first.
        cim = cim.convert('RGB')
        cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg")
        cf.close()
        cim.save(cf.name)
        options.cover = cf.name

        tim = im.resize((int(0.75*th), th), PILImage.ANTIALIAS).convert('RGB')
        tf = PersistentTemporaryFile(prefix=__appname__+'_', suffix=".jpg")
        tf.close()
        tim.save(tf.name)
        tpath = tf.name

    if not options.title:
        options.title = default_title

    # Metadata given as byte strings is converted to unicode via BeautifulSoup.
    for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
        val = getattr(options, prop)
        if val and not isinstance(val, unicode):
            soup = BeautifulSoup(val)
            setattr(options, prop, unicode(soup))

    title = (options.title, options.title_sort)
    author = (options.author, options.author_sort)

    args = dict(font_delta=options.font_delta, title=title,
                author=author, sourceencoding='utf8',
                freetext=options.freetext, category=options.category,
                publisher=options.publisher,
                booksetting=BookSetting(dpi=10*options.profile.dpi,
                                        screenheight=options.profile.screen_height,
                                        screenwidth=options.profile.screen_width))
    if tpath:
        args['thumbnail'] = tpath
    header = None
    if options.header:
        header = Paragraph()
        fheader = _expand_header_format(options.headerformat, options.title,
                                        options.author)
        header.append(fheader + "  ")
    book, fonts = Book(options, logger, header=header, **args)

    # Replace the regular expression options by compiled patterns. Options
    # that were not given become '$'.
    le = re.compile(options.link_exclude) if options.link_exclude else \
         re.compile('$')
    pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
         re.compile('$')
    fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
         re.compile('$')
    options.force_page_break = fpb
    options.link_exclude = le
    options.page_break = pb
    options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
    # force_page_break_attr must consist of exactly three comma separated
    # fields; the first and last are compiled as patterns. Anything else is
    # replaced by the ['$', '', '$'] fallback.
    fpba = options.force_page_break_attr.split(',')
    if len(fpba) != 3:
        fpba = ['$', '', '$']
    options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                     re.compile(fpba[2], re.IGNORECASE)]
    if not hasattr(options, 'anchor_ids'):
        options.anchor_ids = True
    files = options.spine if options.use_spine else [path]
    conv = HTMLConverter(book, fonts, options, logger, files)
    try:
        if options.use_spine:
            conv.create_toc(options.toc)
        oname = options.output
        if not oname:
            # Default output: <input name>.lrf/.lrs in the working directory.
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
            oname = os.path.join(os.getcwd(), name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
        logger.info('Output written to %s', oname)
    finally:
        # Dispose of the converter's temporary images even when writing fails.
        conv.cleanup()
    return oname

def _find_opf_cover(opf, path, dirpath):
    """
    Return the path of a usable cover image described by `opf`, or None.

    The cover named in the OPF is used if PIL can open it. If the file is
    readable but PIL cannot open it, the names built from
    opf.possible_cover_prefixes() and the common image suffixes are tried
    in the directory of `path`.
    """
    cover = opf.cover
    if not cover:
        return None
    if not os.path.isabs(cover):
        cover = os.path.join(dirpath, cover)
    if not os.access(cover, os.R_OK):
        return None
    try:
        PILImage.open(cover)
        return cover
    except Exception:
        pass
    for prefix in opf.possible_cover_prefixes():
        for suffix in ('.jpg', '.jpeg', '.gif', '.png', '.bmp'):
            cpath = os.path.join(os.path.dirname(path), prefix+suffix)
            try:
                PILImage.open(cpath)
                return cpath
            except Exception:
                continue
    return None


def try_opf(path, options, logger):
    """
    Fill in conversion options from an OPF file, if one can be found.

    The OPF file is taken from options.opf when that attribute exists,
    otherwise the first *.opf file in the directory of `path` is used. If
    there is no OPF file this function does nothing.

    Title, author, author_sort, publisher and category are only set when
    they still have their unset/'Unknown' values. The cover is taken from
    the OPF when no cover was given or options.use_metadata_cover is set;
    if no usable cover is found the original value is kept. options.spine
    and options.toc are always set from the OPF.

    Errors while reading the metadata are logged via `logger` and do not
    propagate. Errors opening or parsing the OPF file itself do propagate.
    """
    if hasattr(options, 'opf'):
        opf = options.opf
    else:
        try:
            opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
        except IndexError:
            return
    if not opf:
        # The opf option exists but was not set.
        return
    dirpath = os.path.dirname(os.path.abspath(opf))
    stream = open(opf, 'rb')
    try:
        reader = OPFReader(stream, dirpath)
        try:
            title = reader.title
            if title and not options.title:
                options.title = title
            if options.author == 'Unknown':
                if reader.authors:
                    options.author = ', '.join(reader.authors)
                if reader.author_sort:
                    options.author_sort = reader.author_sort
            if options.publisher == 'Unknown':
                publisher = reader.publisher
                if publisher:
                    options.publisher = publisher
            if not options.category:
                category = reader.category
                if category:
                    options.category = category
            if not options.cover or options.use_metadata_cover:
                # options.cover is only overwritten once a usable cover has
                # been found, so a failure during the search cannot lose
                # the value supplied by the user.
                cover = _find_opf_cover(reader, path, dirpath)
                if cover:
                    options.cover = cover
            options.spine = [i.href for i in reader.spine.items()]
            options.toc   = reader.toc
        except Exception:
            logger.exception('Failed to process opf file')
    finally:
        stream.close()


def option_parser():
    """
    Return the command line option parser for html2lrf: the parser built by
    lrf_option_parser with an html2lrf specific usage message.
    """
    usage = 'Usage: %prog [options] mybook.html\n\n' \
            '%prog converts mybook.html to mybook.lrf'
    return lrf_option_parser(usage)

def main(args=sys.argv):
    try:
        parser = option_parser()
        options, args = parser.parse_args(args)
        if options.output:
            options.output = os.path.abspath(os.path.expanduser(options.output))
        if len(args) != 2:
            parser.print_help()
            return 1
        src = args[1]
        if options.verbose:
            import warnings
            warnings.defaultaction = 'error'
    except Exception, err:
        print >> sys.stderr, err
        return 1

    process_file(src, options)
    return 0


if __name__ == '__main__':
    # Run as a script: use the return value of main() as the exit status.
    exit_status = main()
    sys.exit(exit_status)