diff --git a/src/calibre/ebooks/conversion/config.py b/src/calibre/ebooks/conversion/config.py
index 91f46b984f..b7591d8f03 100644
--- a/src/calibre/ebooks/conversion/config.py
+++ b/src/calibre/ebooks/conversion/config.py
@@ -221,7 +221,7 @@ OPTIONS = {
 
         'fb2': ('no_inline_fb2_toc',),
 
-        'pdf': ('no_images', 'unwrap_factor'),
+        'pdf': ('no_images', 'unwrap_factor', 'new_pdf_engine', 'pdf_header_skip', 'pdf_footer_skip', 'pdf_header_regex', 'pdf_footer_regex'),
 
         'rtf': ('ignore_wmf',),
 
diff --git a/src/calibre/ebooks/conversion/plugins/pdf_input.py b/src/calibre/ebooks/conversion/plugins/pdf_input.py
index 42840b10db..d9b3dc494c 100644
--- a/src/calibre/ebooks/conversion/plugins/pdf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/pdf_input.py
@@ -11,7 +11,7 @@ from polyglot.builtins import as_bytes
 class PDFInput(InputFormatPlugin):
 
     name        = 'PDF Input'
-    author      = 'Kovid Goyal and John Schember'
+    author      = 'Kovid Goyal and John Schember and Alan Pettigrew'
     description = _('Convert PDF files to HTML')
     file_types  = {'pdf'}
     commit_name = 'pdf_input'
@@ -24,20 +24,27 @@ class PDFInput(InputFormatPlugin):
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
             'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
-            help=_('Use the new PDF conversion engine. Currently not operational.'))
+            help=_('Use the new, experimental, PDF conversion engine.')),
+        OptionRecommendation(name='pdf_header_skip', recommended_value=-1,
+            help=_('Skip everything to the specified number pixels at the top of a page.'
+            ' Negative numbers mean auto-detect and remove headers, zero means do not remove headers and positive numbers'
+            ' mean remove headers that appear above that many pixels from the top of the page. Works only'
+            ' with the new PDF engine.'
+        )),
+        OptionRecommendation(name='pdf_footer_skip', recommended_value=-1,
+            help=_('Skip everything to the specified number of pixels at the bottom of a page.'
+            ' Negative numbers mean auto-detect and remove footers, zero means do not remove footers and positive numbers'
+            ' mean remove footers that appear below that many pixels from the bottom of the page. Works only'
+            ' with the new PDF engine.'
+        )),
+        OptionRecommendation(name='pdf_header_regex', recommended_value='',
+            help=_('Regular expression to remove lines at the top of a page. '
+                   'This only looks at the first line of a page and works only with the new PDF engine.')),
+        OptionRecommendation(name='pdf_footer_regex', recommended_value='',
+            help=_('Regular expression to remove lines at the bottom of a page. '
+                   'This only looks at the last line of a page and works only with the new PDF engine.')),
     }
 
-    def convert_new(self, stream, accelerators):
-        from calibre.ebooks.pdf.pdftohtml import pdftohtml
-        from calibre.ebooks.pdf.reflow import PDFDocument
-        from calibre.utils.cleantext import clean_ascii_chars
-
-        pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
-        with open('index.xml', 'rb') as f:
-            xml = clean_ascii_chars(f.read())
-        PDFDocument(xml, self.opts, self.log)
-        return os.path.join(os.getcwd(), 'metadata.opf')
-
     def convert(self, stream, options, file_ext, log,
                 accelerators):
         from calibre.ebooks.metadata.opf2 import OPFCreator
@@ -47,8 +54,14 @@ class PDFInput(InputFormatPlugin):
         # The main html file will be named index.html
         self.opts, self.log = options, log
         if options.new_pdf_engine:
-            return self.convert_new(stream, accelerators)
-        pdftohtml(os.getcwd(), stream.name, options.no_images)
+            from calibre.ebooks.pdf.reflow import PDFDocument
+            from calibre.utils.cleantext import clean_ascii_chars
+            pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
+            with open(u'index.xml', 'rb') as f:
+                xml = clean_ascii_chars(f.read())
+            PDFDocument(xml, self.opts, self.log)
+        else:
+            pdftohtml(os.getcwd(), stream.name, options.no_images)
 
         from calibre.ebooks.metadata.meta import get_metadata
         log.debug('Retrieving document metadata...')
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index f66dd4dc00..bc34af6524 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -1,18 +1,66 @@
 #!/usr/bin/env python
 
-
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import numbers
 import os
+import re
 import sys
-from itertools import count
+from operator import attrgetter
 
 from lxml import etree
 
-from calibre.utils.xml_parse import safe_xml_fromstring
+# Global constants affecting formatting decisions
+
+#### Pages/lines
+# Fraction of a character width that two strings have to be apart,
+# for them to be considered part of the same text fragment
+# The problem is justified text where fragments can be widely spaced
+# Was 0.5 but this forces just about anything to coalesce.
+# It also means no columns will be found
+COALESCE_FACTOR = 20.0
+
+# Allow some dither of bottom of characters when checking if same line
+# Pixels from the PDF file
+BOTTOM_FACTOR = 2.0
+
+# Fraction of text height that two strings' bottoms can differ by
+# for them to be considered to be part of the same text fragment
+LINE_FACTOR = 0.2
+
+# Fraction of a line width that a line must exceed before
+# it can merge with the next
+# NOW IN  OPTIONS
+#LINE_SPLIT = 0.45
+
+# Fraction of the gap between lines to determine if setting the paragraph break
+# is likely to be valid.  Somewhere between 1 and 2, probably about 1.3
+PARA_FACTOR = 1.3
+
+# Multiplies the gap between lines to determine if this is a section break
+# not a paragraph break  Somewhere between 1 and 2, probably about 1.5
+SECTION_FACTOR = 1.4
+
+# Multiplies the average line height when determining row height
+# of a particular element to detect columns.
+YFUZZ = 1.5
+
+# Left (and other) margins can waver.
+# Plus or minus this
+LEFT_WAVER = 2.0
+
+# Amount left margin must be greater than right for text
+# to be considered right aligned. 1.8 = 180%
+RIGHT_FACTOR = 1.8
+
+# Percentage amount left and right margins can differ
+# and still be considered centered. 0.1 = 10%
+CENTER_FACTOR = 0.1
+
+#### Indents
+# How near must values be to appear the same
+SAME_SPACE = 3.0
 
 
 class Font:
@@ -20,10 +68,10 @@ class Font:
     def __init__(self, spec):
         self.id = spec.get('id')
         self.size = float(spec.get('size'))
+        self.size_em = 0.0
         self.color = spec.get('color')
         self.family = spec.get('family')
 
-
 class Element:
 
     def __init__(self):
@@ -36,6 +84,12 @@ class Element:
     def __hash__(self):
         return hash(self.id)
 
+class DocStats:
+
+    def __init__(self):
+        self.top = self.bottom = self.left = self.left2 = self.right \
+          = self.line_space = self.para_space = self.indent = self.indent2 = 0
+        self.font_size = 0
 
 class Image(Element):
 
@@ -43,15 +97,16 @@ class Image(Element):
         Element.__init__(self)
         self.opts, self.log = opts, log
         self.id = next(idc)
-        self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
-          map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth',
-              'iheight')))
+        self.top, self.left, self.width, self.height = \
+          map(float, map(img.get, ('top', 'left', 'width', 'height')))
         self.src = img.get('src')
         self.bottom = self.top + self.height
         self.right = self.left + self.width
+        # Check for alignment done later
+        self.align = 'L'
 
     def to_html(self):
-        return '<img src="%s" width="%dpx" height="%dpx"/>' % \
+        return '<img src="%s" alt="" width="%dpx" height="%dpx"/>' % \
                 (self.src, int(self.width), int(self.height))
 
     def dump(self, f):
@@ -66,14 +121,181 @@ class Text(Element):
         self.id = next(idc)
         self.opts, self.log = opts, log
         self.font_map = font_map
+        self.top, self.left, self.width, self.height = map(round, map(float, map(text.get,
+            ('top', 'left', 'width', 'height'))))
+        # This does nothing, as expected,
+        # but somewhere left (at least) is changed sometimes to not .0
+        if self.left != round(self.left) :
+            self.left = round(self.left)
+        self.bottom  = self.top + self.height
+        self.right = self.left + self.width
+        self.tag = 'p'    # Normal paragraph <p...>
+        self.indented = 0
+        # When joining lines for a paragraph, remember position of last line joined
+        self.last_left = self.left
+        # Remember the length of this line if it is merged into a paragraph
+        self.final_width = self.width
+        # Align = Left, Right, Center, Justified.  Default L
+        self.align = 'L'
+        # Should there be extra space between paragraphs?
+        self.blank_line = 0
+
+        if self.font_map:
+            self.font = self.font_map[text.get('font')]
+            self.font_size = self.font.size
+            self.font_size_em = self.font.size_em
+            self.color = self.font.color
+            self.font_family = self.font.family
+        else:
+            self.font = {}
+            self.font_size = 0.0
+            self.font_size_em = 0.0
+        #    self.color = 0
+
+        text.tail = ''
+        self.text_as_string = etree.tostring(text, method='text', encoding='unicode')
+        self.raw = text.text if text.text else ''
+        for x in text.iterchildren():
+            self.raw += etree.tostring(x, method='xml', encoding='unicode')
+        self.average_character_width = self.width/len(self.text_as_string)
+
+    def coalesce(self, other, page_number, left_margin):
+        if self.opts.verbose > 2:
+            self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
+                other.text_as_string, page_number))
+        # For elements of the same line, is there a space between?
+        # Spaces are narrow, so a_c_w/3
+        if (self.top <= other.top and self.bottom >= other.bottom) \
+          and abs(other.left - self.right) < self.average_character_width / 3.0:
+            has_gap = 0
+        else:	# Insert n spaces to fill gap.  Use TAB?
+            if other.left < self.right:
+                has_gap = 1	# Coalescing different lines
+            else:	# Multiple texts on same line
+                has_gap = round(abs(other.left - self.right) / self.average_character_width)
+
+        self.top = min(self.top, other.top)
+        self.left = min(self.left, other.left)
+        self.right = max(self.right, other.right)
+        self.width +=  other.width
+        self.final_width +=  other.final_width
+        self.bottom = max(self.bottom, other.bottom)
+        self.height = self.bottom - self.top
+        # Need to check for </span> <span... as well
+        if self.font_size_em == other.font_size_em \
+          and self.font.id == other.font.id \
+          and re.match('<span style="font-size:', self.raw) is not None \
+          and re.match('<span style="font-size:', other.raw) is not None :
+            # We have the same class, so merge
+            m_self = re.match('^(.+)</span>$', self.raw)
+            m_other = re.match('^<span style="font-size:.+em">(.+</span>)$', other.raw)
+            if m_self and m_other:
+                self.raw = m_self.group(1)
+                other.raw = m_other.group(1)
+        elif self.font_size_em != other.font_size_em \
+          and self.font_size_em != 1.00 :
+            if re.match('<span', self.raw) is None :
+                self.raw = '<span style="font-size:%sem">%s</span>'%(str(self.font_size_em),self.raw)
+            # Try to allow for a very large initial character
+            elif len(self.text_as_string) <= 2 \
+              and self.font_size_em >= other.font_size_em * 2.0 :
+                # Insert 'float: left' etc. into current font info
+                # Unfortunately, processing to generate the .epub file changes things.
+                # The line height gets set to the same as other parts of the file
+                # and the font size is reduced.
+                # These need to be fixed manually.
+                m_self = re.match('^(.+em">)(.+)$', self.raw)
+                self.raw = m_self.group(1) \
+                  + '<span style="float:left"><span style="line-height:0.5">' \
+                  + m_self.group(2) + '</span></span>'
+
+        self.font_size = max(self.font_size, other.font_size)
+        self.font_size_em = max(self.font_size_em, other.font_size_em)
+        self.font = other.font if self.font_size == other.font_size else other.font
+        if has_gap > 0:
+            if has_gap < 3:	# Small number of spaces = 1 space
+                if not (self.text_as_string.endswith(' ') \
+                     or self.text_as_string.endswith('-') \
+                     or other.text_as_string.startswith(' ') \
+                     or other.text_as_string.startswith('-') ):
+                    has_gap = 1
+                else:
+                    has_gap = 0
+            # Insert multiple spaces
+            while has_gap > 0:
+                self.text_as_string += ' '
+                self.raw += ' '
+                self.width += self.average_character_width
+                #self.final_width += self.average_character_width
+                has_gap -= 1
+
+        self.text_as_string += other.text_as_string
+        #self.width += other.width
+
+        # Try to merge href where there are 2 for the same place
+        # Beware multiple hrefs on the same line, but for different places
+        # e.g.
+        # self.raw = '<a href="index.html#2">T</a>'
+        # other.raw = '<span style="font-size:0.7em"><a href="index.html#2">ITLE</a></span>'
+        # becomes '<a href="index.html#2">T<span style="font-size:0.7em">ITLE</span></a>'
+        # Are there problems if self.raw does not end </a>?
+        # Note that the 2 parts could have different font sizes
+        matchObj = re.match(r'^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)</a>(</span>)*(\s*)$', self.raw)
+        if matchObj is not None :
+            otherObj = re.match('^([^<]*)(<span[^>]*>)*(<a href[^>]+>)(.*)(</a>)(</span>)*(.*)$', other.raw)
+            # There is another href, but is it for the same place?
+            if otherObj is not None  and  matchObj.group(3) == otherObj.group(3) :
+                m2 = matchObj.group(2)
+                if m2 is None:
+                    m2 = ''
+                m5 = matchObj.group(5)
+                if m5 is None:
+                    m5 = ''
+                o2 = otherObj.group(2)
+                if o2 is None:
+                    o2 = ''
+                o6 = otherObj.group(6)
+                if o6 is None:
+                    o6 = ''
+                # Remove the other <a...> stuff and put the </a> last
+                other.raw = otherObj.group(1)+o2+otherObj.group(4)+o6+otherObj.group(5)+otherObj.group(7)
+                # Move the <span... after the <a... and remove the </a>
+                self.raw = matchObj.group(1)+matchObj.group(3)+m2+matchObj.group(4)+m5+matchObj.group(6)
+        #
+        self.raw += other.raw
+        self.average_character_width = self.width/len(self.text_as_string)
+        #self.last_left = other.left
+
+    def to_html(self):
+        #matchObj = re.match(u'^(.*)(<a href[^>]+>)(.*)</a>(.*)$', self.raw)
+        #if
+        return self.raw
+
+    def dump(self, f):
+        f.write('T top={}, left={}, width={}, height={}: '.format(self.top, self.left, self.width, self.height))
+        f.write(self.to_html().encode('utf-8'))
+        f.write('\n')
+
+class Paragraph(Text):
+
+    def __init__(self, text, font_map, opts, log, idc):
+        Text.__init__(self)
+        self.id = next(idc)
+        self.opts, self.log = opts, log
+        self.font_map = font_map
         self.top, self.left, self.width, self.height = map(float, map(text.get,
             ('top', 'left', 'width', 'height')))
         self.bottom  = self.top + self.height
         self.right = self.left + self.width
-        self.font = self.font_map[text.get('font')]
-        self.font_size = self.font.size
-        self.color = self.font.color
-        self.font_family = self.font.family
+        if self.font_map:
+            self.font = self.font_map[text.get('font')]
+            self.font_size = self.font.size
+            self.color = self.font.color
+            self.font_family = self.font.family
+        else:
+            self.font = {}
+            self.font_size = 0
+        #    self.color = 0
 
         text.tail = ''
         self.text_as_string = etree.tostring(text, method='text',
@@ -83,30 +305,14 @@ class Text(Element):
             self.raw += etree.tostring(x, method='xml', encoding='unicode')
         self.average_character_width = self.width/len(self.text_as_string)
 
-    def coalesce(self, other, page_number):
-        if self.opts.verbose > 2:
-            self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
-                other.text_as_string, page_number))
-        self.top = min(self.top, other.top)
-        self.right = other.right
-        self.width = self.right - self.left
-        self.bottom = max(self.bottom, other.bottom)
-        self.height = self.bottom - self.top
-        self.font_size = max(self.font_size, other.font_size)
-        self.font = other.font if self.font_size == other.font_size else other.font
-        self.text_as_string += other.text_as_string
-        self.raw += other.raw
-        self.average_character_width = (self.average_character_width +
-                other.average_character_width)/2.0
-
     def to_html(self):
         return self.raw
 
     def dump(self, f):
+        f.write('P top={}, left={}, width={}, height={}: '.format(self.top, self.left, self.width, self.height))
         f.write(self.to_html().encode('utf-8'))
         f.write('\n')
 
-
 class FontSizeStats(dict):
 
     def __init__(self, stats):
@@ -118,7 +324,6 @@ class FontSizeStats(dict):
                 self.most_common_size, self.chars_at_most_common_size = sz, chars
             self[sz] = chars/total
 
-
 class Interval:
 
     def __init__(self, left, right):
@@ -144,13 +349,13 @@ class Interval:
     def __hash__(self):
         return hash('(%f,%f)'%self.left, self.right)
 
-
 class Column:
 
-    # A column contains an element is the element bulges out to
+    # A column contains an element if the element bulges out to
     # the left or the right by at most HFUZZ*col width.
     HFUZZ = 0.2
 
+
     def __init__(self):
         self.left = self.right = self.top = self.bottom = 0
         self.width = self.height = 0
@@ -170,10 +375,10 @@ class Column:
         self._post_add()
 
     def _post_add(self):
-        self.elements.sort(key=lambda x: x.bottom)
+        self.elements.sort(key=attrgetter('bottom'))
         self.top = self.elements[0].top
         self.bottom = self.elements[-1].bottom
-        self.left, self.right = sys.maxsize, 0
+        self.left, self.right = sys.maxint, 0
         for x in self:
             self.left = min(self.left, x.left)
             self.right = max(self.right, x.right)
@@ -198,7 +403,7 @@ class Column:
             left_margin = elem.left - self.left
             elem.indent_fraction = left_margin/self.width
             elem.width_fraction = elem.width/self.width
-            if i == 0:
+            if i == 0  or  self.average_line_separation == 0:
                 elem.top_gap_ratio = None
             else:
                 elem.top_gap_ratio = (self.elements[i-1].bottom -
@@ -223,14 +428,13 @@ class Box(list):
     def to_html(self):
         ans = ['<%s>'%self.tag]
         for elem in self:
-            if isinstance(elem, numbers.Integral):
+            if isinstance(elem, int):
                 ans.append('<a name="page_%d"/>'%elem)
             else:
                 ans.append(elem.to_html()+' ')
         ans.append('</%s>'%self.tag)
         return ans
 
-
 class ImageBox(Box):
 
     def __init__(self, img):
@@ -243,7 +447,7 @@ class ImageBox(Box):
         if len(self) > 0:
             ans.append('<br/>')
             for elem in self:
-                if isinstance(elem, numbers.Integral):
+                if isinstance(elem, int):
                     ans.append('<a name="page_%d"/>'%elem)
                 else:
                     ans.append(elem.to_html()+' ')
@@ -260,7 +464,7 @@ class Region:
 
     def add(self, columns):
         if not self.columns:
-            for x in sorted(columns, key=lambda x: x.left):
+            for x in sorted(columns, key=attrgetter('left')):
                 self.columns.append(x)
         else:
             for i in range(len(columns)):
@@ -323,11 +527,12 @@ class Region:
                             idx)
                 col.add(elem)
 
+
     def collect_stats(self):
         for column in self.columns:
             column.collect_stats()
-        self.average_line_separation = sum(x.average_line_separation for x in
-            self.columns)/float(len(self.columns))
+        self.average_line_separation = sum([x.average_line_separation for x in
+            self.columns])/float(len(self.columns))
 
     def __iter__(self):
         yield from self.columns
@@ -407,39 +612,104 @@ class Region:
                 self.boxes[-1].append(elem)
 
 
+
 class Page:
 
-    # Fraction of a character width that two strings have to be apart,
-    # for them to be considered part of the same text fragment
-    COALESCE_FACTOR = 0.5
-
-    # Fraction of text height that two strings' bottoms can differ by
-    # for them to be considered to be part of the same text fragment
-    LINE_FACTOR = 0.4
-
-    # Multiplies the average line height when determining row height
-    # of a particular element to detect columns.
-    YFUZZ = 1.5
-
     def __init__(self, page, font_map, opts, log, idc):
+        def text_cmp(frst, secnd):
+            # Compare 2 text objects.
+            # Order by line (top/bottom) then left
+            if (frst.top <= secnd.top and frst.bottom >= secnd.bottom) \
+              or (secnd.top <= frst.top and secnd.bottom >= frst.bottom) :
+                # Overlap = same line
+                if frst.left < secnd.left :
+                    return -1
+                elif frst.left == secnd.left :
+                    return 0
+                return 1
+            # Different line so sort into line number
+            if frst.bottom < secnd.bottom :
+                return -1
+            elif frst.bottom == secnd.bottom :
+                return 0
+            return 1
+
+        # The sort comparison caller
+        from functools import cmp_to_key
+
         self.opts, self.log = opts, log
         self.font_map = font_map
         self.number = int(page.get('number'))
-        self.width, self.height = map(float, map(page.get,
-            ('width', 'height')))
+        self.width, self.height = map(float, map(page.get, ('width', 'height')))
         self.id = 'page%d'%self.number
 
         self.texts = []
-        self.left_margin, self.right_margin = self.width, 0
+        self.imgs = []
+        # Set margins to values that will get adjusted
+        self.left_margin = self.width
+        self.right_margin = 0
+        # For whether page number has been put in <a>
+        self.id_used = 0
 
         for text in page.xpath('descendant::text'):
             self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
             text = self.texts[-1]
-            self.left_margin = min(text.left, self.left_margin)
-            self.right_margin = max(text.right, self.right_margin)
+            # Allow for '  <i|em|b|strong|span>  ...'
+            matchObj = re.match(r'^(\s*)(<[^>]+>)?(\s*)(.*)$', text.raw)
+            s1 = matchObj.group(1)
+            s2 = matchObj.group(3)
+            t1 = matchObj.group(2) or ''
+            t2 = matchObj.group(4) or ''
+            tx = t1 + t2
+            # Remove lines of only spaces.  Could be <i> </i> etc., but process later
+            # Need to keep any href= (and others?)
+            if len(tx) == 0:
+              #and re.match(r'href=', text.raw) is None:
+                self.texts.remove(text)
+            elif (self.opts.pdf_header_skip <= 0 or text.top >= self.opts.pdf_header_skip) \
+              and (self.opts.pdf_footer_skip <= 0 or text.top <= self.opts.pdf_footer_skip):
+                # Remove leading spaces and make into indent
+                # Assume 1 space = 1 av_char_width?
+                s = len(s1) + len(s2)
+                if s > 1:	# Allow single leading space
+                    w = round(s * text.average_character_width/2.0)	# Spaces < avg width
+                    text.raw = tx
+                    text.text_as_string = text.text_as_string[s:]
+                    text.left += w	# Add indent
+                    text.last_left += w
+                    text.width -= w	# Reduce width
+                    text.final_width -= w
+                    #text.indented gets set later
+                self.left_margin = min(text.left, self.left_margin)
+                self.right_margin = max(text.right, self.right_margin)
+                # Change #nnn to page_nnn in hrefs
+                matchObj = re.match(r'^(.*)(<a href)(.+)("index.html#)(\d+)(".+)$', text.raw)
+                if matchObj is not None:
+                    text.raw = matchObj.group(1)+matchObj.group(2)+matchObj.group(3)+matchObj.group(4) \
+                        +'page_'+matchObj.group(5)+matchObj.group(6)
+
+            else:
+                # Not within text boundaries
+                self.texts.remove(text)
+
+        # Find any image occurances if requested
+        # These can be interspersed with text
+        if not self.opts.no_images:
+            for img in page.xpath('descendant::image'):
+                self.imgs.append(Image(img, self.opts, self.log, idc))
+                #matchObj = re.match(u'^(.*)("/>)$', self.imgs[-1])
+                #if matchObj is not None:
+                #    self.imgs[-1] = matchObj.group(1)+u'" alt=""/>'
 
         self.textwidth = self.right_margin - self.left_margin
 
+        # Sort into page order.  bottom then left
+        # NB. This is only approximate as different sized characters
+        #     can mean sections of a line vary in top or bottom.
+        #     bottom is less varied than top, but is not guaranteed.
+        # Multi-line characters make things even more interesting.
+        self.texts.sort(key=cmp_to_key(text_cmp))
+
         self.font_size_stats = {}
         self.average_text_height = 0
         for t in self.texts:
@@ -452,23 +722,26 @@ class Page:
 
         self.font_size_stats = FontSizeStats(self.font_size_stats)
 
-        self.coalesce_fragments()
-
-        self.elements = list(self.texts)
-        for img in page.xpath('descendant::img'):
-            self.elements.append(Image(img, self.opts, self.log, idc))
-        self.elements.sort(key=lambda x: x.top)
-
-    def coalesce_fragments(self):
+    def join_fragments(self, opts):
+        # Join fragments on a line
+        # Do some basic checks on structure
 
         def find_match(frag):
             for t in self.texts:
-                hdelta = t.left - frag.right
-                hoverlap = self.COALESCE_FACTOR * frag.average_character_width
-                if t is not frag and hdelta > -hoverlap and \
-                    hdelta < hoverlap and \
-                    abs(t.bottom - frag.bottom) < self.LINE_FACTOR*frag.height:
-                    return t
+                #  and abs(t.bottom - frag.bottom) <= BOTTOM_FACTOR :
+                if t is not frag :
+                    if (t.top <= frag.top and t.bottom >= frag.bottom) \
+                      or (t.top >= frag.top and t.bottom <= frag.bottom):
+                        return t	# Force match if same line
+                        # Sorting can put parts of a line in the wrong order if there are small chars
+                        if t.left < frag.left:
+                            hdelta = frag.left - t.right
+                        else:
+                            hdelta = t.left - frag.right
+                        hoverlap = COALESCE_FACTOR * frag.average_character_width
+                        if hdelta > -hoverlap and \
+                          hdelta < hoverlap :
+                            return t
 
         match_found = True
         while match_found:
@@ -477,12 +750,253 @@ class Page:
                 match = find_match(frag)
                 if match is not None:
                     match_found = True
-                    frag.coalesce(match, self.number)
+                    # Because texts are sorted top, left we can get small chars on the same line
+                    # appearing after larger ones, even though they are further left
+                    if frag.left > match.left:
+                        x = frag
+                        frag = match
+                        match = x
+                    frag.coalesce(match, self.number, self.left_margin)
                     break
             if match is not None:
                 self.texts.remove(match)
 
-    def first_pass(self):
+    def check_centered(self, stats):
+        # Check for centered text
+        # Also check for right aligned, and basic chapter structure
+
+        # If there are different left/indents, need to adjust for this page
+        # Is the text offset from the left?
+        # The centering check would fail where all lines on a page are centered
+        # so use stats_left and stats_indent
+        first = True
+        # Assume not Contents
+        self.contents = False
+        m = len(self.texts)
+        #for t in self.texts:
+        for i in range(m):
+            t = self.texts[i]
+            lmargin = t.left
+            rmargin = self.width - t.right
+            # Do we have a sequence of indented lines?
+            xmargin = ymargin = -1
+            if i > 0:
+                xmargin = self.texts[i-1].left
+            if i < m-1:
+                ymargin = self.texts[i+1].left
+
+            # Don't want to set headings on a Contents page
+            # NB Doesn't work where Contents goes to another page
+            if re.match(r'(?i)^\s*(table of )?contents\s*$', t.text_as_string) is not None:
+                self.contents = True
+                t.tag = 'h2'	# It won't get set later
+            # Centered if left and right margins are within FACTOR%
+            if lmargin > self.stats_left \
+              and lmargin != self.stats_indent \
+              and lmargin != xmargin \
+              and lmargin != ymargin \
+              and lmargin >= rmargin - rmargin*CENTER_FACTOR \
+              and lmargin <= rmargin + rmargin*CENTER_FACTOR:
+               #and t.left + t.width + t.left >= self.width + l_offset - t.average_character_width \
+               #and t.left + t.width + t.left <= self.width + l_offset + t.average_character_width:
+                t.align = 'C'
+            # Right aligned if left > FACTOR% of right
+            elif lmargin > self.stats_indent \
+              and lmargin > rmargin*RIGHT_FACTOR:
+              #and t.right >= self.width - t.average_character_width:
+                # What about right-aligned but indented on right?
+                # What about indented rather than right-aligned?
+                t.align = 'R'
+            if not self.contents:
+              # We can get <a href=...Chapter...  Should this check be done?
+              #if 'href=' not in t.raw:
+                # Check for Roman numerals as the only thing on a line
+                if re.match(r'^\s*[iIxXvV]+\s*$', t.text_as_string) is not None:
+                    t.tag = 'h3'
+                # Check for centered digits only
+                elif first and t.align == 'C' and re.match(r'^\s*\d+\s*$', t.text_as_string) is not None:
+                    t.tag = 'h2'
+                elif re.match(r'(?i)^\s*part\s[A-Za-z0-9]+$', t.text_as_string) is not None:
+                    t.tag = 'h1'
+                # Check for 'Chapter' or a centered word at the top of the page
+                # Some PDFs have chapter starts within the page so this check often fails
+                elif re.match(r'(?i)^\s*chapter\s', t.text_as_string) is not None \
+                  or re.match(r'(?i)^\s*prologue|epilogue\s*$', t.text_as_string) is not None \
+                  or (first and t.align == 'C' and re.match(r'(?i)^\s*[a-z -]+\s*$', t.text_as_string) is not None) \
+                  or (first and re.match(r'^\s*[A-Z -]+\s*$', t.text_as_string) is not None):
+                    t.tag = 'h2'
+            first = False
+
+        # Now check image alignment
+        for i in self.imgs:
+            lmargin = i.left
+            rmargin = self.width - i.right
+            if lmargin > self.stats_left \
+              and lmargin != self.stats_indent \
+              and lmargin >= rmargin - rmargin*CENTER_FACTOR \
+              and lmargin <= rmargin + rmargin*CENTER_FACTOR:
+                i.align = 'C'
+
+    def can_merge(self, first_text, second_text, stats):
+        # Can two lines be merged into one paragraph?
+        # Some PDFs have a wandering left margin which is consistent on a page
+        # but not within the whole document.  Hence use self.stats_left
+        # Try to avoid close double quote at end of one and open double quote at start of next
+        #
+        # "float:left" occurs where there is a multi-line character, so indentation is messed up
+        if ((second_text.left < self.stats_left + second_text.average_character_width \
+            and (second_text.left == first_text.last_left \
+             or (second_text.left < first_text.last_left \
+              and (first_text.indented > 0 or '"float:left"' in first_text.raw)))) \
+           or (second_text.left == first_text.last_left \
+            and second_text.left >= self.stats_indent) \
+           or (second_text.left >= first_text.last_left \
+            and second_text.bottom <= first_text.bottom)) \
+          and 'href=' not in second_text.raw \
+          and first_text.bottom + stats.line_space + (stats.line_space*LINE_FACTOR) \
+                >= second_text.bottom \
+          and first_text.final_width > self.width*self.opts.unwrap_factor \
+          and not (first_text.text_as_string[-1] == '\u0022' and second_text.text_as_string[0] == '\u0022') \
+          and not (first_text.text_as_string[-1] == '\u2019' and second_text.text_as_string[0] == '\u2018') \
+          and not (first_text.text_as_string[-1] == '\u201d' and second_text.text_as_string[0] == '\u201c'):
+            # This has checked for single quotes (9...6), double quotes (99...66), and "..."
+            # at end of 1 line then start of next as a check for Don't merge
+            return True
+        return False
+
+    def remove_head_foot(self, opts):
+
+        # Remove headers or footers from a page
+        # if there is a regex supplied
+        if len(opts.pdf_header_regex) > 0 \
+          and len(self.texts) > 0:
+            # Remove the first line if it matches
+            if re.match(opts.pdf_header_regex, self.texts[0].text_as_string) is not None :
+                self.texts.remove(self.texts[0])
+
+
+        if len(opts.pdf_footer_regex) > 0 \
+          and len(self.texts) > 0:
+            # Remove the last line if it matches
+            if re.match(opts.pdf_footer_regex, self.texts[-1].text_as_string) is not None :
+                self.texts.remove(self.texts[-1])
+
+    def coalesce_paras(self, stats):
+
+        # Loop through texts elements and coalesce if same lmargin
+        # and no large gap between lines
+        # Have to restart loop if an entry is removed
+        # Doesn't work well with things like Contents list, hence check href
+        match_found = True
+        last_frag = None
+        scanning = False
+        while match_found:
+            match_found, match = False, None
+            # Same left margin probably means coalesce
+            for frag in self.texts:
+                # Remove lines of only spaces
+                if re.match(r'^\s+$', frag.raw) is not None:
+                    match = frag
+                    break
+                # Loop till we get to the last processed entry
+                if scanning \
+                  and last_frag is not None \
+                  and frag != last_frag:
+                    continue
+                scanning = False
+                if last_frag is not None \
+                  and frag != last_frag \
+                  and self.can_merge(last_frag, frag, stats):
+                    last_frag.coalesce(frag, self.number, self.left_margin)
+                    last_frag.last_left = frag.left
+                    last_frag.final_width = frag.final_width
+                    match = frag
+                    scanning = True  # Restart loop and skip to this entry
+                    break
+                else:
+                    # Check for start of a paragraph being indented
+                    # Ought to have some way of setting a standard indent
+                    if frag.tag == 'p':
+                        if frag.indented == 0 \
+                          and frag.align != 'C' \
+                          and frag.left > self.stats_left + frag.average_character_width:
+                            #frag.indented = int((frag.left - self.stats_left) / frag.average_character_width)
+                            # Is it approx self.stats_indent?
+                            si = self.stats_indent * 0.15	# 15%
+                            if frag.left >= self.stats_indent-si \
+                              and frag.left <= self.stats_indent+si:
+                                frag.indented = 1
+                            else:
+                                frag.indented = 2
+                        if last_frag is not None \
+                          and frag.bottom - last_frag.bottom \
+                              > stats.para_space*SECTION_FACTOR:
+                          #and frag.top - last_frag.bottom > frag.height + stats.line_space + (stats.line_space*LINE_FACTOR):
+                            frag.blank_line = 1
+                last_frag = frag
+            if match is not None:
+                match_found = True
+                self.texts.remove(match)
+
+    def create_page_format(self, stats, opts):
+        # Join fragments into lines
+        # then remove any headers/footers/unwanted areas
+
+        self.update_font_sizes(stats)
+
+        # Join fragments on a line
+        self.join_fragments(opts)
+
+        # Remove headers or footers
+        self.remove_head_foot(opts)
+
+    def find_margins(self, tops, indents, line_spaces, bottoms, rights):
+
+        #from collections import Counter
+
+        # Should check for left margin and indent for this page
+        # Find the most used top, left margins, and gaps between lines
+        # The most used font will be treated as size 1em
+        max_bot = 0
+        max_right = 0
+        last_bottom = 0
+        first = True
+        for text in self.texts:
+            top = text.top
+            left = text.left
+            if round(left) != left :
+                text.left = left = round(left)
+            right = text.right
+            if round(right) != right :
+                text.right = right = round(right)
+            if first:
+                tops[top] = tops.get(top, 0) + 1
+                first = False
+            else:
+                # Some docs have bottom > top although there is a space
+                space = abs(top - last_bottom)
+                # Beware of multiple text on same line. These look like small spacing
+                if text.height <= space:
+                    line_spaces[space] = line_spaces.get(space, 0) + 1
+
+            last_bottom = text.bottom
+            if last_bottom > max_bot:
+                max_bot = last_bottom
+
+            last_right = text.right
+            if last_right > max_right:
+                max_right = last_right
+
+            indents[left] = indents.get(left, 0) + 1
+
+        if max_bot > 0:
+            bottoms[max_bot] = bottoms.get(max_bot, 0) + 1
+        if max_right > 0:
+            rights[max_right] = rights.get(max_right, 0) + 1
+
+        return
+        #########################
+        #### NOT IMPLEMENTED ####
         'Sort page into regions and columns'
         self.regions = []
         if not self.elements:
@@ -510,7 +1024,8 @@ class Page:
             self.dump_regions('pre-coalesce')
 
         self.coalesce_regions()
-        self.dump_regions('post-coalesce')
+        if self.opts.verbose > 2:
+            self.dump_regions('post-coalesce')
 
     def dump_regions(self, fname):
         fname = 'regions-'+fname+'.txt'
@@ -561,16 +1076,16 @@ class Page:
                         absorb_into = prev_region
                         if self.regions[next_region].line_count >= \
                                 self.regions[prev_region].line_count:
-                            avg_column_count = sum(len(r.columns) for r in
-                                regions)/float(len(regions))
+                            avg_column_count = sum([len(r.columns) for r in
+                                regions])/float(len(regions))
                             if self.regions[next_region].line_count > \
                                     self.regions[prev_region].line_count \
                                or abs(avg_column_count -
                                        len(self.regions[prev_region].columns)) \
                                > abs(avg_column_count -
                                        len(self.regions[next_region].columns)):
-                                absorb_into = next_region
-                                absorb_at = 'top'
+                                   absorb_into = next_region
+                                   absorb_at = 'top'
                     if absorb_into is not None:
                         self.regions[absorb_into].absorb_regions(regions, absorb_at)
                         absorbed.update(regions)
@@ -579,7 +1094,7 @@ class Page:
 
     def sort_into_columns(self, elem, neighbors):
         neighbors.add(elem)
-        neighbors = sorted(neighbors, key=lambda x: x.left)
+        neighbors = sorted(neighbors, key=attrgetter('left'))
         if self.opts.verbose > 3:
             self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
         columns = [Column()]
@@ -594,12 +1109,12 @@ class Page:
             if not added:
                 columns.append(Column())
                 columns[-1].add(x)
-                columns.sort(key=lambda x: x.left)
+                columns.sort(key=attrgetter('left'))
         return columns
 
     def find_elements_in_row_of(self, x):
         interval = Interval(x.top,
-                x.top + self.YFUZZ*(self.average_text_height))
+                x.top + YFUZZ*(self.average_text_height))
         h_interval = Interval(x.left, x.right)
         for y in self.elements[x.idx:x.idx+15]:
             if y is not x:
@@ -610,43 +1125,172 @@ class Page:
                     x_interval.intersection(h_interval).width <= 0:
                     yield y
 
-    def second_pass(self):
+    def update_font_sizes(self, stats):
+        # Font sizes start as pixels/points, but em is more useful
+        for text in self.texts:
+            text.font_size_em = self.font_map[text.font.id].size_em
+            if text.font_size_em != 0.00 and text.font_size_em != 1.00:
+                text.raw = '<span style="font-size:%sem">%s</span>'%(str(text.font_size_em),text.raw)
+
+    def second_pass(self, stats, opts):
+
+        # If there are alternating pages, pick the left and indent for this one
+        if stats.left2 > 0 and self.left_margin == stats.left2:
+            self.stats_left = stats.left2
+            self.stats_indent = stats.indent2
+        else:
+            self.stats_left = stats.left
+            self.stats_indent = stats.indent
+
+        # Join lines to form paragraphs
+        self.coalesce_paras(stats)
+
+        self.check_centered(stats)
+
+        #self.elements = list(self.texts)
+        #for img in page.xpath('descendant::img'):
+        #    self.elements.append(Image(img, self.opts, self.log, idc))
+        #self.elements.sort(cmp=lambda x,y:cmp(x.top, y.top))
+
+        return
         'Locate paragraph boundaries in each column'
         for region in self.regions:
             region.collect_stats()
             region.linearize()
 
 
+    def to_html(self):
+        # If ans.append is used, newlines are inserted between each element
+        ans = []
+        #ans.append('<p><a id="%d"></a></p>'%self.number)
+        iind = 0
+        itop = 0
+        ilen = len(self.imgs)
+        for text in self.texts:
+            if iind < ilen:
+                itop = self.imgs[iind].top
+            else:
+                itop = 999999
+            if itop <= text.top:
+                ans.append('<p')
+                if self.imgs[iind].align == 'C':
+                    ans[-1] += ' style="text-align:center"'
+                if self.id_used == 0:
+                    self.id_used = 1
+                    ans[-1] += ' id="page_%d"'%self.number
+                ans[-1] += '>'
+                ans[-1] += self.imgs[iind].to_html()
+                ans[-1] += '</p>'
+                iind += 1
+            if text.blank_line > 0:
+                ans.append('<p style="text-align:center">&#160;</p>')
+            ans.append('<%s'%text.tag)
+            # Should be only for Headings, but there is no guarantee that the heading will be recognised
+            # So put in an ID once per page in case the Contents references it
+            #   and  text.tag[0] == 'h'
+            if self.id_used == 0:
+                self.id_used = 1
+                ans[-1] += ' id="page_%d"'%self.number
+            if text.align == 'C':
+                ans[-1] += ' style="text-align:center"'
+            elif text.align == 'R':
+                ans[-1] += ' style="text-align:right"'
+            elif text.indented > 0:
+                ans[-1] += ' style="text-indent:'
+                ans[-1] += str(text.indented)
+                #ans[-1] += '1'
+                ans[-1] += 'em"'
+            ans[-1] += '>'
+            #ans.append(text.to_html()+' ')
+            #ans.append('</%s>'%text.tag)
+            ans[-1] += text.to_html()
+            ans[-1] += '</%s>'%text.tag
+        # Any remaining images
+        while iind < ilen:
+            ans.append('<p')
+            if self.imgs[iind].align == 'C':
+                ans[-1] += ' style="text-align:center"'
+            if self.id_used == 0:
+                self.id_used = 1
+                ans[-1] += ' id="page_%d"'%self.number
+            ans[-1] += '>'
+            ans[-1] += self.imgs[iind].to_html()
+            ans[-1] += '</p>'
+            iind += 1
+
+        return ans
+
 class PDFDocument:
 
     def __init__(self, xml, opts, log):
+        #from calibre.rpdb import set_trace;  set_trace()
+
         self.opts, self.log = opts, log
-        self.root = safe_xml_fromstring(xml)
-        idc = count()
+
+        # Check for a testable value
+        if self.opts.pdf_header_regex is None:
+            self.opts.pdf_header_regex = ''	# Do nothing
+        if self.opts.pdf_footer_regex is None:
+            self.opts.pdf_footer_regex = ''	# Do nothing
+
+        parser = etree.XMLParser(recover=True)
+        self.root = etree.fromstring(xml, parser=parser)
+        idc = iter(range(sys.maxsize))
+        self.stats = DocStats()
 
         self.fonts = []
+
         self.font_map = {}
 
-        for spec in self.root.xpath('//font'):
+        for spec in self.root.xpath('//fontspec'):
             self.fonts.append(Font(spec))
             self.font_map[self.fonts[-1].id] = self.fonts[-1]
 
         self.pages = []
-        self.page_map = {}
+        #self.page_map = {}
 
         for page in self.root.xpath('//page'):
             page = Page(page, self.font_map, opts, log, idc)
-            self.page_map[page.id] = page
+            #self.page_map[page.id] = page
             self.pages.append(page)
 
+        self.tops = {}
+        self.indents = {}
+        self.line_spaces = {}
+        self.bottoms = {}
+        self.rights = {}
+        self.font_sizes = {}
+
         self.collect_font_statistics()
 
+        # Create lines for pages and remove headers/footers etc.
         for page in self.pages:
             page.document_font_stats = self.font_size_stats
-            page.first_pass()
-            page.second_pass()
+            page.create_page_format(self.stats, self.opts)
 
-        self.linearize()
+        # Need to work out the header/footer automatically if opt < 0
+        if self.opts.pdf_header_skip < 0 or self.opts.pdf_footer_skip < 0:
+            self.find_header_footer()
+
+        # Remove any header/footer
+        if self.opts.pdf_header_skip > 0 or self.opts.pdf_footer_skip > 0:
+            self.remove_header_footer()
+
+        # Work out document dimensions from page format
+        for page in self.pages:
+            page.find_margins(self.tops, self.indents, self.line_spaces, \
+                            self.bottoms, self.rights)
+
+        self.setup_stats()
+
+        # Joins lines etc. into paragraphs
+        for page in self.pages:
+            page.second_pass(self.stats, self.opts)
+
+        # Join paragraphs across page boundaries
+        self.merge_pages(self.stats.left)
+
+        #self.linearize()
         self.render()
 
     def collect_font_statistics(self):
@@ -658,8 +1302,386 @@ class PDFDocument:
                     self.font_size_stats[sz] = 0
                 self.font_size_stats[sz] += chars
 
+            for text in p.texts:
+                font = int(text.font.id)
+                self.font_sizes[font] = self.font_sizes.get(font, 0) + 1
+
         self.font_size_stats = FontSizeStats(self.font_size_stats)
 
+        # Find most popular font so that will be treated as 1em
+        fcount = f_ind = 0
+        for f in self.font_sizes:
+            if fcount < self.font_sizes[f]:
+                fcount = self.font_sizes[f]
+                f_ind = f
+
+        if len(self.fonts) > 0:
+            self.stats.font_size = self.fonts[f_ind].size
+        else:
+            self.stats.font_size = 12.0
+
+        for f in self.fonts:
+            f.size_em = round(f.size / self.stats.font_size, 2)
+
+    def setup_stats(self):
+        # This probably needs more work on line spacing/para spacing
+        # Maybe sort the line_spaces array.
+        # It is possible to have more than 1 line space value, e.g. 8.0, 9.0, 10.0
+        # then more than 1 para space, e.g. 24.0, 25.0, 26.0
+        # Thus the count of a para space could be > the most popular line space.
+        # So, probably need to find the max line space and max para space
+        # rather than simply the most popular.
+        # At the moment it only does this when spaces are close in popularity.
+
+        # Find (next) most popular gap between lines
+        def find_line_space(skip):
+            scount, soffset = 0, 0
+            for s in self.line_spaces:
+                if scount <= self.line_spaces[s] \
+                and (skip <= 0 or self.line_spaces[s] < skip):
+                    scount = self.line_spaces[s]
+                    soffset = s
+            return scount, soffset
+
+        # Find most popular top so that will be treated as top of page
+        tcount = 0
+        for t in self.tops:
+            if tcount < self.tops[t]:
+                tcount = self.tops[t]
+                self.stats.top = t
+
+        # Some PDFs have alternating pages with different lefts/indents.
+        # So, if the 2nd highest is 90% of the highest, assume it is a left.
+        # Same for the indents, assuming there were 2 lefts
+
+        # Find most popular left so that will be treated as left of page
+        icount = 0
+        for i in self.indents:
+            if icount < self.indents[i]:
+                icount = self.indents[i]
+                self.stats.left = i
+
+        # Find second most popular left so that will be treated as indent
+        icount = 0
+        for i in self.indents:
+            if i != self.stats.left and icount < self.indents[i]:
+                icount = self.indents[i]
+                self.stats.indent = i
+
+        # For safety, check left and indent are in the right order
+        if self.stats.indent != 0 \
+          and self.stats.left > self.stats.indent:
+            l = self.stats.left
+            self.stats.left = self.stats.indent
+            self.stats.indent = l
+
+        # Now decide whether there are 2 similar, i.e. within 95% (arbitrary)
+        if self.stats.indent > 0 \
+          and 100.0 * self.indents[self.stats.indent] / self.indents[self.stats.left] > 95.0:
+            self.stats.left2 = self.stats.indent
+
+            # Find next most popular left so that will be treated as indent
+            icount = 0
+            for i in self.indents:
+                if i != self.stats.left and i != self.stats.left2 and icount < self.indents[i]:
+                    icount = self.indents[i]
+                    self.stats.indent = i
+
+            # And the last most popular left so that will be treated as indent2
+            # Should check it is within 90%. What to do if not?
+            icount = 0
+            for i in self.indents:
+                if i != self.stats.left and i != self.stats.left2 \
+                  and i != self.stats.indent and icount < self.indents[i]:
+                    icount = self.indents[i]
+                    self.stats.indent2 = i
+
+            # And check indent and indent2 are in the right order
+            if self.stats.indent > self.stats.indent2:
+                l = self.stats.indent
+                self.stats.indent = self.stats.indent2
+                self.stats.indent2 = l
+
+        # Sort spaces into ascending order then loop through
+        # Lowest value(s) are line spacing, next are para
+        # Spaces not yet set up
+        self.stats.line_space = self.stats.para_space = -1.0
+        # Find spacing values
+        # Find most popular space so that will be treated as line space
+        line_k = 0
+        line_c = 0
+        count = len(self.line_spaces)
+        while count > 0:
+            c, k = find_line_space(line_c)
+            if line_c <= 0:
+                line_c = c
+            if line_k <= 0:
+                line_k = k
+            elif abs(line_k - k) <= SAME_SPACE:
+                line_k = max(line_k, k)
+                line_c = min(line_c, c)
+            else:
+                break
+            count -= 1
+
+        # Get the next most popular gap
+        para_c = line_c-1
+        para_k = 0
+        count = len(self.line_spaces)
+        while count > 0:
+            c, k = find_line_space(para_c)
+            if para_k <= 0:
+                para_k = k
+            if abs(para_k - k) <= SAME_SPACE:
+                para_k = max(para_k, k)
+                para_c = min(para_c, c)
+            else:
+                break
+            count -= 1
+
+        # For safety, check in the right order
+        if line_k > para_k:
+            x = para_k
+            para_k = line_k
+            line_k = x
+
+        self.stats.line_space = line_k
+        # Some docs have no great distinction for paragraphs
+        # Limit the size of the gap, or section breaks not found
+        if para_k > line_k * PARA_FACTOR:
+            self.stats.para_space = round(line_k * PARA_FACTOR)
+        else:
+            self.stats.para_space = para_k
+
+        # Find most popular bottom so that will be treated as bottom of page
+        # Or the max bottom?  Or the max used value within 10% of max value?
+        bcount = 0
+        for b in self.bottoms:
+            if bcount < self.bottoms[b]:
+              #and b > self.stats.bottom*0.9:
+                bcount = self.bottoms[b]
+            #if b > self.stats.bottom:
+                self.stats.bottom = b
+
+        # Find farthest right so that will be treated as page right
+        ## SHOULD DO RIGHT2 as well
+        rcount = 0
+        for r in self.rights:
+            if rcount < r:
+                rcount = r
+                self.stats.right = r
+
+    def find_header_footer(self):
+        # If requested, scan first few pages for possible headers/footers
+
+        scan_count = 20		# Arbitrary
+        head_text = ''
+        head_match = 0
+        head_match1 = 0
+        head_page = 0
+        head_skip = 0
+        foot_text = ''
+        foot_match = 0
+        foot_match1 = 0
+        foot_page = 0
+        foot_skip = 0
+        pagenum_text = r'.*\d+\s+\w+\s+\d+.*'
+
+        pages_to_scan = scan_count
+        # Note the a line may be in more than 1 part
+        # e.g. Page 1 of 6 ... DocName.pdf
+        # so merge first 2 lines if same top
+        # Ditto last 2 lines
+        # Maybe should do more than 2 parts
+        for page in self.pages:
+            if self.opts.pdf_header_skip < 0 and len(page.texts) > 0:
+                t = page.texts[0].text_as_string
+                if len(page.texts) > 1 and page.texts[0].top == page.texts[1].top:
+                    t += page.texts[1].text_as_string
+                if len(head_text) == 0:
+                    head_text = t
+                else:
+                    if head_text == t:
+                        head_match += 1
+                        if head_page == 0:
+                            head_page = page.number
+                    else:	# Look for page count of format 'n xxx n'
+                        if re.match(pagenum_text, t) is not None:
+                            head_match1 += 1
+                            if head_page == 0:
+                                head_page = page.number
+
+            if self.opts.pdf_footer_skip < 0 and len(page.texts) > 0:
+                t = page.texts[-1].text_as_string
+                if len(page.texts) > 1 and page.texts[-1].top == page.texts[-2].top:
+                    t += page.texts[-2].text_as_string
+                if len(foot_text) == 0:
+                    foot_text = t
+                else:
+                    if foot_text == t:
+                        foot_match += 1
+                        if foot_page == 0:
+                            foot_page = page.number
+                    else:	# Look for page count of format 'n xxx n'
+                        if re.match(pagenum_text, t) is not None:
+                            foot_match1 += 1
+                            if foot_page == 0:
+                                foot_page = page.number
+
+            pages_to_scan -= 1
+            if pages_to_scan < 1:
+                break
+
+        # How many pages have been scanned?
+        if pages_to_scan > scan_count-2:
+            # Doc is empty or 1 page.  Can't decide on any skips
+            return
+
+        if pages_to_scan > 0:
+            # Doc is shorter than scan_count
+            pages_to_scan = scan_count - pages_to_scan	# Number scanned
+        else:
+            # All required pages scanned
+            pages_to_scan = scan_count
+        pages_to_scan /= 2	# Are at least half matching?
+
+        if head_match > pages_to_scan or head_match1 > pages_to_scan:
+            t = self.pages[head_page].texts[0]
+            head_skip = t.top + t.height + 1
+
+        if foot_match > pages_to_scan or foot_match1 > pages_to_scan:
+            t = self.pages[foot_page].texts[-1]
+            foot_skip = t.top - 1
+
+        if head_skip > 0:
+            self.opts.pdf_header_skip = head_skip
+        if foot_skip > 0:
+            self.opts.pdf_footer_skip = foot_skip
+
+    def remove_header_footer(self):
+        # Remove any header/footer lines from all pages
+        for page in self.pages:
+            # If a text is removed, we need to restart the loop or what was the next will be skipped
+            removed = 1
+            while removed == 1:
+                removed = 0
+                for t in page.texts:
+                    if self.opts.pdf_header_skip > 0 and t.top < self.opts.pdf_header_skip \
+                    or self.opts.pdf_footer_skip > 0 and t.top > self.opts.pdf_footer_skip:
+                        page.texts.remove(t)
+                        removed = 1
+
+    def merge_pages(self, left_margin):
+        # Check for pages that can be merged
+
+        # When merging pages, assume short last lines mean no merge
+        # BUT unfortunately there is no way to tell the difference
+        # between a continuation of a paragraph and a 'section break'
+        # if the previous page ends a sentence.
+        # Also long words can force a new line (at a new page)
+        # although the end of the previous is < this percent.
+        # Needs to find whether 1st word of 2nd page would fit on
+        # the last line of previous rather than the length of the last line.
+        # Pages can split early to avoid orphans.
+        ORPHAN_LINES = 2
+        # First, find the minimum text top and the maximum text bottom
+        min_top = self.stats.top
+        max_bottom = self.stats.bottom
+        # Keep a note of the position of the final line on the merged page
+        save_bottom = 0
+        # After merge, skip to this page
+        save_number = 0
+
+        # Now merge where bottom of one is within ORPHAN_LINES lines of max_bottom
+        # and top of next is within a line of min_top
+        # and margins correspond, and it's a normal paragraph
+        merge_done = True
+        while merge_done:
+            merge_done = False
+            merged_page = None
+            candidate = None
+            for page in self.pages:
+                if page.number < save_number:
+                    next
+                if page.texts:
+                    if candidate:
+                        last_line = candidate.texts[-1]
+                        merged_text = page.texts[0]
+                        top = merged_text.top
+                          # Should we check that the new line starts lower case?  Doesn't cover all cases.
+                          #and re.match('^[a-z]', merged_text.text_as_string) is not None
+                        # How much space in characters was at the end of the last line?
+                        # If the book is justified text, any space should mean end-of-para
+                        # So, how to check for a justified book/page?
+                        last_spare = (candidate.right_margin - last_line.right) / last_line.average_character_width
+                        # How big is the first word on the next line?
+                        merged_first = re.match(r'.+?(\s)+?', merged_text.text_as_string)
+                        if merged_first is not None:
+                            # First word length as float
+                            merged_len = len(merged_first.group(0)) * 1.0
+                        else:
+                            merged_len = merged_text.right
+                        # Allow where the last line ends with or next line starts with lower case.
+                        if re.match('[a-z, -]$', last_line.text_as_string) is not None \
+                          or re.match('^[a-z, -]', merged_text.text_as_string) is not None :
+                            merged_len = merged_text.right
+
+                        # To use merged_len etc.
+                        # Should not merge if speech where last ends 99 or 9 and next starts 66 or 6
+                        if top <= min_top + page.average_text_height \
+                          and merged_text.tag == 'p' \
+                          and 'href=' not in merged_text.raw \
+                          and merged_text.left < page.stats_left + merged_text.average_character_width \
+                          and not last_spare > merged_len \
+                          and not (re.match('.*(\u201d|”)$', last_line.text_as_string) is not None
+                               and re.match('^(\u201c|“).*', merged_text.text_as_string) is not None):
+                            merge_done = True
+                            # We don't want to merge partial pages
+                            # i.e. if this is the last line, preserve its top/bottom till after merge
+                            if len(page.texts) == 1 :
+                                save_bottom = merged_text.bottom
+                            else:
+                                save_bottom = 0.0
+
+                            merged_text.top = candidate.texts[-1].top + page.average_text_height
+                            merged_text.bottom = merged_text.top + page.average_text_height
+                            candidate.texts.append(merged_text)
+                            merged_page = page
+                            break
+                        candidate = None
+                    last_line = page.texts[-1]
+                    bottom = last_line.bottom
+                    # Decide on whether merging is a good idea
+                    # Non-indented paragraphs are a problem
+                    if bottom >= max_bottom \
+                         - (ORPHAN_LINES*page.average_text_height) \
+                         - (ORPHAN_LINES*self.stats.line_space) \
+                      and (re.match('[a-z, ]$', last_line.text_as_string) is not None \
+                        or last_line.final_width > page.width*self.opts.unwrap_factor):
+                    #    or (last_line.right * 100.0 / page.right_margin > LAST_LINE_PERCENT)) :
+                        candidate = page
+                else:
+                    candidate = None
+            if merge_done:
+                merged_page.texts.remove(merged_text)
+                # We now need to skip to the next page number
+                # The current page can no longer have anything to merge
+                save_number = merged_page.number + 1
+                # Re-calling coalesce_paras doesn't seem to work
+                candidate.texts[-2].coalesce(candidate.texts[-1], candidate.number, left_margin)
+                candidate.texts.remove(candidate.texts[-1])
+                # Put back top/bottom after coalesce if final line
+                if save_bottom != 0.0 :
+                    # Ignore top as that can confuse things where the 1st para of a page
+                    # was merged with a previous.  Keep the original top
+                    #candidate.texts[-1].top = save_top
+                    candidate.texts[-1].bottom = save_bottom
+                #candidate.coalesce_paras()
+                # Have we removed everything from this page (well, all texts)
+                if len(merged_page.texts) == 0:
+                    self.pages.remove(merged_page)
+
+
     def linearize(self):
         self.elements = []
         last_region = last_block = None
@@ -685,14 +1707,28 @@ class PDFDocument:
                     last_block = block
                 last_region = region
 
+
     def render(self):
+        #### Where does the title come from if not run from command line?
+        title = 'Converted Ebook'
+        if len(sys.argv) > 1:
+            title = sys.argv[1]
+        # Need to insert font info and styles
         html = ['<?xml version="1.0" encoding="UTF-8"?>',
                 '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
-                '<title>PDF Reflow conversion</title>', '</head>', '<body>',
-                '<div>']
-        for elem in self.elements:
-            html.extend(elem.to_html())
+                '<title>'+title+'</title>',
+                '<meta content="PDF Reflow conversion" name="generator"/>',
+                '</head>', '<body>']
+        for page in self.pages:
+            html.extend(page.to_html())
         html += ['</body>', '</html>']
         raw = ('\n'.join(html)).replace('</strong><strong>', '')
+        raw = raw.replace('</i><i>', '')
+        raw = raw.replace('</em><em>', '')
+        raw = raw.replace('</b><b>', '')
+        raw = raw.replace('</strong> <strong>', ' ')
+        raw = raw.replace('</i> <i>', ' ')
+        raw = raw.replace('</em> <em>', ' ')
+        raw = raw.replace('</b> <b>', ' ')
         with open('index.html', 'wb') as f:
             f.write(raw.encode('utf-8'))
diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index 9c6f952de6..a431660978 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -18,8 +18,17 @@ class PluginWidget(Widget, Ui_Form):
         Widget.__init__(self, parent, OPTIONS['input']['pdf'])
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
+        self.opt_new_pdf_engine.toggled.connect(self.update_engine_opts)
+        self.update_engine_opts()
 
     def set_value_handler(self, g, val):
         if val is None and isinstance(g, QDoubleSpinBox):
             g.setValue(0.0)
             return True
+
+    def update_engine_opts(self):
+        enabled = self.opt_new_pdf_engine.isChecked()
+        self.opt_pdf_footer_skip.setEnabled(enabled)
+        self.opt_pdf_header_skip.setEnabled(enabled)
+        self.opt_pdf_header_regex.setEnabled(enabled)
+        self.opt_pdf_footer_regex.setEnabled(enabled)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 3aa396f1ce..5d9acdd872 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -6,7 +6,7 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>400</width>
+    <width>413</width>
     <height>300</height>
    </rect>
   </property>
@@ -24,19 +24,6 @@
      </property>
     </widget>
    </item>
-   <item row="2" column="0">
-    <spacer name="verticalSpacer">
-     <property name="orientation">
-      <enum>Qt::Vertical</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>213</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
    <item row="0" column="1">
     <widget class="QDoubleSpinBox" name="opt_unwrap_factor">
      <property name="maximum">
@@ -57,6 +44,118 @@
      </property>
     </widget>
    </item>
+   <item row="3" column="0">
+    <widget class="QLabel" name="label_t">
+     <property name="text">
+      <string>Remove headers at &amp;top of page by:</string>
+     </property>
+     <property name="buddy">
+      <cstring>opt_pdf_header_skip</cstring>
+     </property>
+    </widget>
+   </item>
+   <item row="3" column="1">
+    <widget class="QSpinBox" name="opt_pdf_header_skip">
+     <property name="specialValueText">
+      <string>Automatically</string>
+     </property>
+     <property name="suffix">
+      <string> px</string>
+     </property>
+     <property name="minimum">
+      <number>-1</number>
+     </property>
+     <property name="maximum">
+      <number>99999</number>
+     </property>
+     <property name="value">
+      <number>-1</number>
+     </property>
+    </widget>
+   </item>
+   <item row="4" column="0">
+    <widget class="QLabel" name="label_b">
+     <property name="text">
+      <string>Remove footers at &amp;bottom of page by:</string>
+     </property>
+     <property name="buddy">
+      <cstring>opt_pdf_footer_skip</cstring>
+     </property>
+    </widget>
+   </item>
+   <item row="4" column="1">
+    <widget class="QSpinBox" name="opt_pdf_footer_skip">
+     <property name="specialValueText">
+      <string>Automatically</string>
+     </property>
+     <property name="suffix">
+      <string> px</string>
+     </property>
+     <property name="minimum">
+      <number>-1</number>
+     </property>
+     <property name="maximum">
+      <number>99999</number>
+     </property>
+     <property name="value">
+      <number>-1</number>
+     </property>
+    </widget>
+   </item>
+   <item row="5" column="0">
+    <widget class="QLabel" name="label_rt">
+     <property name="text">
+      <string>Regular expression to remove &amp;header at top of page:</string>
+     </property>
+     <property name="buddy">
+      <cstring>opt_pdf_header_regex</cstring>
+     </property>
+    </widget>
+   </item>
+   <item row="5" column="1">
+    <widget class="QLineEdit" name="opt_pdf_header_regex">
+     <property name="clearButtonEnabled">
+      <bool>true</bool>
+     </property>
+    </widget>
+   </item>
+   <item row="6" column="0">
+    <widget class="QLabel" name="label_rb">
+     <property name="text">
+      <string>Regular expression to remove &amp;footer at bottom of page:</string>
+     </property>
+     <property name="buddy">
+      <cstring>opt_pdf_footer_regex</cstring>
+     </property>
+    </widget>
+   </item>
+   <item row="6" column="1">
+    <widget class="QLineEdit" name="opt_pdf_footer_regex">
+     <property name="clearButtonEnabled">
+      <bool>true</bool>
+     </property>
+    </widget>
+   </item>
+   <item row="7" column="0">
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>20</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
+   <item row="2" column="0" colspan="2">
+    <widget class="QCheckBox" name="opt_new_pdf_engine">
+     <property name="text">
+      <string>&amp;New, experimental, PDF conversion Engine</string>
+     </property>
+    </widget>
+   </item>
   </layout>
  </widget>
  <resources/>
diff --git a/src/pyj/book_list/conversion_widgets.pyj b/src/pyj/book_list/conversion_widgets.pyj
index 63f915a614..e56e60cdf1 100644
--- a/src/pyj/book_list/conversion_widgets.pyj
+++ b/src/pyj/book_list/conversion_widgets.pyj
@@ -487,6 +487,11 @@ def pdf_input(container):
     container.appendChild(g)
     g.appendChild(float_spin('unwrap_factor', _('Line &un-wrapping factor:'), max=1, step=0.01))
     g.appendChild(checkbox('no_images', _('No &images')))
+    g.appendChild(checkbox('new_pdf_engine', _('New, experimental, PDF conversion &engine')))
+    g.appendChild(int_spin('pdf_header_skip', _('Remove headers at &top of page by:'), min=-1, max=999999, step=1))
+    g.appendChild(int_spin('pdf_footer_skip', _('Remove footers at &bottom of page by:'), min=-1, max=999999, step=1))
+    g.appendChild(lineedit('pdf_header_regex', _('Regular expression to remove &header at top of page:')))
+    g.appendChild(lineedit('pdf_footer_regex', _('Regular expression to remove &footer at bottom of page:')))
 # }}}
 
 # RTF Input {{{