Ignore table markup for tables with large cells. Optimize use of Spans.

This commit is contained in:
Kovid Goyal 2007-08-17 20:35:02 +00:00
parent 7c2aa5b07e
commit 4601d8a257
2 changed files with 42 additions and 16 deletions

View File

@ -62,6 +62,10 @@ class Span(_Span):
Assumes: One em is 10pts Assumes: One em is 10pts
""" """
result = None result = None
try:
result = int(val)
except ValueError:
pass
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val) m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
if m is not None: if m is not None:
unit = float(m.group(1)) unit = float(m.group(1))
@ -81,6 +85,8 @@ class Span(_Span):
result = int(unit * 0.04 * (dpi/72.)) result = int(unit * 0.04 * (dpi/72.))
elif m.group(2)== 'cm': elif m.group(2)== 'cm':
result = int(unit * 0.4 * (dpi/72.)) result = int(unit * 0.4 * (dpi/72.))
if result is None:
result = 0
if pts: if pts:
result = int((float(result)/dpi)*720) result = int((float(result)/dpi)*720)
return result return result
@ -259,7 +265,9 @@ class Span(_Span):
attrs.pop('fontfacename') attrs.pop('fontfacename')
for key in attrs: for key in attrs:
if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]): if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
attrs.pop(key) attrs.pop(key)
self.text_src = src
self.span_needed = bool(attrs)
_Span.__init__(self, text=src, **attrs) _Span.__init__(self, text=src, **attrs)
class HTMLConverter(object): class HTMLConverter(object):
@ -544,9 +552,12 @@ class HTMLConverter(object):
def create_link(self, children, tag): def create_link(self, children, tag):
para = None para = None
for i in range(len(children)-1, -1, -1): for i in range(len(children)-1, -1, -1):
if not isinstance(children[i], CR): if isinstance(children[i], _Span):
para = children[i] para = children[i]
break break
if para is None:
print children
raise ConversionError('Failed to parse link %s'%(tag,))
text = self.get_text(tag, 1000) text = self.get_text(tag, 1000)
if not text: if not text:
text = 'Link' text = 'Link'
@ -710,7 +721,8 @@ class HTMLConverter(object):
self.parse_tag(c, pcss) self.parse_tag(c, pcss)
elif isinstance(c, NavigableString): elif isinstance(c, NavigableString):
self.add_text(c, pcss) self.add_text(c, pcss)
ptag.extract() if not self.in_table:
ptag.extract()
def process_alignment(self, css): def process_alignment(self, css):
''' '''
@ -748,7 +760,7 @@ class HTMLConverter(object):
return True return True
return False return False
def add_text(self, tag, css): def add_text(self, tag, css, force_span_use=False):
''' '''
Add text to the current paragraph taking CSS into account. Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string @param tag: Either a BeautifulSoup tag or a string
@ -760,20 +772,29 @@ class HTMLConverter(object):
if self.process_alignment(css) and collapse_whitespace: if self.process_alignment(css) and collapse_whitespace:
# Dont want leading blanks in a new paragraph # Dont want leading blanks in a new paragraph
src = src.lstrip() src = src.lstrip()
args = self.sanctify_css(css), self.memory, self.profile.dpi, self.fonts,\ def append_text(src):
self.logger, self.font_delta, self.current_block.textStyle.attrs span = Span(src, self.sanctify_css(css), self.memory, self.profile.dpi,
self.fonts, self.logger, self.font_delta,
self.current_block.textStyle.attrs)
if span.span_needed or force_span_use:
self.current_para.append(span)
else:
if hasattr(span.text_src, 'parent'):
span.text_src.parent.contents = []
span.text_src.parent = None
self.current_para.append(span.text_src)
if collapse_whitespace: if collapse_whitespace:
src = re.sub(r'\s{1,}', ' ', src) src = re.sub(r'\s{1,}', ' ', src)
if len(self.previous_text) != len(self.previous_text.rstrip()): if len(self.previous_text) != len(self.previous_text.rstrip()):
src = src.lstrip() src = src.lstrip()
if len(src): if len(src):
self.previous_text = src self.previous_text = src
self.current_para.append(Span(src, *args)) append_text(src)
else: else:
srcs = src.split('\n') srcs = src.split('\n')
for src in srcs: for src in srcs:
if src: if src:
self.current_para.append(Span(src, *args)) append_text(src)
if len(srcs) > 1: if len(srcs) > 1:
self.line_break() self.line_break()
@ -973,7 +994,7 @@ class HTMLConverter(object):
def process_block(self, tag, tag_css, tkey): def process_block(self, tag, tag_css, tkey):
''' Ensure padding and text-indent properties are respected ''' ''' Ensure padding and text-indent properties are respected '''
if tag_css.has_key('text-indent'): if tag_css.has_key('text-indent'):
indent = Span.unit_convert(tag_css['text-indent'], self.profile.dpi, pts=True) indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True)
if not indent: if not indent:
indent = 0 indent = 0
@ -1051,7 +1072,7 @@ class HTMLConverter(object):
text = self.get_text(tag, limit=1000) text = self.get_text(tag, limit=1000)
if not text.strip(): if not text.strip():
text = "Link" text = "Link"
self.add_text(text, tag_css) self.add_text(text, tag_css, force_span_use=True)
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag)) self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
if tag.has_key('id') or tag.has_key('name'): if tag.has_key('id') or tag.has_key('name'):
key = 'name' if tag.has_key('name') else 'id' key = 'name' if tag.has_key('name') else 'id'
@ -1312,9 +1333,12 @@ class HTMLConverter(object):
self.process_table(tag, tag_css) self.process_table(tag, tag_css)
except Exception, err: except Exception, err:
self.logger.warning('An error occurred while processing a table: %s', str(err)) self.logger.warning('An error occurred while processing a table: %s', str(err))
self.logger.debug('', exc_info=True)
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300]) self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.in_table = False self.in_table = False
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
finally:
tag.extract()
else: else:
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
if end_page: if end_page:
@ -1325,8 +1349,11 @@ class HTMLConverter(object):
rowpad = 10 rowpad = 10
table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10) table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
canvases = [] canvases = []
for block, xpos, ypos, delta in table.blocks(int(self.current_page.pageStyle.attrs['textwidth'])): ps = self.current_page.pageStyle.attrs
for block, xpos, ypos, delta in table.blocks(int(ps['textwidth']), int(ps['textheight'])):
if not block: if not block:
if ypos > int(ps['textheight']):
raise Exception, 'Table has cell that is too large'
canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad, canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad,
blockrule='block-fixed')) blockrule='block-fixed'))
else: else:

View File

@ -96,8 +96,7 @@ class Cell(object):
self.colspan = int(tag['colspan']) if tag.has_key('colspan') else 1 self.colspan = int(tag['colspan']) if tag.has_key('colspan') else 1
self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1 self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1
except: except:
if conv.verbose: pass
print >>sys.stderr, "Error reading row/colspan for ", tag
pp = conv.current_page pp = conv.current_page
conv.book.allow_new_page = False conv.book.allow_new_page = False
@ -219,7 +218,7 @@ class Row(object):
def __init__(self, conv, row, css, colpad): def __init__(self, conv, row, css, colpad):
self.cells = [] self.cells = []
self.colpad = colpad self.colpad = colpad
cells = row.findAll(re.compile('td|th')) cells = row.findAll(re.compile('td|th', re.IGNORECASE))
for cell in cells: for cell in cells:
ccss = conv.tag_css(cell, css) ccss = conv.tag_css(cell, css)
self.cells.append(Cell(conv, cell, ccss)) self.cells.append(Cell(conv, cell, ccss))
@ -347,7 +346,7 @@ class Table(object):
itercount += 1 itercount += 1
return [i+self.colpad for i in widths] return [i+self.colpad for i in widths]
def blocks(self, maxwidth): def blocks(self, maxwidth, maxheight):
rows, cols = self.number_or_rows(), self.number_of_columns() rows, cols = self.number_or_rows(), self.number_of_columns()
cellmatrix = [[None for c in range(cols)] for r in range(rows)] cellmatrix = [[None for c in range(cols)] for r in range(rows)]
rowpos = [0 for i in range(rows)] rowpos = [0 for i in range(rows)]