mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Ignore table markup for tables with large cells. Optimize use of Spans.
This commit is contained in:
parent
7c2aa5b07e
commit
4601d8a257
@ -62,6 +62,10 @@ class Span(_Span):
|
|||||||
Assumes: One em is 10pts
|
Assumes: One em is 10pts
|
||||||
"""
|
"""
|
||||||
result = None
|
result = None
|
||||||
|
try:
|
||||||
|
result = int(val)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
|
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
unit = float(m.group(1))
|
unit = float(m.group(1))
|
||||||
@ -81,6 +85,8 @@ class Span(_Span):
|
|||||||
result = int(unit * 0.04 * (dpi/72.))
|
result = int(unit * 0.04 * (dpi/72.))
|
||||||
elif m.group(2)== 'cm':
|
elif m.group(2)== 'cm':
|
||||||
result = int(unit * 0.4 * (dpi/72.))
|
result = int(unit * 0.4 * (dpi/72.))
|
||||||
|
if result is None:
|
||||||
|
result = 0
|
||||||
if pts:
|
if pts:
|
||||||
result = int((float(result)/dpi)*720)
|
result = int((float(result)/dpi)*720)
|
||||||
return result
|
return result
|
||||||
@ -259,7 +265,9 @@ class Span(_Span):
|
|||||||
attrs.pop('fontfacename')
|
attrs.pop('fontfacename')
|
||||||
for key in attrs:
|
for key in attrs:
|
||||||
if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
|
if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
|
||||||
attrs.pop(key)
|
attrs.pop(key)
|
||||||
|
self.text_src = src
|
||||||
|
self.span_needed = bool(attrs)
|
||||||
_Span.__init__(self, text=src, **attrs)
|
_Span.__init__(self, text=src, **attrs)
|
||||||
|
|
||||||
class HTMLConverter(object):
|
class HTMLConverter(object):
|
||||||
@ -544,9 +552,12 @@ class HTMLConverter(object):
|
|||||||
def create_link(self, children, tag):
|
def create_link(self, children, tag):
|
||||||
para = None
|
para = None
|
||||||
for i in range(len(children)-1, -1, -1):
|
for i in range(len(children)-1, -1, -1):
|
||||||
if not isinstance(children[i], CR):
|
if isinstance(children[i], _Span):
|
||||||
para = children[i]
|
para = children[i]
|
||||||
break
|
break
|
||||||
|
if para is None:
|
||||||
|
print children
|
||||||
|
raise ConversionError('Failed to parse link %s'%(tag,))
|
||||||
text = self.get_text(tag, 1000)
|
text = self.get_text(tag, 1000)
|
||||||
if not text:
|
if not text:
|
||||||
text = 'Link'
|
text = 'Link'
|
||||||
@ -710,7 +721,8 @@ class HTMLConverter(object):
|
|||||||
self.parse_tag(c, pcss)
|
self.parse_tag(c, pcss)
|
||||||
elif isinstance(c, NavigableString):
|
elif isinstance(c, NavigableString):
|
||||||
self.add_text(c, pcss)
|
self.add_text(c, pcss)
|
||||||
ptag.extract()
|
if not self.in_table:
|
||||||
|
ptag.extract()
|
||||||
|
|
||||||
def process_alignment(self, css):
|
def process_alignment(self, css):
|
||||||
'''
|
'''
|
||||||
@ -748,7 +760,7 @@ class HTMLConverter(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def add_text(self, tag, css):
|
def add_text(self, tag, css, force_span_use=False):
|
||||||
'''
|
'''
|
||||||
Add text to the current paragraph taking CSS into account.
|
Add text to the current paragraph taking CSS into account.
|
||||||
@param tag: Either a BeautifulSoup tag or a string
|
@param tag: Either a BeautifulSoup tag or a string
|
||||||
@ -760,20 +772,29 @@ class HTMLConverter(object):
|
|||||||
if self.process_alignment(css) and collapse_whitespace:
|
if self.process_alignment(css) and collapse_whitespace:
|
||||||
# Don't want leading blanks in a new paragraph
|
# Don't want leading blanks in a new paragraph
|
||||||
src = src.lstrip()
|
src = src.lstrip()
|
||||||
args = self.sanctify_css(css), self.memory, self.profile.dpi, self.fonts,\
|
def append_text(src):
|
||||||
self.logger, self.font_delta, self.current_block.textStyle.attrs
|
span = Span(src, self.sanctify_css(css), self.memory, self.profile.dpi,
|
||||||
|
self.fonts, self.logger, self.font_delta,
|
||||||
|
self.current_block.textStyle.attrs)
|
||||||
|
if span.span_needed or force_span_use:
|
||||||
|
self.current_para.append(span)
|
||||||
|
else:
|
||||||
|
if hasattr(span.text_src, 'parent'):
|
||||||
|
span.text_src.parent.contents = []
|
||||||
|
span.text_src.parent = None
|
||||||
|
self.current_para.append(span.text_src)
|
||||||
if collapse_whitespace:
|
if collapse_whitespace:
|
||||||
src = re.sub(r'\s{1,}', ' ', src)
|
src = re.sub(r'\s{1,}', ' ', src)
|
||||||
if len(self.previous_text) != len(self.previous_text.rstrip()):
|
if len(self.previous_text) != len(self.previous_text.rstrip()):
|
||||||
src = src.lstrip()
|
src = src.lstrip()
|
||||||
if len(src):
|
if len(src):
|
||||||
self.previous_text = src
|
self.previous_text = src
|
||||||
self.current_para.append(Span(src, *args))
|
append_text(src)
|
||||||
else:
|
else:
|
||||||
srcs = src.split('\n')
|
srcs = src.split('\n')
|
||||||
for src in srcs:
|
for src in srcs:
|
||||||
if src:
|
if src:
|
||||||
self.current_para.append(Span(src, *args))
|
append_text(src)
|
||||||
if len(srcs) > 1:
|
if len(srcs) > 1:
|
||||||
self.line_break()
|
self.line_break()
|
||||||
|
|
||||||
@ -973,7 +994,7 @@ class HTMLConverter(object):
|
|||||||
def process_block(self, tag, tag_css, tkey):
|
def process_block(self, tag, tag_css, tkey):
|
||||||
''' Ensure padding and text-indent properties are respected '''
|
''' Ensure padding and text-indent properties are respected '''
|
||||||
if tag_css.has_key('text-indent'):
|
if tag_css.has_key('text-indent'):
|
||||||
indent = Span.unit_convert(tag_css['text-indent'], self.profile.dpi, pts=True)
|
indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True)
|
||||||
if not indent:
|
if not indent:
|
||||||
indent = 0
|
indent = 0
|
||||||
|
|
||||||
@ -1051,7 +1072,7 @@ class HTMLConverter(object):
|
|||||||
text = self.get_text(tag, limit=1000)
|
text = self.get_text(tag, limit=1000)
|
||||||
if not text.strip():
|
if not text.strip():
|
||||||
text = "Link"
|
text = "Link"
|
||||||
self.add_text(text, tag_css)
|
self.add_text(text, tag_css, force_span_use=True)
|
||||||
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
|
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
|
||||||
if tag.has_key('id') or tag.has_key('name'):
|
if tag.has_key('id') or tag.has_key('name'):
|
||||||
key = 'name' if tag.has_key('name') else 'id'
|
key = 'name' if tag.has_key('name') else 'id'
|
||||||
@ -1312,9 +1333,12 @@ class HTMLConverter(object):
|
|||||||
self.process_table(tag, tag_css)
|
self.process_table(tag, tag_css)
|
||||||
except Exception, err:
|
except Exception, err:
|
||||||
self.logger.warning('An error occurred while processing a table: %s', str(err))
|
self.logger.warning('An error occurred while processing a table: %s', str(err))
|
||||||
|
self.logger.debug('', exc_info=True)
|
||||||
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
|
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
|
finally:
|
||||||
|
tag.extract()
|
||||||
else:
|
else:
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
if end_page:
|
if end_page:
|
||||||
@ -1325,8 +1349,11 @@ class HTMLConverter(object):
|
|||||||
rowpad = 10
|
rowpad = 10
|
||||||
table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
|
table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
|
||||||
canvases = []
|
canvases = []
|
||||||
for block, xpos, ypos, delta in table.blocks(int(self.current_page.pageStyle.attrs['textwidth'])):
|
ps = self.current_page.pageStyle.attrs
|
||||||
|
for block, xpos, ypos, delta in table.blocks(int(ps['textwidth']), int(ps['textheight'])):
|
||||||
if not block:
|
if not block:
|
||||||
|
if ypos > int(ps['textheight']):
|
||||||
|
raise Exception, 'Table has cell that is too large'
|
||||||
canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad,
|
canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad,
|
||||||
blockrule='block-fixed'))
|
blockrule='block-fixed'))
|
||||||
else:
|
else:
|
||||||
|
@ -96,8 +96,7 @@ class Cell(object):
|
|||||||
self.colspan = int(tag['colspan']) if tag.has_key('colspan') else 1
|
self.colspan = int(tag['colspan']) if tag.has_key('colspan') else 1
|
||||||
self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1
|
self.rowspan = int(tag['rowspan']) if tag.has_key('rowspan') else 1
|
||||||
except:
|
except:
|
||||||
if conv.verbose:
|
pass
|
||||||
print >>sys.stderr, "Error reading row/colspan for ", tag
|
|
||||||
|
|
||||||
pp = conv.current_page
|
pp = conv.current_page
|
||||||
conv.book.allow_new_page = False
|
conv.book.allow_new_page = False
|
||||||
@ -219,7 +218,7 @@ class Row(object):
|
|||||||
def __init__(self, conv, row, css, colpad):
|
def __init__(self, conv, row, css, colpad):
|
||||||
self.cells = []
|
self.cells = []
|
||||||
self.colpad = colpad
|
self.colpad = colpad
|
||||||
cells = row.findAll(re.compile('td|th'))
|
cells = row.findAll(re.compile('td|th', re.IGNORECASE))
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
ccss = conv.tag_css(cell, css)
|
ccss = conv.tag_css(cell, css)
|
||||||
self.cells.append(Cell(conv, cell, ccss))
|
self.cells.append(Cell(conv, cell, ccss))
|
||||||
@ -347,7 +346,7 @@ class Table(object):
|
|||||||
itercount += 1
|
itercount += 1
|
||||||
return [i+self.colpad for i in widths]
|
return [i+self.colpad for i in widths]
|
||||||
|
|
||||||
def blocks(self, maxwidth):
|
def blocks(self, maxwidth, maxheight):
|
||||||
rows, cols = self.number_or_rows(), self.number_of_columns()
|
rows, cols = self.number_or_rows(), self.number_of_columns()
|
||||||
cellmatrix = [[None for c in range(cols)] for r in range(rows)]
|
cellmatrix = [[None for c in range(cols)] for r in range(rows)]
|
||||||
rowpos = [0 for i in range(rows)]
|
rowpos = [0 for i in range(rows)]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user