mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Don't detect BlockSpace elements as the first element of a page.
Skip over hrefs that are in weird encodings.
This commit is contained in:
parent
76c0aeb57c
commit
69f20f634d
@ -400,6 +400,10 @@ class HTMLConverter(object):
|
|||||||
return prop
|
return prop
|
||||||
|
|
||||||
def parse_file(self):
|
def parse_file(self):
|
||||||
|
def get_valid_block(page):
|
||||||
|
for item in page.contents:
|
||||||
|
if isinstance(item, (TextBlock, ImageBlock, RuledLine)):
|
||||||
|
return item
|
||||||
previous = self.book.last_page()
|
previous = self.book.last_page()
|
||||||
self.current_page = self.book.create_page()
|
self.current_page = self.book.create_page()
|
||||||
self.current_block = self.book.create_text_block()
|
self.current_block = self.book.create_text_block()
|
||||||
@ -418,7 +422,7 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
if not self.top.parent:
|
if not self.top.parent:
|
||||||
if not previous:
|
if not previous:
|
||||||
self.top = self.book.pages()[0].contents[0]
|
self.top = get_valid_block(self.book.pages()[0])
|
||||||
else:
|
else:
|
||||||
found = False
|
found = False
|
||||||
for page in self.book.pages():
|
for page in self.book.pages():
|
||||||
@ -426,7 +430,9 @@ class HTMLConverter(object):
|
|||||||
found = True
|
found = True
|
||||||
continue
|
continue
|
||||||
if found:
|
if found:
|
||||||
self.top = page.contents[0]
|
self.top = get_valid_block(page)
|
||||||
|
if not self.top:
|
||||||
|
continue
|
||||||
break
|
break
|
||||||
if not self.top.parent:
|
if not self.top.parent:
|
||||||
raise ConversionError, 'Could not parse ' + self.file_name
|
raise ConversionError, 'Could not parse ' + self.file_name
|
||||||
@ -455,7 +461,7 @@ class HTMLConverter(object):
|
|||||||
ans, found, page = None, False, bs.parent
|
ans, found, page = None, False, bs.parent
|
||||||
for item in page.contents:
|
for item in page.contents:
|
||||||
if found:
|
if found:
|
||||||
if isinstance(item, (TextBlock, ImageBlock)):
|
if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
|
||||||
ans = item
|
ans = item
|
||||||
break
|
break
|
||||||
if item == bs:
|
if item == bs:
|
||||||
@ -464,7 +470,7 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
if not ans:
|
if not ans:
|
||||||
for i in range(len(page.contents)-1, -1, -1):
|
for i in range(len(page.contents)-1, -1, -1):
|
||||||
if isinstance(page.contents[i], (TextBlock, ImageBlock)):
|
if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
|
||||||
ans = page.contents[i]
|
ans = page.contents[i]
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -497,8 +503,11 @@ class HTMLConverter(object):
|
|||||||
cb = CharButton(jb, text=self.get_text(tag))
|
cb = CharButton(jb, text=self.get_text(tag))
|
||||||
para.contents = []
|
para.contents = []
|
||||||
para.append(cb)
|
para.append(cb)
|
||||||
elif self.link_level < self.max_link_levels:
|
elif self.link_level < self.max_link_levels:
|
||||||
if not os.access(path, os.R_OK):
|
try: # os.access raises exceptions if the path has null bytes
|
||||||
|
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
|
||||||
|
raise Exception()
|
||||||
|
except Exception:
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print "Skipping", link
|
print "Skipping", link
|
||||||
continue
|
continue
|
||||||
@ -641,10 +650,12 @@ class HTMLConverter(object):
|
|||||||
if test.startswith('margin') or test.startswith('text') or \
|
if test.startswith('margin') or test.startswith('text') or \
|
||||||
'padding' in test or 'border' in test or 'page-break' in test \
|
'padding' in test or 'border' in test or 'page-break' in test \
|
||||||
or test.startswith('mso') or test.startswith('background')\
|
or test.startswith('mso') or test.startswith('background')\
|
||||||
or test in ['color', 'display', \
|
or test.startswith('line') or test in ['color', 'display', \
|
||||||
'letter-spacing',
|
'letter-spacing',
|
||||||
'font-variant']:
|
'font-variant']:
|
||||||
css.pop(key)
|
css.pop(key)
|
||||||
|
if self.verbose:
|
||||||
|
print 'Ignoring CSS key:', key
|
||||||
return css
|
return css
|
||||||
|
|
||||||
def end_current_para(self):
|
def end_current_para(self):
|
||||||
@ -730,7 +741,6 @@ class HTMLConverter(object):
|
|||||||
else:
|
else:
|
||||||
target = BlockSpace()
|
target = BlockSpace()
|
||||||
self.current_page.append(target)
|
self.current_page.append(target)
|
||||||
|
|
||||||
self.targets[tag['name']] = target
|
self.targets[tag['name']] = target
|
||||||
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
||||||
purl = urlparse(tag['href'])
|
purl = urlparse(tag['href'])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user