Fix #4371 (Conversion to FB2)

This commit is contained in:
Kovid Goyal 2010-01-04 01:22:29 -07:00
commit 1661dbf0ce
3 changed files with 29 additions and 30 deletions

View File

@ -32,12 +32,9 @@ TAG_MAP = {
'p' : 'p', 'p' : 'p',
'li' : 'p', 'li' : 'p',
'div': 'p', 'div': 'p',
'br' : 'p',
} }
TAG_FORCE_P = [
'br',
]
TAG_SPACE = [] TAG_SPACE = []
TAG_IMAGES = [ TAG_IMAGES = [
@ -48,6 +45,10 @@ TAG_LINKS = [
'a', 'a',
] ]
BLOCK = [
'p',
]
STYLES = [ STYLES = [
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}), ('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}), ('font-style', {'italic' : 'emphasis'}),
@ -240,7 +241,8 @@ class FB2MLizer(object):
if id_name: if id_name:
fb2_text.append(self.get_anchor(page, id_name)) fb2_text.append(self.get_anchor(page, id_name))
if tag in TAG_FORCE_P: fb2_tag = TAG_MAP.get(tag, None)
if fb2_tag == 'p':
if 'p' in tag_stack+tags: if 'p' in tag_stack+tags:
# Close all up to p. Close p. Reopen all closed tags including p. # Close all up to p. Close p. Reopen all closed tags including p.
all_tags = tag_stack+tags all_tags = tag_stack+tags
@ -257,9 +259,7 @@ class FB2MLizer(object):
else: else:
fb2_text.append('<p>') fb2_text.append('<p>')
tags.append('p') tags.append('p')
elif fb2_tag and fb2_tag not in tag_stack+tags:
fb2_tag = TAG_MAP.get(tag, None)
if fb2_tag and fb2_tag not in tag_stack+tags:
fb2_text.append('<%s>' % fb2_tag) fb2_text.append('<%s>' % fb2_tag)
tags.append(fb2_tag) tags.append(fb2_tag)

View File

@ -42,6 +42,7 @@ STYLES = [
BLOCK_TAGS = [ BLOCK_TAGS = [
'p', 'p',
'div',
] ]
BLOCK_STYLES = [ BLOCK_STYLES = [
@ -188,7 +189,7 @@ class PMLMLizer(object):
text = re.sub('\n{2,}', '\n', text) text = re.sub('\n{2,}', '\n', text)
text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCm]', mo.group('text')) else ' %s' % mo.group('text'), text) text = re.sub('(?imu)^(?P<text>.+)$', lambda mo: mo.group('text') if re.search(r'\\[XxCm]', mo.group('text')) else ' %s' % mo.group('text'), text)
else: else:
text = re.sub('\n{4,}', '\n\n\n', text) text = re.sub('\n{3,}', '\n\n', text)
return text return text
@ -199,6 +200,7 @@ class PMLMLizer(object):
return [] return []
text = [] text = []
tags = []
style = stylizer.style(elem) style = stylizer.style(elem)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
@ -206,13 +208,14 @@ class PMLMLizer(object):
return [] return []
tag = barename(elem.tag) tag = barename(elem.tag)
tag_count = 0
# Are we in a paragraph block? # Are we in a paragraph block?
if tag in BLOCK_TAGS: # or style['display'] in BLOCK_STYLES: if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
if 'block' not in tag_stack: if 'block' not in tag_stack+tags:
tag_count += 1 tags.append('block')
tag_stack.append('block') else:
# Start new block
text.append('\n\n')
# Process tags that need special processing and that do not have inner # Process tags that need special processing and that do not have inner
# text. Usually these require an argument # text. Usually these require an argument
@ -245,14 +248,13 @@ class PMLMLizer(object):
# text.append('\\p') # text.append('\\p')
pml_tag = TAG_MAP.get(tag, None) pml_tag = TAG_MAP.get(tag, None)
if pml_tag and pml_tag not in tag_stack: if pml_tag and pml_tag not in tag_stack+tags:
tag_count += 1
text.append('\\%s' % pml_tag) text.append('\\%s' % pml_tag)
tag_stack.append(pml_tag) tags.append(pml_tag)
# Special processing of tags that require an argument. # Special processing of tags that require an argument.
# Anchors links # Anchors links
if tag in LINK_TAGS and 'q' not in tag_stack: if tag in LINK_TAGS and 'q' not in tag_stack+tags:
href = elem.get('href') href = elem.get('href')
if href: if href:
href = page.abshref(href) href = page.abshref(href)
@ -263,8 +265,7 @@ class PMLMLizer(object):
self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys()) self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
href = self.link_hrefs[href] href = self.link_hrefs[href]
text.append('\\q="#%s"' % href) text.append('\\q="#%s"' % href)
tag_count += 1 tags.append('q')
tag_stack.append('q')
# Anchor ids # Anchor ids
id_name = elem.get('id') id_name = elem.get('id')
@ -274,10 +275,9 @@ class PMLMLizer(object):
# Processes style information # Processes style information
for s in STYLES: for s in STYLES:
style_tag = s[1].get(style[s[0]], None) style_tag = s[1].get(style[s[0]], None)
if style_tag and style_tag not in tag_stack: if style_tag and style_tag not in tag_stack+tags:
tag_count += 1
text.append('\\%s' % style_tag) text.append('\\%s' % style_tag)
tag_stack.append(style_tag) tags.append(style_tag)
# margin # margin
# Process tags that contain text. # Process tags that contain text.
@ -285,16 +285,15 @@ class PMLMLizer(object):
text.append(self.remove_newlines(elem.text)) text.append(self.remove_newlines(elem.text))
for item in elem: for item in elem:
text += self.dump_text(item, stylizer, page, tag_stack) text += self.dump_text(item, stylizer, page, tag_stack+tags)
tags.reverse()
text += self.close_tags(tags)
close_tag_list = []
for i in range(0, tag_count):
close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list)
if tag in SEPARATE_TAGS: if tag in SEPARATE_TAGS:
text.append('\n\n') text.append('\n\n')
if 'block' not in tag_stack: if 'block' not in tag_stack+tags:
text.append('\n\n') text.append('\n\n')
#if style['page-break-after'] == 'always': #if style['page-break-after'] == 'always':

View File

@ -102,7 +102,7 @@ class TXTMLizer(object):
text = re.sub('\n{2,}', '\n', text) text = re.sub('\n{2,}', '\n', text)
text = re.sub('(?imu)^(?=.)', '\t', text) text = re.sub('(?imu)^(?=.)', '\t', text)
else: else:
text = re.sub('\n{4,}', '\n\n\n', text) text = re.sub('\n{3,}', '\n\n', text)
# Replace spaces at the beginning and end of lines # Replace spaces at the beginning and end of lines
text = re.sub('(?imu)^[ ]+', '', text) text = re.sub('(?imu)^[ ]+', '', text)