Fix #163 and auto-detect Baen files.

This commit is contained in:
Kovid Goyal 2007-08-19 21:42:21 +00:00
parent 04fbb91fae
commit e39dc4223f
2 changed files with 75 additions and 46 deletions

View File

@ -53,10 +53,9 @@ class Span(_Span):
@staticmethod @staticmethod
def unit_convert(val, dpi, ref=80, pts=False): def unit_convert(val, dpi, pts=False):
""" """
Tries to convert html units stored in C{val} to pixels. Tries to convert html units stored in C{val} to pixels. Assumes 100% = 10pt
@param ref: reference size in pixels for % units.
@param pts: If True return 10*pts instead of pixels. @param pts: If True return 10*pts instead of pixels.
@return: The number of pixels (an int) if successful. Otherwise, returns None. @return: The number of pixels (an int) if successful. Otherwise, returns None.
Assumes: One em is 10pts Assumes: One em is 10pts
@ -70,7 +69,8 @@ class Span(_Span):
if m is not None: if m is not None:
unit = float(m.group(1)) unit = float(m.group(1))
if m.group(2) == '%': if m.group(2) == '%':
result = int(unit/100.0*ref) normal = Span.unit_convert('10pt', dpi)
result = int((unit/100.0)*normal)
elif m.group(2) == 'px': elif m.group(2) == 'px':
result = int(unit) result = int(unit)
elif m.group(2) == 'in': elif m.group(2) == 'in':
@ -85,14 +85,13 @@ class Span(_Span):
result = int(unit * 0.04 * (dpi/72.)) result = int(unit * 0.04 * (dpi/72.))
elif m.group(2)== 'cm': elif m.group(2)== 'cm':
result = int(unit * 0.4 * (dpi/72.)) result = int(unit * 0.4 * (dpi/72.))
if result is None:
result = 0
if pts: if pts:
result = int((float(result)/dpi)*720) if result is not None:
result = int((float(result)/dpi)*720)
return result return result
@staticmethod @staticmethod
def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None): def translate_font_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
""" """
Receives a dictionary of html attributes and styles and returns Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary approximate Xylog equivalents in a new dictionary
@ -141,16 +140,13 @@ class Span(_Span):
def font_size(val): def font_size(val):
# Assumes a 10 pt font (14 pixels) has fontsize 100 normal = 100 #10*pts
ans = None ans = Span.unit_convert(val, dpi, pts=True)
normal = 14 if ans:
unit = Span.unit_convert(val, dpi, normal) if ans < 0:
if unit: ans += normal
if unit < 0: if ans < 0:
unit = normal + unit ans = normal
if unit < 0:
unit = normal
ans = int(unit * (72./dpi) * 10)
else: else:
if "xx-small" in val: if "xx-small" in val:
ans = 40 ans = 40
@ -211,7 +207,7 @@ class Span(_Span):
variant = font_variant(val) variant = font_variant(val)
if variant: if variant:
t['fontvariant'] = variant t['fontvariant'] = variant
else: elif memory is not None:
report = True report = True
if memory != None: if memory != None:
if key in memory: if key in memory:
@ -221,8 +217,10 @@ class Span(_Span):
if report: if report:
logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key]) logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
t['fontfacename'] = (family, font_key(family, style, weight)) t['fontfacename'] = (family, font_key(family, style, weight))
if t.has_key('fontsize') and int(t['fontsize']) > 120: if t.has_key('fontsize'):
t['wordspace'] = 50 if int(t['fontsize']) > 120:
t['wordspace'] = 50
t['baselineskip'] = int(t['fontsize']) + 20
return t return t
def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style, def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style,
@ -231,7 +229,7 @@ class Span(_Span):
for pat, repl in Span.rules: for pat, repl in Span.rules:
src = pat.sub(repl, src) src = pat.sub(repl, src)
src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup
attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory) attrs = Span.translate_font_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
if 'fontsize' in attrs.keys(): if 'fontsize' in attrs.keys():
normal_font_size = int(attrs['fontsize']) normal_font_size = int(attrs['fontsize'])
variant = attrs.pop('fontvariant', None) variant = attrs.pop('fontvariant', None)
@ -259,13 +257,14 @@ class Span(_Span):
attrs['fontweight'] = 700 attrs['fontweight'] = 700
if key in ['italic', 'bi']: if key in ['italic', 'bi']:
src = Italic(src) src = Italic(src)
if 'fontsize' in attrs.keys():
attrs['baselineskip'] = int(attrs['fontsize']) + 20
if attrs['fontfacename'] == fonts['serif']['normal'][1]: if attrs['fontfacename'] == fonts['serif']['normal'][1]:
attrs.pop('fontfacename') attrs.pop('fontfacename')
unneeded = []
for key in attrs: for key in attrs:
if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]): if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
attrs.pop(key) unneeded.append(key)
for key in unneeded:
attrs.pop(key)
self.text_src = src self.text_src = src
self.span_needed = bool(attrs) self.span_needed = bool(attrs)
_Span.__init__(self, text=src, **attrs) _Span.__init__(self, text=src, **attrs)
@ -395,6 +394,10 @@ class HTMLConverter(object):
self.book = book #: The Book object representing a BBeB book self.book = book #: The Book object representing a BBeB book
self.start_on_file(path, is_root=True) self.start_on_file(path, is_root=True)
def is_baen(self, soup):
    '''
    Detect whether a parsed HTML document declares Baen as its publisher.

    Searches for a C{<meta>} tag whose C{name} attribute is "Publisher" and
    whose C{content} attribute mentions "Baen". Both comparisons ignore
    case, because HTML metadata names are case-insensitive and files in the
    wild may spell the name as "publisher" or "PUBLISHER".

    @param soup: Parsed document. Must provide a BeautifulSoup-style
                 C{find(name, attrs=...)} method that accepts compiled
                 regular expressions as attribute matchers.
    @return: True if a matching publisher tag is found, False otherwise.
    '''
    publisher_meta = soup.find('meta', attrs={
        # Anchored so that only the "publisher" metadata name matches,
        # regardless of letter case or stray surrounding whitespace.
        'name': re.compile(r'^\s*publisher\s*$', re.IGNORECASE),
        'content': re.compile('Baen', re.IGNORECASE)})
    return bool(publisher_meta)
def start_on_file(self, path, is_root=True, link_level=0): def start_on_file(self, path, is_root=True, link_level=0):
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) os.chdir(os.path.dirname(path))
@ -413,6 +416,10 @@ class HTMLConverter(object):
soup = BeautifulSoup(raw, soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage) markupMassage=nmassage)
if not self.baen and self.is_baen(soup):
self.baen = True
self.logger.info('Baen file detected. Re-parsing...')
return self.start_on_file(path, is_root=is_root, link_level=link_level)
self.logger.info('\tConverting to BBeB...') self.logger.info('\tConverting to BBeB...')
sys.stdout.flush() sys.stdout.flush()
self.current_page = None self.current_page = None
@ -990,7 +997,7 @@ class HTMLConverter(object):
self.logger.debug('Forcing page break at %s', tagname) self.logger.debug('Forcing page break at %s', tagname)
return end_page return end_page
def process_block(self, tag, tag_css, tkey): def process_block(self, tag, tag_css, tkey):
''' Ensure padding and text-indent properties are respected ''' ''' Ensure padding and text-indent properties are respected '''
if tag_css.has_key('text-indent'): if tag_css.has_key('text-indent'):
indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True) indent = Span.unit_convert(str(tag_css['text-indent']), self.profile.dpi, pts=True)
@ -998,7 +1005,6 @@ class HTMLConverter(object):
indent = 0 indent = 0
if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent: if hasattr(self, 'minimum_indent') and indent > 0 and indent < self.minimum_indent:
indent = self.minimum_indent indent = self.minimum_indent
else: else:
indent = self.book.defaultTextStyle.attrs['parindent'] indent = self.book.defaultTextStyle.attrs['parindent']
@ -1017,14 +1023,32 @@ class HTMLConverter(object):
top = Span.unit_convert(top, self.profile.dpi) if top is not None else 0 top = Span.unit_convert(top, self.profile.dpi) if top is not None else 0
bottom = Span.unit_convert(bottom, self.profile.dpi) if bottom is not None else 0 bottom = Span.unit_convert(bottom, self.profile.dpi) if bottom is not None else 0
left = Span.unit_convert(left, self.profile.dpi) if left is not None else 0 left = Span.unit_convert(left, self.profile.dpi) if left is not None else 0
fonts = Span.translate_font_attrs(tag_css, self.profile.dpi, self.fonts,
if indent != int(self.current_block.textStyle.attrs['parindent']) or \ self.logger, self.font_delta, None)
fonts_changed = False
fonts.pop('fontvariant', None)
family, key = fonts['fontfacename']
if self.fonts[family].has_key(key):
fonts['fontfacename'] = self.fonts[family][key][1]
else:
fonts['fontfacename'] = self.fonts[family]['normal'][1]
for key in fonts.keys():
if str(self.current_block.textStyle.attrs[key]) != str(fonts[key]):
fonts_changed = True
break
if fonts_changed or \
indent != int(self.current_block.textStyle.attrs['parindent']) or \
top != int(self.current_block.blockStyle.attrs['topskip']) or \ top != int(self.current_block.blockStyle.attrs['topskip']) or \
bottom != int(self.current_block.blockStyle.attrs['footskip']) or \ bottom != int(self.current_block.blockStyle.attrs['footskip']) or \
left != int(self.current_block.blockStyle.attrs['sidemargin']): left != int(self.current_block.blockStyle.attrs['sidemargin']):
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
ts.attrs['parindent'] = indent ts.attrs['parindent'] = indent
for key in ('fontfacename', 'fontsize', 'fontwidth', 'wordspace', 'baselineskip'):
ts.attrs[key] = self.book.defaultTextStyle.attrs[key]
for key in fonts:
ts.attrs[key] = fonts[key]
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
ba = bs.attrs ba = bs.attrs
ba['topskip'], ba['footskip'], ba['sidemargin'] = top, bottom, left ba['topskip'], ba['footskip'], ba['sidemargin'] = top, bottom, left
@ -1177,7 +1201,7 @@ class HTMLConverter(object):
elif tagname == 'pre': elif tagname == 'pre':
self.end_current_para() self.end_current_para()
self.end_current_block() self.end_current_block()
self.current_block.textStyle = self.current_block.textStyle.copy() self.current_block = self.book.create_text_block()
self.current_block.textStyle.attrs['parindent'] = '0' self.current_block.textStyle.attrs['parindent'] = '0'
if tag.contents: if tag.contents:
c = tag.contents[0] c = tag.contents[0]
@ -1247,14 +1271,14 @@ class HTMLConverter(object):
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
pb = self.current_block pb = self.current_block
self.current_para = Paragraph() self.current_para = Paragraph()
ts = self.book.create_text_style(**self.current_block.textStyle.attrs) ts = self.book.create_text_style()
ts.attrs['parindent'] = 0 ts.attrs['parindent'] = 0
try: try:
index = self.text_styles.index(ts) index = self.text_styles.index(ts)
ts = self.text_styles[index] ts = self.text_styles[index]
except ValueError: except ValueError:
self.text_styles.append(ts) self.text_styles.append(ts)
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs) bs = self.book.create_block_style()
bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \ bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
60, 20, 20 60, 20, 20
try: try:
@ -1297,19 +1321,25 @@ class HTMLConverter(object):
self.logger.debug('Detected chapter %s', src) self.logger.debug('Detected chapter %s', src)
self.end_page() self.end_page()
self.page_break_found = True self.page_break_found = True
self.end_current_para() if not tag.contents:
if not tag.contents or not src.strip(): # Handle empty <p></p> elements
self.current_block.append(CR()) self.current_block.append(CR())
self.previous_text = '\n' self.current_block.must_append = True
self.process_children(tag, tag_css)
return return
self.previous_text = '\n' if not self.in_table:
self.process_block(tag, tag_css, tkey) self.process_block(tag, tag_css, tkey)
self.process_children(tag, tag_css) if self.current_para.contents:
self.end_current_para() self.current_block.append(self.current_para)
if tagname.startswith('h') or self.blank_after_para: if self.current_block.contents:
self.current_block.append(CR()) self.current_block.append(CR())
self.previous_text = '\n' self.previous_text = '\n'
self.current_para = Paragraph()
self.process_children(tag, tag_css)
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para:
self.current_block.append(CR())
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']: elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
elif tagname == 'font': elif tagname == 'font':
@ -1350,6 +1380,7 @@ class HTMLConverter(object):
def process_table(self, tag, tag_css): def process_table(self, tag, tag_css):
self.end_current_block() self.end_current_block()
self.current_block = self.book.create_text_block()
rowpad = 10 rowpad = 10
table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10) table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
canvases = [] canvases = []

View File

@ -24,7 +24,7 @@
</ul> </ul>
<h2><a name='lists'>Lists</a></h2> <h2><a name='lists'>Lists</a></h2>
<p></p>
<h3>Nested lists</h3> <h3>Nested lists</h3>
<ol> <ol>
<li>Item 1</li> <li>Item 1</li>
@ -37,9 +37,7 @@
</ol> </ol>
</ul> </ul>
<li>Item 2</li> <li>Item 2</li>
</ol> </ol>
</p>
<br/>
<p></p> <p></p>
<h3>Definition Lists</h3> <h3>Definition Lists</h3>
<dl> <dl>