DOCX Input: Convert tabs to non-breaking spaces

DOCX Input: Convert tabs in the DOCX file into runs of non-breaking spaces in
the output document, sized from the document's default tab stop. Note that custom tab stops are not supported.
Fixes #1228893 [Converting .docx file fails to preserve para first line indent](https://bugs.launchpad.net/calibre/+bug/1228893)
This commit is contained in:
Kovid Goyal 2013-09-23 10:34:09 +05:30
parent 2c46d54c47
commit a4f6d6d19e
4 changed files with 40 additions and 2 deletions

View File

@ -23,6 +23,7 @@ LINKS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes' FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes' ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme' THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings'
namespaces = { namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.docx.names import XPath, get
class Settings(object):
    """Document-wide settings parsed from a DOCX settings XML tree.

    Only one value is extracted: ``default_tab_stop``. It is the ``w:val``
    attribute of a ``w:defaultTabStop`` element divided by 20 (presumably
    converting twentieths of a point to points -- confirm against the
    OOXML specification). If no usable element is present the value stays
    at ``720 / 20``.
    """

    def __init__(self, root):
        """Read the default tab stop from ``root``.

        :param root: parsed XML root that is searched for
            ``w:defaultTabStop`` elements carrying a ``w:val`` attribute.
        """
        # Fallback used when the document supplies no parseable value.
        self.default_tab_stop = 720 / 20
        find_tab_stops = XPath('//w:defaultTabStop[@w:val]')
        for node in find_tab_stops(root):
            try:
                raw_value = get(node, 'w:val')
                self.default_tab_stop = int(raw_value) / 20
            except (ValueError, TypeError, AttributeError):
                # Unparseable value: keep whatever was set before and
                # move on to the next match (the last good match wins).
                continue

View File

@ -458,6 +458,8 @@ class Styles(object):
dl.notes dd:last-of-type { page-break-after: avoid } dl.notes dd:last-of-type { page-break-after: avoid }
span.tab { white-space: pre }
''') % (self.body_font_family, self.body_font_size, self.body_color) ''') % (self.body_font_family, self.body_font_size, self.body_color)
if ef: if ef:
prefix = ef + '\n' + prefix prefix = ef + '\n' + prefix

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re import sys, os, re, math
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from lxml import html from lxml import html
@ -16,7 +16,7 @@ from lxml.html.builder import (
from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import ( from calibre.ebooks.docx.names import (
XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor, XPath, is_tag, XML, STYLES, NUMBERING, FONTS, get, generate_anchor,
descendants, FOOTNOTES, ENDNOTES, children, THEMES) descendants, FOOTNOTES, ENDNOTES, children, THEMES, SETTINGS)
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts from calibre.ebooks.docx.fonts import Fonts
@ -27,6 +27,7 @@ from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
@ -227,6 +228,7 @@ class Convert(object):
nname = get_name(NUMBERING, 'numbering.xml') nname = get_name(NUMBERING, 'numbering.xml')
sname = get_name(STYLES, 'styles.xml') sname = get_name(STYLES, 'styles.xml')
sename = get_name(SETTINGS, 'settings.xml')
fname = get_name(FONTS, 'fontTable.xml') fname = get_name(FONTS, 'fontTable.xml')
tname = get_name(THEMES, 'theme1.xml') tname = get_name(THEMES, 'theme1.xml')
foname = get_name(FOOTNOTES, 'footnotes.xml') foname = get_name(FOOTNOTES, 'footnotes.xml')
@ -237,6 +239,14 @@ class Convert(object):
foraw = enraw = None foraw = enraw = None
forel, enrel = ({}, {}), ({}, {}) forel, enrel = ({}, {}), ({}, {})
if sename is not None:
try:
seraw = self.docx.read(sename)
except KeyError:
self.log.warn('Settings %s do not exist' % sename)
else:
self.settings = Settings(fromstring(seraw))
if foname is not None: if foname is not None:
try: try:
foraw = self.docx.read(foname) foraw = self.docx.read(foname)
@ -538,6 +548,11 @@ class Convert(object):
l.set('class', 'noteref') l.set('class', 'noteref')
text.add_elem(l) text.add_elem(l)
ans.append(text.elem) ans.append(text.elem)
elif is_tag(child, 'w:tab'):
spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem)
ans[-1].set('class', 'tab')
if text.buf: if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf)) setattr(text.elem, text.attr, ''.join(text.buf))