RTF Input: Handle underlined text. Fixes #845328 (Underlined text in RTF not propagated when converted to ePub)

This commit is contained in:
Kovid Goyal 2011-09-10 08:45:45 -06:00
commit f1867f1128
18 changed files with 139 additions and 117 deletions

View File

@ -98,7 +98,7 @@
<xsl:apply-templates/> <xsl:apply-templates/>
</emph> </emph>
</xsl:when> </xsl:when>
<xsl:when test = "@underlined"> <xsl:when test = "@underlined and @underlined != 'false'">
<emph rend = "paragraph-emph-underlined"> <emph rend = "paragraph-emph-underlined">
<xsl:apply-templates/> <xsl:apply-templates/>
</emph> </emph>
@ -220,7 +220,7 @@
</xsl:template> </xsl:template>
<xsl:template name="parse-styles-attrs"> <xsl:template name="parse-styles-attrs">
<!--<xsl:text>position:relative;</xsl:text>--> <!--<xsl:text>position:relative;</xsl:text>
<xsl:if test="@space-before"> <xsl:if test="@space-before">
<xsl:text>padding-top:</xsl:text> <xsl:text>padding-top:</xsl:text>
<xsl:value-of select="@space-before"/> <xsl:value-of select="@space-before"/>
@ -230,7 +230,7 @@
<xsl:text>padding-bottom:</xsl:text> <xsl:text>padding-bottom:</xsl:text>
<xsl:value-of select="@space-after"/> <xsl:value-of select="@space-after"/>
<xsl:text>pt;</xsl:text> <xsl:text>pt;</xsl:text>
</xsl:if> </xsl:if>-->
<xsl:if test="@left-indent"> <xsl:if test="@left-indent">
<xsl:text>padding-left:</xsl:text> <xsl:text>padding-left:</xsl:text>
<xsl:value-of select="@left-indent"/> <xsl:value-of select="@left-indent"/>
@ -256,15 +256,15 @@
<xsl:value-of select="'italic'"/> <xsl:value-of select="'italic'"/>
<xsl:text>;</xsl:text> <xsl:text>;</xsl:text>
</xsl:if> </xsl:if>
<xsl:if test="@underline and @underline != 'false'"> <xsl:if test="@underlined and @underlined != 'false'">
<xsl:text>text-decoration:underline</xsl:text> <xsl:text>text-decoration:underline</xsl:text>
<xsl:text>;</xsl:text> <xsl:text>;</xsl:text>
</xsl:if> </xsl:if>
<xsl:if test="@line-spacing"> <!--<xsl:if test="@line-spacing">
<xsl:text>line-height:</xsl:text> <xsl:text>line-height:</xsl:text>
<xsl:value-of select="@line-spacing"/> <xsl:value-of select="@line-spacing"/>
<xsl:text>pt;</xsl:text> <xsl:text>pt;</xsl:text>
</xsl:if> </xsl:if>-->
<xsl:if test="(@align = 'just')"> <xsl:if test="(@align = 'just')">
<xsl:text>text-align: justify;</xsl:text> <xsl:text>text-align: justify;</xsl:text>
</xsl:if> </xsl:if>
@ -314,7 +314,6 @@
</xsl:attribute> </xsl:attribute>
<xsl:apply-templates/> <xsl:apply-templates/>
</xsl:element> </xsl:element>
</xsl:otherwise> </xsl:otherwise>
</xsl:choose> </xsl:choose>
</xsl:template> </xsl:template>
@ -452,6 +451,15 @@
<xsl:apply-templates/> <xsl:apply-templates/>
</xsl:element> </xsl:element>
</xsl:template> </xsl:template>
<xsl:template match = "rtf:field[@type='bookmark-start']">
<xsl:element name ="a">
<xsl:attribute name = "id">
<xsl:value-of select = "@number"/>
</xsl:attribute>
<xsl:apply-templates/>
</xsl:element>
</xsl:template>
<xsl:template match = "rtf:field"> <xsl:template match = "rtf:field">
<xsl:apply-templates/> <xsl:apply-templates/>

View File

@ -93,7 +93,7 @@ def get_metadata(stream):
stream.seek(0) stream.seek(0)
cpg = detect_codepage(stream) cpg = detect_codepage(stream)
stream.seek(0) stream.seek(0)
title_match = title_pat.search(block) title_match = title_pat.search(block)
if title_match is not None: if title_match is not None:
title = decode(title_match.group(1).strip(), cpg) title = decode(title_match.group(1).strip(), cpg)
@ -162,7 +162,8 @@ def set_metadata(stream, options):
index = src.rindex('}') index = src.rindex('}')
return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}' return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}'
src, pos = get_document_info(stream) src, pos = get_document_info(stream)
if not src: print 'I was thre'
if src is not None:
create_metadata(stream, options) create_metadata(stream, options)
else: else:
olen = len(src) olen = len(src)

View File

@ -41,7 +41,7 @@ border_style_map = {
class InlineClass(etree.XSLTExtension): class InlineClass(etree.XSLTExtension):
FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps') FMTS = ('italics', 'bold', 'strike-through', 'small-caps')
def __init__(self, log): def __init__(self, log):
etree.XSLTExtension.__init__(self) etree.XSLTExtension.__init__(self)
@ -54,6 +54,9 @@ class InlineClass(etree.XSLTExtension):
for x in self.FMTS: for x in self.FMTS:
if input_node.get(x, None) == 'true': if input_node.get(x, None) == 'true':
classes.append(x) classes.append(x)
#underlined is special
if input_node.get('underlined', 'false') != 'false':
classes.append('underlined')
fs = input_node.get('font-size', False) fs = input_node.get('font-size', False)
if fs: if fs:
if fs not in self.font_sizes: if fs not in self.font_sizes:
@ -78,12 +81,13 @@ class RTFInput(InputFormatPlugin):
def generate_xml(self, stream): def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = 'dataxml.xml' ofile = 'dataxml.xml'
run_lev, debug_dir = 1, None run_lev, debug_dir, indent_out = 1, None, 0
if getattr(self.opts, 'debug_pipeline', None) is not None: if getattr(self.opts, 'debug_pipeline', None) is not None:
try: try:
os.mkdir(debug_dir) os.mkdir('rtfdebug')
debug_dir = 'rtfdebug' debug_dir = 'rtfdebug'
run_lev = 4 run_lev = 4
indent_out = 1
self.log('Running RTFParser in debug mode') self.log('Running RTFParser in debug mode')
except: except:
self.log.warn('Impossible to run RTFParser in debug mode') self.log.warn('Impossible to run RTFParser in debug mode')
@ -108,7 +112,7 @@ class RTFInput(InputFormatPlugin):
# Indent resulting XML. # Indent resulting XML.
# Default is 0 (no indent). # Default is 0 (no indent).
indent = 1, indent = indent_out,
# Form lists from RTF. Default is 1. # Form lists from RTF. Default is 1.
form_lists = 1, form_lists = 1,
@ -157,7 +161,8 @@ class RTFInput(InputFormatPlugin):
with open(name, 'wb') as f: with open(name, 'wb') as f:
f.write(data) f.write(data)
imap[count] = name imap[count] = name
#open(name+'.hex', 'wb').write(enc) # with open(name+'.hex', 'wb') as f:
# f.write(enc)
return self.convert_images(imap) return self.convert_images(imap)
def convert_images(self, imap): def convert_images(self, imap):
@ -319,4 +324,6 @@ class RTFInput(InputFormatPlugin):
opf.render(open('metadata.opf', 'wb')) opf.render(open('metadata.opf', 'wb'))
return os.path.abspath('metadata.opf') return os.path.abspath('metadata.opf')
#ebook-convert "bad.rtf" test.epub -v -d "E:\Mes eBooks\Developpement\debug"
# os.makedirs("E:\\Mes eBooks\\Developpement\\rtfdebug")
# debug_dir = "E:\\Mes eBooks\\Developpement\\rtfdebug"

View File

@ -372,17 +372,17 @@ class ParseRtf:
old_rtf = old_rtf_obj.check_if_old_rtf() old_rtf = old_rtf_obj.check_if_old_rtf()
if old_rtf: if old_rtf:
if self.__run_level > 5: if self.__run_level > 5:
msg = 'older RTF\n' msg = 'Older RTF\n'
msg += 'self.__run_level is "%s"\n' % self.__run_level msg += 'self.__run_level is "%s"\n' % self.__run_level
raise RtfInvalidCodeException, msg raise RtfInvalidCodeException, msg
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write('File could be older RTF...\n') sys.stderr.write(_('File could be older RTF...\n'))
if found_destination: if found_destination:
if self.__run_level > 1: if self.__run_level > 1:
sys.stderr.write( sys.stderr.write(_(
'File also has newer RTF.\n' 'File also has newer RTF.\n'
'Will do the best to convert.\n' 'Will do the best to convert.\n'
) ))
add_brackets_obj = add_brackets.AddBrackets( add_brackets_obj = add_brackets.AddBrackets(
in_file = self.__temp_file, in_file = self.__temp_file,
bug_handler = RtfInvalidCodeException, bug_handler = RtfInvalidCodeException,

View File

@ -53,4 +53,3 @@ class CheckBrackets:
'total number of brackets is %s') % self.__bracket_count 'total number of brackets is %s') % self.__bracket_count
return (False, msg) return (False, msg)
return (True, "Brackets match!") return (True, "Brackets match!")

View File

@ -25,7 +25,7 @@ class Configure:
if self.__show_config_file and self.__configuration_file: if self.__show_config_file and self.__configuration_file:
sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file) sys.stderr.write('configuration file is "%s"\n' % self.__configuration_file)
if self.__show_config_file and not self.__configuration_file: if self.__show_config_file and not self.__configuration_file:
sys.stderr.write('No configuraiton file found; using default vaules\n') sys.stderr.write('No configuraiton file found; using default values\n')
if self.__configuration_file: if self.__configuration_file:
read_obj = open(self.__configuration_file, 'r') read_obj = open(self.__configuration_file, 'r')
line_to_read = 1 line_to_read = 1

View File

@ -43,6 +43,7 @@ class DeleteInfo:
'cw<it<listtable_', 'cw<it<listtable_',
'cw<it<revi-table', 'cw<it<revi-table',
'cw<ls<list-lev-d', 'cw<ls<list-lev-d',
# Field allowed
'cw<fd<field-inst', 'cw<fd<field-inst',
'cw<an<book-mk-st', 'cw<an<book-mk-st',
'cw<an<book-mk-en', 'cw<an<book-mk-en',
@ -81,7 +82,7 @@ class DeleteInfo:
self.__ob = line self.__ob = line
return False return False
else: else:
# write previous bracket, since didn't fine asterisk # write previous bracket, since didn't find asterisk
if self.__ob: if self.__ob:
self.__write_obj.write(self.__ob) self.__write_obj.write(self.__ob)
self.__ob = 0 self.__ob = 0
@ -104,7 +105,7 @@ class DeleteInfo:
If you find that you are in a delete group, and the previous If you find that you are in a delete group, and the previous
token in not an open bracket (self.__ob = 0), that means token in not an open bracket (self.__ob = 0), that means
that the delete group is nested inside another acceptable that the delete group is nested inside another acceptable
detination group. In this case, you have alrady written detination group. In this case, you have already written
the open bracket, so you will need to write the closed one the open bracket, so you will need to write the closed one
as well. as well.
""" """

View File

@ -10,8 +10,10 @@
# # # #
# # # #
######################################################################### #########################################################################
import sys, os, tempfile, re import sys, os, tempfile, re
from calibre.ebooks.rtf2xml import field_strings, copy from calibre.ebooks.rtf2xml import field_strings, copy
class FieldsSmall: class FieldsSmall:
""" """
================= =================
@ -19,7 +21,7 @@ Purpose
================= =================
Write tags for bookmarks, index and toc entry fields in a tokenized file. Write tags for bookmarks, index and toc entry fields in a tokenized file.
This module does not handle toc or index tables. (This module won't be any This module does not handle toc or index tables. (This module won't be any
use to use to you unless you use it as part of the other modules.) use to you unless you use it as part of the other modules.)
----------- -----------
Method Method
----------- -----------
@ -50,6 +52,7 @@ file.
self.__copy = copy self.__copy = copy
self.__write_to = tempfile.mktemp() self.__write_to = tempfile.mktemp()
self.__run_level = run_level self.__run_level = run_level
def __initiate_values(self): def __initiate_values(self):
""" """
Initiate all values. Initiate all values.
@ -76,6 +79,7 @@ file.
tx = 'tx<nu<__________<(.*?)' tx = 'tx<nu<__________<(.*?)'
reg_st = ob + bk_st + tx + cb reg_st = ob + bk_st + tx + cb
self.__book_start = re.compile(r'%s' % reg_st) self.__book_start = re.compile(r'%s' % reg_st)
def __before_body_func(self, line): def __before_body_func(self, line):
""" """
Requires: Requires:
@ -89,6 +93,7 @@ file.
if self.__token_info == 'mi<mk<body-open_': if self.__token_info == 'mi<mk<body-open_':
self.__state = 'body' self.__state = 'body'
self.__write_obj.write(line) self.__write_obj.write(line)
def __body_func(self, line): def __body_func(self, line):
""" """
Requires: Requires:
@ -105,6 +110,7 @@ file.
action(line, tag) action(line, tag)
else: else:
self.__write_obj.write(line) self.__write_obj.write(line)
def __found_bookmark_func(self, line, tag): def __found_bookmark_func(self, line, tag):
""" """
Requires: Requires:
@ -120,6 +126,7 @@ file.
self.__cb_count = 0 self.__cb_count = 0
self.__state = 'bookmark' self.__state = 'bookmark'
self.__type_of_bookmark = tag self.__type_of_bookmark = tag
def __bookmark_func(self, line): def __bookmark_func(self, line):
""" """
Requires: Requires:
@ -148,6 +155,7 @@ file.
self.__write_obj.write(line) self.__write_obj.write(line)
elif line[0:2] == 'tx': elif line[0:2] == 'tx':
self.__text_string += line[17:-1] self.__text_string += line[17:-1]
def __parse_index_func(self, my_string): def __parse_index_func(self, my_string):
""" """
Requires: Requires:
@ -196,6 +204,7 @@ file.
my_changed_string += '<sub-entry>%s' % sub_entry my_changed_string += '<sub-entry>%s' % sub_entry
my_changed_string += '\n' my_changed_string += '\n'
return my_changed_string return my_changed_string
def __index_see_func(self, my_string): def __index_see_func(self, my_string):
in_see = 0 in_see = 0
bracket_count = 0 bracket_count = 0
@ -221,6 +230,7 @@ file.
in_see = 1 in_see = 1
changed_string += '%s\n' % line changed_string += '%s\n' % line
return changed_string, see_string return changed_string, see_string
def __index_bookmark_func(self, my_string): def __index_bookmark_func(self, my_string):
""" """
Requries: Requries:
@ -257,6 +267,7 @@ file.
in_bookmark = 1 in_bookmark = 1
index_string += '%s\n' % line index_string += '%s\n' % line
return index_string, bookmark_string return index_string, bookmark_string
def __index__format_func(self, my_string): def __index__format_func(self, my_string):
italics = 0 italics = 0
bold =0 bold =0
@ -268,6 +279,7 @@ file.
if token_info == 'cw<in<index-ital': if token_info == 'cw<in<index-ital':
italics = 1 italics = 1
return italics, bold return italics, bold
def __parse_toc_func(self, my_string): def __parse_toc_func(self, my_string):
""" """
Requires: Requires:
@ -303,6 +315,7 @@ file.
my_changed_string += '<main-entry>%s' % main_entry my_changed_string += '<main-entry>%s' % main_entry
my_changed_string += '\n' my_changed_string += '\n'
return my_changed_string return my_changed_string
def __parse_bookmark_for_toc(self, my_string): def __parse_bookmark_for_toc(self, my_string):
""" """
Requires: Requires:
@ -348,6 +361,7 @@ file.
in_bookmark = 1 in_bookmark = 1
toc_string += '%s\n' % line toc_string += '%s\n' % line
return toc_string, book_start_string, book_end_string return toc_string, book_start_string, book_end_string
def __parse_bookmark_func(self, my_string, type): def __parse_bookmark_func(self, my_string, type):
""" """
Requires: Requires:
@ -362,6 +376,7 @@ file.
my_changed_string = ('mi<tg<empty-att_<field<type>%s' my_changed_string = ('mi<tg<empty-att_<field<type>%s'
'<number>%s<update>none\n' % (type, my_string)) '<number>%s<update>none\n' % (type, my_string))
return my_changed_string return my_changed_string
def __found_toc_index_func(self, line, tag): def __found_toc_index_func(self, line, tag):
""" """
Requires: Requires:
@ -377,6 +392,7 @@ file.
self.__cb_count = 0 self.__cb_count = 0
self.__state = 'toc_index' self.__state = 'toc_index'
self.__tag = tag self.__tag = tag
def __toc_index_func(self, line): def __toc_index_func(self, line):
""" """
Requires: Requires:
@ -404,6 +420,7 @@ file.
self.__write_obj.write(line) self.__write_obj.write(line)
else: else:
self.__text_string += line self.__text_string += line
def fix_fields(self): def fix_fields(self):
""" """
Requires: Requires:
@ -418,24 +435,19 @@ file.
bookmark. bookmark.
""" """
self.__initiate_values() self.__initiate_values()
read_obj = open(self.__file) with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
line_to_read = '1' for line in read_obj:
while line_to_read: self.__token_info = line[:16]
line_to_read = read_obj.readline() if self.__token_info == 'ob<nu<open-brack':
line = line_to_read self.__ob_count = line[-5:-1]
self.__token_info = line[:16] if self.__token_info == 'cb<nu<clos-brack':
if self.__token_info == 'ob<nu<open-brack': self.__cb_count = line[-5:-1]
self.__ob_count = line[-5:-1] action = self.__state_dict.get(self.__state)
if self.__token_info == 'cb<nu<clos-brack': if action is None:
self.__cb_count = line[-5:-1] sys.stderr.write('No matching state in module fields_small.py\n')
action = self.__state_dict.get(self.__state) sys.stderr.write(self.__state + '\n')
if action == None: action(line)
sys.stderr.write('no no matching state in module fields_small.py\n')
sys.stderr.write(self.__state + '\n')
action(line)
read_obj.close()
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "fields_small.data") copy_obj.copy_file(self.__write_to, "fields_small.data")

View File

@ -25,8 +25,6 @@ class GetCharMap:
'char_file'--the file with the mappings 'char_file'--the file with the mappings
Returns: Returns:
nothing nothing
@ -57,7 +55,6 @@ class GetCharMap:
fields[1].replace('\\colon', ':') fields[1].replace('\\colon', ':')
map_dict[fields[1]] = fields[3] map_dict[fields[1]] = fields[3]
if not found_map: if not found_map:
msg = 'no map found\nmap is "%s"\n'%(map,) msg = 'no map found\nmap is "%s"\n'%(map,)
raise self.__bug_handler, msg raise self.__bug_handler, msg

View File

@ -11,8 +11,10 @@
# # # #
######################################################################### #########################################################################
import sys, os, tempfile, cStringIO import sys, os, tempfile, cStringIO
from calibre.ebooks.rtf2xml import get_char_map, copy from calibre.ebooks.rtf2xml import get_char_map, copy
from calibre.ebooks.rtf2xml.char_set import char_set from calibre.ebooks.rtf2xml.char_set import char_set
class Hex2Utf8: class Hex2Utf8:
""" """
Convert Microsoft hexidecimal numbers to utf-8 Convert Microsoft hexidecimal numbers to utf-8
@ -108,7 +110,7 @@ class Hex2Utf8:
""" """
self.__file=file self.__file=file
self.__copy = copy self.__copy = copy
if area_to_convert != 'preamble' and area_to_convert != 'body': if area_to_convert not in ('preamble', 'body'):
msg = ( msg = (
'in module "hex_2_utf8.py\n' 'in module "hex_2_utf8.py\n'
'"area_to_convert" must be "body" or "preamble"\n' '"area_to_convert" must be "body" or "preamble"\n'
@ -136,12 +138,12 @@ class Hex2Utf8:
Set values, including those for the dictionaries. Set values, including those for the dictionaries.
The file that contains the maps is broken down into many different The file that contains the maps is broken down into many different
sets. For example, for the Symbol font, there is the standard part for sets. For example, for the Symbol font, there is the standard part for
hexidecimal numbers, and the the part for Microsoft charcters. Read hexidecimal numbers, and the part for Microsoft characters. Read
each part in, and then combine them. each part in, and then combine them.
""" """
# the default encoding system, the lower map for characters 0 through # the default encoding system, the lower map for characters 0 through
# 128, and the encoding system for Microsoft characters. # 128, and the encoding system for Microsoft characters.
# New on 2004-05-8: the self.__char_map is not in diretory with other # New on 2004-05-8: the self.__char_map is not in directory with other
# modules # modules
self.__char_file = cStringIO.StringIO(char_set) self.__char_file = cStringIO.StringIO(char_set)
char_map_obj = get_char_map.GetCharMap( char_map_obj = get_char_map.GetCharMap(
@ -188,7 +190,6 @@ class Hex2Utf8:
'body' : self.__body_func, 'body' : self.__body_func,
'mi<mk<body-open_' : self.__found_body_func, 'mi<mk<body-open_' : self.__found_body_func,
'tx<hx<__________' : self.__hex_text_func, 'tx<hx<__________' : self.__hex_text_func,
# 'tx<nu<__________' : self.__text_func,
} }
self.__body_state_dict = { self.__body_state_dict = {
'preamble' : self.__preamble_for_body_func, 'preamble' : self.__preamble_for_body_func,
@ -228,9 +229,7 @@ class Hex2Utf8:
font = self.__current_dict_name font = self.__current_dict_name
if self.__convert_caps\ if self.__convert_caps\
and self.__caps_list[-1] == 'true'\ and self.__caps_list[-1] == 'true'\
and font != 'Symbol'\ and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
and font != 'Wingdings'\
and font != 'Zapf Dingbats':
converted = self.__utf_token_to_caps_func(converted) converted = self.__utf_token_to_caps_func(converted)
self.__write_obj.write( self.__write_obj.write(
'tx<ut<__________<%s\n' % converted 'tx<ut<__________<%s\n' % converted
@ -240,9 +239,7 @@ class Hex2Utf8:
font = self.__current_dict_name font = self.__current_dict_name
if self.__convert_caps\ if self.__convert_caps\
and self.__caps_list[-1] == 'true'\ and self.__caps_list[-1] == 'true'\
and font != 'Symbol'\ and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
and font != 'Wingdings'\
and font != 'Zapf Dingbats':
converted = converted.upper() converted = converted.upper()
self.__write_obj.write( self.__write_obj.write(
'tx<nu<__________<%s\n' % converted 'tx<nu<__________<%s\n' % converted
@ -282,17 +279,16 @@ class Hex2Utf8:
def __convert_preamble(self): def __convert_preamble(self):
self.__state = 'preamble' self.__state = 'preamble'
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
for line in read_obj: for line in read_obj:
self.__token_info = line[:16] self.__token_info = line[:16]
action = self.__preamble_state_dict.get(self.__state) action = self.__preamble_state_dict.get(self.__state)
if action is None: if action is None:
sys.stderr.write(_('error no state found in hex_2_utf8'), sys.stderr.write('error no state found in hex_2_utf8',
self.__state self.__state
) )
action(line) action(line)
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data") copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
@ -461,9 +457,9 @@ class Hex2Utf8:
if len(self.__caps_list) > 1: if len(self.__caps_list) > 1:
self.__caps_list.pop() self.__caps_list.pop()
else: else:
sys.stderr.write('Module is hex_2_utf8\n') sys.stderr.write('Module is hex_2_utf8\n'
sys.stderr.write('method is __end_caps_func\n') 'method is __end_caps_func\n'
sys.stderr.write('caps list should be more than one?\n') #self.__in_caps not set 'caps list should be more than one?\n') #self.__in_caps not set
def __text_func(self, line): def __text_func(self, line):
""" """
@ -486,8 +482,7 @@ class Hex2Utf8:
hex_num = '\'%s' % hex_num hex_num = '\'%s' % hex_num
converted = self.__current_dict.get(hex_num) converted = self.__current_dict.get(hex_num)
if converted is None: if converted is None:
sys.stderr.write('module is hex_2_ut8\n') sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
sys.stderr.write('method is __text_func\n')
sys.stderr.write('no hex value for "%s"\n' % hex_num) sys.stderr.write('no hex value for "%s"\n' % hex_num)
else: else:
the_string += converted the_string += converted
@ -543,16 +538,15 @@ class Hex2Utf8:
def __convert_body(self): def __convert_body(self):
self.__state = 'body' self.__state = 'body'
with open(self.__file, 'r') as read_obj: with open(self.__file, 'r') as read_obj:
self.__write_obj = open(self.__write_to, 'w') with open(self.__write_to, 'w') as self.__write_obj:
for line in read_obj: for line in read_obj:
self.__token_info = line[:16] self.__token_info = line[:16]
action = self.__body_state_dict.get(self.__state) action = self.__body_state_dict.get(self.__state)
if action is None: if action is None:
sys.stderr.write('error no state found in hex_2_utf8', sys.stderr.write('error no state found in hex_2_utf8',
self.__state self.__state
) )
action(line) action(line)
self.__write_obj.close()
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "body_utf_convert.data") copy_obj.copy_file(self.__write_to, "body_utf_convert.data")

View File

@ -68,7 +68,6 @@ class Info:
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'), 'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'), 'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
'cw<di<edit-time_' : (self.__found_tag_with_tokens_func, 'editing-time'),
'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'), 'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'),
'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'), 'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'),
@ -77,6 +76,7 @@ class Info:
'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'), 'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'),
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'), 'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
'cw<di<version___' : (self.__single_field_func, 'version'), 'cw<di<version___' : (self.__single_field_func, 'version'),
'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'),
'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'), 'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'),
'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'), 'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'),
} }

View File

@ -411,11 +411,11 @@ class Inline:
self.__set_list_func(line) self.__set_list_func(line)
action = self.__state_dict.get(self.__state) action = self.__state_dict.get(self.__state)
if action is None: if action is None:
sys.stderr.write('No matching state in module inline_for_lists.py\n') sys.stderr.write('No matching state in module inline.py\n')
sys.stderr.write(self.__state + '\n') sys.stderr.write(self.__state + '\n')
action(line) action(line)
copy_obj = copy.Copy(bug_handler = self.__bug_handler) copy_obj = copy.Copy(bug_handler = self.__bug_handler)
if self.__copy: if self.__copy:
copy_obj.copy_file(self.__write_to, "inline.data") copy_obj.copy_file(self.__write_to, "inline.data")
copy_obj.rename(self.__write_to, self.__file) copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to) os.remove(self.__write_to)

View File

@ -214,7 +214,27 @@ class ProcessTokens:
'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func), 'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
'up' : ('ci', 'font-up___', self.divide_by_2), 'up' : ('ci', 'font-up___', self.divide_by_2),
'v' : ('ci', 'hidden____', self.default_func), 'v' : ('ci', 'hidden____', self.default_func),
# table => tb # underline
# can't see why it isn't a char info: 'ul'=>'ci'
'ul' : ('ci', 'underlined<continous', self.two_part_func),
'uld' : ('ci', 'underlined<dotted', self.two_part_func),
'uldash' : ('ci', 'underlined<dash', self.two_part_func),
'uldashd' : ('ci', 'underlined<dash-dot', self.two_part_func),
'uldashdd' : ('ci', 'underlined<dash-dot-dot', self.two_part_func),
'uldb' : ('ci', 'underlined<double', self.two_part_func),
'ulhwave' : ('ci', 'underlined<heavy-wave', self.two_part_func),
'ulldash' : ('ci', 'underlined<long-dash', self.two_part_func),
'ulth' : ('ci', 'underlined<thich', self.two_part_func),
'ulthd' : ('ci', 'underlined<thick-dotted', self.two_part_func),
'ulthdash' : ('ci', 'underlined<thick-dash', self.two_part_func),
'ulthdashd' : ('ci', 'underlined<thick-dash-dot', self.two_part_func),
'ulthdashdd' : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
'ulthldash' : ('ci', 'underlined<thick-long-dash', self.two_part_func),
'ululdbwave' : ('ci', 'underlined<double-wave', self.two_part_func),
'ulw' : ('ci', 'underlined<word', self.two_part_func),
'ulwave' : ('ci', 'underlined<wave', self.two_part_func),
'ulnone' : ('ci', 'underlined<false', self.two_part_func),
# table => tb
'trowd' : ('tb', 'row-def___', self.default_func), 'trowd' : ('tb', 'row-def___', self.default_func),
'cell' : ('tb', 'cell______', self.default_func), 'cell' : ('tb', 'cell______', self.default_func),
'row' : ('tb', 'row_______', self.default_func), 'row' : ('tb', 'row_______', self.default_func),
@ -274,25 +294,6 @@ class ProcessTokens:
'paperh' : ('pa', 'paper-hght', self.divide_by_20), 'paperh' : ('pa', 'paper-hght', self.divide_by_20),
# annotation => an # annotation => an
'annotation' : ('an', 'annotation', self.default_func), 'annotation' : ('an', 'annotation', self.default_func),
# underline
'ul' : ('ul', 'underlined<continous', self.two_part_func),
'uld' : ('ul', 'underlined<dotted', self.two_part_func),
'uldash' : ('ul', 'underlined<dash', self.two_part_func),
'uldashd' : ('ul', 'underlined<dash-dot', self.two_part_func),
'uldashdd' : ('ul', 'underlined<dash-dot-dot', self.two_part_func),
'uldb' : ('ul', 'underlined<double', self.two_part_func),
'ulhwave' : ('ul', 'underlined<heavy-wave', self.two_part_func),
'ulldash' : ('ul', 'underlined<long-dash', self.two_part_func),
'ulth' : ('ul', 'underlined<thich', self.two_part_func),
'ulthd' : ('ul', 'underlined<thick-dotted', self.two_part_func),
'ulthdash' : ('ul', 'underlined<thick-dash', self.two_part_func),
'ulthdashd' : ('ul', 'underlined<thick-dash-dot', self.two_part_func),
'ulthdashdd' : ('ul', 'underlined<thick-dash-dot-dot', self.two_part_func),
'ulthldash' : ('ul', 'underlined<thick-long-dash', self.two_part_func),
'ululdbwave' : ('ul', 'underlined<double-wave', self.two_part_func),
'ulw' : ('ul', 'underlined<word', self.two_part_func),
'ulwave' : ('ul', 'underlined<wave', self.two_part_func),
'ulnone' : ('ul', 'underlined<false', self.two_part_func),
# border => bd # border => bd
'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func), 'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func), 'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
@ -757,7 +758,7 @@ class ProcessTokens:
def process_cw(self, token): def process_cw(self, token):
"""Change the value of the control word by determining what dictionary """Change the value of the control word by determining what dictionary
it belongs to""" it belongs to"""
special = [ '*', ':', '}', '{', '~', '_', '-', ';' ] special = [ '*', ':', '}', '{', '~', '_', '-', ';' ]
##if token != "{" or token != "}": ##if token != "{" or token != "}":
token = token[1:] # strip off leading \ token = token[1:] # strip off leading \
token = token.replace(" ", "") token = token.replace(" ", "")
@ -793,7 +794,7 @@ class ProcessTokens:
raise self.__exception_handler, msg raise self.__exception_handler, msg
the_index = token.find('\\ ') the_index = token.find('\\ ')
if token is not None and the_index > -1: if token is not None and the_index > -1:
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\ msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
% line_count % line_count
raise self.__exception_handler, msg raise self.__exception_handler, msg
@ -832,4 +833,4 @@ class ProcessTokens:
msg = '\nInvalid RTF: document does not have matching brackets.\n' msg = '\nInvalid RTF: document does not have matching brackets.\n'
raise self.__exception_handler, msg raise self.__exception_handler, msg
else: else:
return self.__return_code return self.__return_code

View File

@ -496,7 +496,7 @@ Instead, ignore all section information in a field-block.
self.__token_info = line[:16] self.__token_info = line[:16]
action = self.__state_dict.get(self.__state) action = self.__state_dict.get(self.__state)
if action == None: if action == None:
sys.stderr.write('no no matching state in module sections.py\n') sys.stderr.write('no matching state in module sections.py\n')
sys.stderr.write(self.__state + '\n') sys.stderr.write(self.__state + '\n')
action(line) action(line)
read_obj.close() read_obj.close()

View File

@ -103,8 +103,6 @@ class Styles:
'sect-note_' : 'endnotes-in-section', 'sect-note_' : 'endnotes-in-section',
# list=> ls # list=> ls
'list-text_' : 'list-text', 'list-text_' : 'list-text',
# this line must be wrong because it duplicates an earlier one
'list-text_' : 'list-text',
'list______' : 'list', 'list______' : 'list',
'list-lev-d' : 'list-level-definition', 'list-lev-d' : 'list-level-definition',
'list-cardi' : 'list-cardinal-numbering', 'list-cardi' : 'list-cardinal-numbering',

View File

@ -114,6 +114,7 @@ class Tokenize:
# this is for older RTF # this is for older RTF
input_file = self.__par_exp.sub('\n\\par \n', input_file) input_file = self.__par_exp.sub('\n\\par \n', input_file)
input_file = self.__cwdigit_exp.sub("\g<1>\n\g<2>", input_file) input_file = self.__cwdigit_exp.sub("\g<1>\n\g<2>", input_file)
input_file = self.__cs_ast.sub("\g<1>", input_file)
input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file) input_file = self.__ms_hex_exp.sub("\\mshex0\g<1> ", input_file)
input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file) input_file = self.__utf_ud.sub("\\{\\uc0 \g<1>\\}", input_file)
#remove \n in bin data #remove \n in bin data
@ -163,6 +164,8 @@ class Tokenize:
self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)") self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
#this is for old RTF #this is for old RTF
self.__par_exp = re.compile(r'(\\\n+|\\ )') self.__par_exp = re.compile(r'(\\\n+|\\ )')
#handle improper cs char-style with \* before without {
self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
#handle cw using a digit as argument and without space as delimiter #handle cw using a digit as argument and without space as delimiter
self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)") self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")

View File

@ -12,6 +12,7 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.ebooks.conversion.preprocess import DocAnalysis
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars

View File

@ -477,17 +477,17 @@ class BIBTEX(CatalogPlugin): # {{{
if opts.bibfile_enc in bibfile_enc : if opts.bibfile_enc in bibfile_enc :
bibfile_enc = opts.bibfile_enc bibfile_enc = opts.bibfile_enc
else : else :
log(" WARNING: incorrect --choose-encoding flag, revert to default") log.warn("Incorrect --choose-encoding flag, revert to default")
bibfile_enc = bibfile_enc[0] bibfile_enc = bibfile_enc[0]
if opts.bibfile_enctag in bibfile_enctag : if opts.bibfile_enctag in bibfile_enctag :
bibfile_enctag = opts.bibfile_enctag bibfile_enctag = opts.bibfile_enctag
else : else :
log(" WARNING: incorrect --choose-encoding-configuration flag, revert to default") log.warn("Incorrect --choose-encoding-configuration flag, revert to default")
bibfile_enctag = bibfile_enctag[0] bibfile_enctag = bibfile_enctag[0]
if opts.bib_entry in bib_entry : if opts.bib_entry in bib_entry :
bib_entry = opts.bib_entry bib_entry = opts.bib_entry
else : else :
log(" WARNING: incorrect --entry-type flag, revert to default") log.warn("Incorrect --entry-type flag, revert to default")
bib_entry = bib_entry[0] bib_entry = bib_entry[0]
if opts.verbose: if opts.verbose:
@ -544,7 +544,7 @@ class BIBTEX(CatalogPlugin): # {{{
elif opts.impcit == 'True' : elif opts.impcit == 'True' :
citation_bibtex= True citation_bibtex= True
else : else :
log(" WARNING: incorrect --create-citation, revert to default") log.warn("Incorrect --create-citation, revert to default")
citation_bibtex= True citation_bibtex= True
else : else :
citation_bibtex= opts.impcit citation_bibtex= opts.impcit
@ -556,7 +556,7 @@ class BIBTEX(CatalogPlugin): # {{{
elif opts.addfiles == 'True' : elif opts.addfiles == 'True' :
addfiles_bibtex = True addfiles_bibtex = True
else : else :
log(" WARNING: incorrect --add-files-path, revert to default") log.warn("Incorrect --add-files-path, revert to default")
addfiles_bibtex= True addfiles_bibtex= True
else : else :
addfiles_bibtex = opts.addfiles addfiles_bibtex = opts.addfiles
@ -574,7 +574,7 @@ class BIBTEX(CatalogPlugin): # {{{
if bib_entry == 'book' : if bib_entry == 'book' :
nb_books = len(filter(check_entry_book_valid, data)) nb_books = len(filter(check_entry_book_valid, data))
if nb_books < nb_entries : if nb_books < nb_entries :
log(" WARNING: only %d entries in %d are book compatible" % (nb_books, nb_entries)) log.warn("Only %d entries in %d are book compatible" % (nb_books, nb_entries))
nb_entries = nb_books nb_entries = nb_books
# If connected device, add 'On Device' values to data # If connected device, add 'On Device' values to data