mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
254 lines
10 KiB
Python
Executable File
254 lines
10 KiB
Python
Executable File
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# You should have received a copy of the GNU General Public License #
|
|
# along with this program; if not, write to the Free Software #
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA #
|
|
# 02111-1307 USA #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os, tempfile
|
|
from libprs500.ebooks.rtf2xml import copy
|
|
class Paragraphs:
    r"""
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file. (This module won't be of any
    use to you unless you use it as part of the other modules.)
    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins. It only tells you when the
    paragraph ends.
    In order to make paragraphs out of this limited info, the parser starts in
    the body of the document and assumes it is not in a paragraph. It looks
    for clues to begin a paragraph. Text starts a paragraph; so does an inline
    field or list-text. If an end of paragraph marker (\par) is found, then
    this indicates a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph.' In this state,
    clues are looked for to find the end of a paragraph. The end of paragraph
    marker (\par) marks the end of a paragraph. So does the end of a footnote
    or heading; the end of a field-block; the end of a cell, section or body;
    and the beginning of a section or of a block field. A paragraph definition
    (\pard) found inside a paragraph is replaced by a 'bogus-pard' marker
    and does not end the paragraph.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy = None,
            write_empty_para = 1,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--file to parse (it is overwritten with the result)
            'bug_handler'--handed to copy.Copy() in make_paragraphs
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'write_empty_para'--whether a blank paragraph is written as
            an empty para tag (true by default)
            'run_level'--stored, but not otherwise used by this class
        Returns:
            nothing
        Logic:
            Store the options and reserve a temporary file for the output.
        """
        # NOTE: the 'copy' parameter shadows the imported copy module, but
        # only inside this method.
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # tempfile.mktemp() only invents a name and is open to a race with
        # other processes; mkstemp() creates the file atomically. Only the
        # path is needed here, so the descriptor is closed right away.
        temp_fd, self.__write_to = tempfile.mkstemp()
        os.close(temp_fd)

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state = 'before_body'
        self.__start_marker = 'mi<mk<para-start\n'  # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
        self.__end2_marker = 'mi<mk<par-end___\n'  # inside para tags
        self.__end_marker = 'mi<mk<para-end__\n'  # outside para tags
        # state name => method that handles a line while in that state
        self.__state_dict = {
            'before_body' : self.__before_body_func,
            'not_paragraph' : self.__not_paragraph_func,
            'paragraph' : self.__paragraph_func,
        }
        # tokens that matter while inside a paragraph
        self.__paragraph_dict = {
            'cw<pf<par-end___' : self.__close_para_func,  # end of paragraph
            'mi<mk<headi_-end' : self.__close_para_func,  # end of header or footer
            'mi<mk<fldbk-end_' : self.__close_para_func,  # end of field-block
            'mi<mk<body-close' : self.__close_para_func,  # end of body
            'mi<mk<sect-close' : self.__close_para_func,  # end of section
            'mi<mk<sect-start' : self.__close_para_func,  # start of section
            'mi<mk<foot___clo' : self.__close_para_func,  # end of footnote
            'cw<tb<cell______' : self.__close_para_func,  # end of cell
            'mi<mk<par-in-fld' : self.__close_para_func,  # start of block field
            # a paragraph definition does not close the paragraph
            'cw<pf<par-def___' : self.__bogus_para__def_func,
        }
        # tokens that matter while outside a paragraph
        self.__not_paragraph_dict = {
            'tx<nu<__________' : self.__start_para_func,
            'tx<hx<__________' : self.__start_para_func,
            'tx<ut<__________' : self.__start_para_func,
            'tx<mc<__________' : self.__start_para_func,
            'mi<mk<inline-fld' : self.__start_para_func,
            'mi<mk<para-beg__' : self.__start_para_func,
            'cw<pf<par-end___' : self.__empty_para_func,
            'mi<mk<pict-start' : self.__start_para_func,
            'cw<pf<page-break' : self.__empty_pgbk_func,  # page break
        }

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the body.
            Once the body starts, the state is switched to 'not_paragraph'.
            The line itself is always written out.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of the paragraph.
            It looks for clues that start a paragraph, and when found,
            switches states and writes the start tags. The line itself is
            always written after any tags.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in the paragraph. It
            looks for clues to the end of the paragraph. When a clue is found,
            it calls on another method, which is then responsible for
            deciding whether the line is written. Any other line is
            written unchanged.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __start_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to paragraph. (The caller writes the line.)
        """
        self.__write_obj.write(self.__start_marker)  # marker for later parsing
        self.__write_obj.write(
            'mi<tg<open______<para\n'
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a paragraph.
            It does not do anything if self.__write_empty_para is 0.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker)  # marker for later parsing
            self.__write_obj.write(
                'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)  # marker for later parsing

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a page break.
        """
        self.__write_obj.write(
            'mi<tg<empty_____<page-break\n'
        )

    def __close_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the
            line itself, and changes the state to not_paragraph.
        """
        self.__write_obj.write(self.__end2_marker)  # marker for later parser
        self.__write_obj.write(
            'mi<tg<close_____<para\n'
        )
        self.__write_obj.write(self.__end_marker)  # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        r"""
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            if a \pard occurs in a paragraph, I want to ignore it. (I believe)
            The line is replaced by a marker rather than written out.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def __process_line(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Record the token (the first 16 characters of the line) and hand
            the line to the method for the current state. If the state is
            not a known one, report it on stderr and copy the line through
            unchanged rather than failing on a call to None.
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module paragraphs.py\n')
            sys.stderr.write(self.__state + '\n')
            self.__write_obj.write(line)
            return
        action(line)

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            When the body is found, change the state to 'not_paragraph'. The
            only other state is 'paragraph'.
            When all lines are written, the result replaces the original
            file (after an optional debugging copy) and the temporary file
            is removed.
        """
        self.__initiate_values()
        read_obj = open(self.__file, 'r')
        # try/finally so that neither file is left open if a line fails
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                for line in read_obj:
                    self.__process_line(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|