Pull from trunk

This commit is contained in:
Kovid Goyal 2010-05-30 22:51:09 -06:00
commit 1af58e4f03
5 changed files with 62 additions and 9 deletions

View File

@ -132,6 +132,9 @@ class CHMReader(CHMFile):
lpath = os.path.join(output_dir, path) lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath) self._ensure_dir(lpath)
data = self.GetFile(path) data = self.GetFile(path)
if lpath.find(';') != -1:
# fix file names with ";<junk>" at the end, see _reformat()
lpath = lpath.split(';')[0]
with open(lpath, 'wb') as f: with open(lpath, 'wb') as f:
if guess_mimetype(path)[0] == ('text/html'): if guess_mimetype(path)[0] == ('text/html'):
data = self._reformat(data) data = self._reformat(data)
@ -158,14 +161,26 @@ class CHMReader(CHMFile):
# cos they really fuck with the flow of things and generally waste space # cos they really fuck with the flow of things and generally waste space
# since we can't use [a,b] syntax to select arbitrary items from a list # since we can't use [a,b] syntax to select arbitrary items from a list
# we'll have to do this manually... # we'll have to do this manually...
# only remove the tables, if they have an image with an alt attribute
# containing prev, next or team
t = soup('table') t = soup('table')
if t: if t:
if (t[0].previousSibling is None if (t[0].previousSibling is None
or t[0].previousSibling.previousSibling is None): or t[0].previousSibling.previousSibling is None):
t[0].extract() try:
alt = t[0].img['alt'].lower()
if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
t[0].extract()
except:
pass
if (t[-1].nextSibling is None if (t[-1].nextSibling is None
or t[-1].nextSibling.nextSibling is None): or t[-1].nextSibling.nextSibling is None):
t[-1].extract() try:
alt = t[-1].img['alt'].lower()
if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
t[-1].extract()
except:
pass
# for some very odd reason each page's content appears to be in a table # for some very odd reason each page's content appears to be in a table
# too. and this table has sub-tables for random asides... grr. # too. and this table has sub-tables for random asides... grr.
@ -185,8 +200,24 @@ class CHMReader(CHMFile):
except KeyError: except KeyError:
# and some don't even have a src= ?! # and some don't even have a src= ?!
pass pass
# now give back some pretty html. try:
return soup.prettify('utf-8') # if there is only a single table with a single element
# in the body, replace it by the contents of this single element
tables = soup.body.findAll('table', recursive=False)
if tables and len(tables) == 1:
trs = tables[0].findAll('tr', recursive=False)
if trs and len(trs) == 1:
tds = trs[0].findAll('td', recursive=False)
if tds and len(tds) == 1:
tdContents = tds[0].contents
tableIdx = soup.body.contents.index(tables[0])
tables[0].extract()
while tdContents:
soup.body.insert(tableIdx, tdContents.pop())
except:
pass
# do not prettify, it would reformat the <pre> tags!
return str(soup)
def Contents(self): def Contents(self):
if self._contents is not None: if self._contents is not None:

View File

@ -8,7 +8,8 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -28,6 +29,9 @@ class TXTInput(InputFormatPlugin):
'an indent (either a tab or 2+ spaces) represents a paragraph. ' 'an indent (either a tab or 2+ spaces) represents a paragraph. '
'Paragraphs end when the next line that starts with an indent ' 'Paragraphs end when the next line that starts with an indent '
'is reached.')), 'is reached.')),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='markdown', recommended_value=False, OptionRecommendation(name='markdown', recommended_value=False,
help=_('Run the text input through the markdown pre-processor. To ' help=_('Run the text input through the markdown pre-processor. To '
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'), 'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
@ -48,6 +52,8 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
if options.print_formatted_paras: if options.print_formatted_paras:
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
if options.preserve_spaces:
txt = preserve_spaces(txt)
if options.markdown: if options.markdown:
log.debug('Running text though markdown conversion...') log.debug('Running text though markdown conversion...')

View File

@ -24,6 +24,9 @@ def convert_basic(txt, title=''):
for line in txt.splitlines(): for line in txt.splitlines():
lines.append(line.strip()) lines.append(line.strip())
txt = '\n'.join(lines) txt = '\n'.join(lines)
# Condense redundant spaces
txt = re.sub('[ ]{2,}', ' ', txt)
# Remove blank lines from the beginning and end of the document. # Remove blank lines from the beginning and end of the document.
txt = re.sub('^\s+(?=.)', '', txt) txt = re.sub('^\s+(?=.)', '', txt)
@ -56,6 +59,11 @@ def separate_paragraphs_print_formatted(txt):
txt = re.sub('(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt) txt = re.sub('(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
return txt return txt
def preserve_spaces(txt):
    '''
    Make the spacing in ``txt`` survive HTML rendering.

    Every space character is replaced by the ``&nbsp;`` entity and every
    tab character by four ``&nbsp;`` entities, so that neither is
    collapsed when the text is later displayed as HTML.

    :param txt: The text to process.
    :return: ``txt`` with all spaces and tabs replaced by ``&nbsp;``
             entities.
    '''
    txt = txt.replace(' ', '&nbsp;')
    # A tab written as the character reference &#09; is parsed back into a
    # plain tab character, which HTML white space processing collapses just
    # like a normal space. Expand it to non-breaking spaces instead so the
    # indentation is actually displayed. Four columns per tab is a fixed
    # choice; tab stops are not tracked.
    txt = txt.replace('\t', '&nbsp;' * 4)
    return txt
def opf_writer(path, opf_name, manifest, spine, mi): def opf_writer(path, opf_name, manifest, spine, mi):
opf = OPFCreator(path, mi) opf = OPFCreator(path, mi)
opf.create_manifest(manifest) opf.create_manifest(manifest)

View File

@ -14,6 +14,7 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, 'txt_input', Widget.__init__(self, parent, 'txt_input',
['single_line_paras', 'print_formatted_paras', 'markdown', 'markdown_disable_toc']) ['single_line_paras', 'print_formatted_paras', 'markdown',
'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>400</width> <width>470</width>
<height>300</height> <height>300</height>
</rect> </rect>
</property> </property>
@ -52,7 +52,7 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="5" column="0"> <item row="6" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -65,10 +65,17 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="5" column="0">
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text">
<string>Preserve &amp;spaces</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources/> <resources/>
<connections> <connections>
<connection> <connection>
<sender>opt_markdown</sender> <sender>opt_markdown</sender>
<signal>toggled(bool)</signal> <signal>toggled(bool)</signal>