Add option to control line length when preprocessing PDF input

This commit is contained in:
Kovid Goyal 2009-06-23 07:42:30 -07:00
parent 22ec9df720
commit 032c2b0fdc
5 changed files with 17 additions and 3 deletions

View File

@@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
opts.preprocess_html) opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding) pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate: if not populate:

View File

@@ -159,9 +159,11 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
] ]
def __init__(self, input_plugin_preprocess, plugin_preprocess): def __init__(self, input_plugin_preprocess, plugin_preprocess,
pdf_line_length):
self.input_plugin_preprocess = input_plugin_preprocess self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess self.plugin_preprocess = plugin_preprocess
self.pdf_line_length = pdf_line_length
def is_baen(self, src): def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@@ -182,7 +184,7 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html): elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html): elif self.is_pdftohtml(html):
length = line_length(html, .3) length = line_length(html, self.pdf_line_length)
line_length_rules = [] line_length_rules = []
if length: if length:
line_length_rules = [ line_length_rules = [

View File

@@ -261,6 +261,11 @@ class HTMLInput(InputFormatPlugin):
'nasty side effects in the rest of of the conversion pipeline.' 'nasty side effects in the rest of of the conversion pipeline.'
) )
), ),
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
help=_('Average line length for line breaking if the HTML is from a '
'previous partial conversion of a PDF file.')),
]) ])
def convert(self, stream, opts, file_ext, log, def convert(self, stream, opts, file_ext, log,

View File

@@ -20,6 +20,8 @@ class PDFInput(InputFormatPlugin):
options = set([ options = set([
OptionRecommendation(name='no_images', recommended_value=False, OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')), help=_('Do not extract images from the document')),
OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
help=_('Average line length for line breaking')),
]) ])
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,

View File

@@ -140,6 +140,11 @@ sudo calibre_postinstall
</form> </form>
</div> </div>
<hr/> <hr/>
<h3>Note</h3>
<p>
If your kernel is compiled with CONFIG_SYSFS_DEPRECATED, device detection may not work.
</p>
<hr/>
<h3>Dependencies</h3> <h3>Dependencies</h3>
${app} has the following dependencies (the listed version is the minimum version) ${app} has the following dependencies (the listed version is the minimum version)
<br/><br/> <br/><br/>