mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
commit
0e7eab2d1e
@ -51,16 +51,16 @@ def chap_head(match):
|
|||||||
chap = match.group('chap')
|
chap = match.group('chap')
|
||||||
title = match.group('title')
|
title = match.group('title')
|
||||||
if not title:
|
if not title:
|
||||||
return '<h1>'+chap+'</h1><br/>\n'
|
return '<h1>'+chap+'</h1><br/>\n'
|
||||||
else:
|
else:
|
||||||
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
|
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
|
||||||
|
|
||||||
def wrap_lines(match):
|
def wrap_lines(match):
|
||||||
ital = match.group('ital')
|
ital = match.group('ital')
|
||||||
if not ital:
|
if not ital:
|
||||||
return ' '
|
return ' '
|
||||||
else:
|
else:
|
||||||
return ital+' '
|
return ital+' '
|
||||||
|
|
||||||
class DocAnalysis(object):
|
class DocAnalysis(object):
|
||||||
'''
|
'''
|
||||||
@ -191,7 +191,7 @@ class Dehyphenator(object):
|
|||||||
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
|
||||||
lookupword = self.removesuffixes.sub('', dehyphenated)
|
lookupword = self.removesuffixes.sub('', dehyphenated)
|
||||||
if self.prefixes.match(firsthalf) is None:
|
if self.prefixes.match(firsthalf) is None:
|
||||||
lookupword = self.removeprefix.sub('', lookupword)
|
lookupword = self.removeprefix.sub('', lookupword)
|
||||||
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
|
||||||
try:
|
try:
|
||||||
searchresult = self.html.find(lookupword.lower())
|
searchresult = self.html.find(lookupword.lower())
|
||||||
|
@ -113,24 +113,24 @@ class PreProcessor(object):
|
|||||||
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
# some lit files don't have any <p> tags or equivalent (generally just plain text between
|
||||||
# <pre> tags), check and mark up line endings if required before proceeding
|
# <pre> tags), check and mark up line endings if required before proceeding
|
||||||
if self.no_markup(html, 0.1):
|
if self.no_markup(html, 0.1):
|
||||||
self.log("not enough paragraph markers, adding now")
|
self.log("not enough paragraph markers, adding now")
|
||||||
# check if content is in pre tags, use txt processor to mark up if so
|
# check if content is in pre tags, use txt processor to mark up if so
|
||||||
pre = re.compile(r'<pre>', re.IGNORECASE)
|
pre = re.compile(r'<pre>', re.IGNORECASE)
|
||||||
if len(pre.findall(html)) == 1:
|
if len(pre.findall(html)) == 1:
|
||||||
self.log("Running Text Processing")
|
self.log("Running Text Processing")
|
||||||
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
|
||||||
separate_paragraphs_single_line
|
separate_paragraphs_single_line
|
||||||
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
|
||||||
html = outerhtml.sub('\g<text>', html)
|
html = outerhtml.sub('\g<text>', html)
|
||||||
html = separate_paragraphs_single_line(html)
|
html = separate_paragraphs_single_line(html)
|
||||||
html = preserve_spaces(html)
|
html = preserve_spaces(html)
|
||||||
html = convert_basic(html, epub_split_size_kb=0)
|
html = convert_basic(html, epub_split_size_kb=0)
|
||||||
else:
|
else:
|
||||||
# Add markup naively
|
# Add markup naively
|
||||||
# TODO - find out if there are cases where there are more than one <pre> tag or
|
# TODO - find out if there are cases where there are more than one <pre> tag or
|
||||||
# other types of unmarked html and handle them in some better fashion
|
# other types of unmarked html and handle them in some better fashion
|
||||||
add_markup = re.compile('(?<!>)(\n)')
|
add_markup = re.compile('(?<!>)(\n)')
|
||||||
html = add_markup.sub('</p>\n<p>', html)
|
html = add_markup.sub('</p>\n<p>', html)
|
||||||
|
|
||||||
###### Mark Indents/Cleanup ######
|
###### Mark Indents/Cleanup ######
|
||||||
#
|
#
|
||||||
@ -164,8 +164,8 @@ class PreProcessor(object):
|
|||||||
self.log("deleting blank lines")
|
self.log("deleting blank lines")
|
||||||
html = blankreg.sub('', html)
|
html = blankreg.sub('', html)
|
||||||
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
elif float(len(blanklines)) / float(len(lines)) > 0.40:
|
||||||
blanks_between_paragraphs = True
|
blanks_between_paragraphs = True
|
||||||
#print "blanks between paragraphs is marked True"
|
#print "blanks between paragraphs is marked True"
|
||||||
else:
|
else:
|
||||||
blanks_between_paragraphs = False
|
blanks_between_paragraphs = False
|
||||||
#self.dump(html, 'before_chapter_markup')
|
#self.dump(html, 'before_chapter_markup')
|
||||||
|
@ -173,7 +173,7 @@ class FB2MLizer(object):
|
|||||||
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
|
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
|
||||||
self.oeb_book.spine.insert(0, title_item, True)
|
self.oeb_book.spine.insert(0, title_item, True)
|
||||||
# Create xhtml page to reference cover image so it can be used.
|
# Create xhtml page to reference cover image so it can be used.
|
||||||
if self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
||||||
id = unicode(self.oeb_book.metadata.cover[0])
|
id = unicode(self.oeb_book.metadata.cover[0])
|
||||||
cover_item = self.oeb_book.manifest.ids[id]
|
cover_item = self.oeb_book.manifest.ids[id]
|
||||||
if cover_item.media_type in OEB_RASTER_IMAGES:
|
if cover_item.media_type in OEB_RASTER_IMAGES:
|
||||||
|
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import PreProcessor
|
||||||
@ -18,30 +18,6 @@ class PDBInput(InputFormatPlugin):
|
|||||||
description = 'Convert PDB to HTML'
|
description = 'Convert PDB to HTML'
|
||||||
file_types = set(['pdb'])
|
file_types = set(['pdb'])
|
||||||
|
|
||||||
options = set([
|
|
||||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
|
||||||
choices=['auto', 'block', 'single', 'print'],
|
|
||||||
help=_('Paragraph structure.\n'
|
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
|
||||||
'* auto: Try to auto detect paragraph type.\n'
|
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
|
||||||
'* single: Assume every line is a paragraph.\n'
|
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
|
||||||
'starts a paragraph.')),
|
|
||||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
|
||||||
choices=['auto', 'none', 'markdown'],
|
|
||||||
help=_('Formatting used within the document.'
|
|
||||||
'* auto: Try to auto detect the document formatting.\n'
|
|
||||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
|
||||||
'With this option all spaces will be displayed.')),
|
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
|
||||||
help=_('Do not insert a Table of Contents into the output text.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
header = PdbHeaderReader(stream)
|
header = PdbHeaderReader(stream)
|
||||||
@ -60,4 +36,4 @@ class PDBInput(InputFormatPlugin):
|
|||||||
def preprocess_html(self, options, html):
|
def preprocess_html(self, options, html):
|
||||||
self.options = options
|
self.options = options
|
||||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||||
return preprocessor(html)
|
return preprocessor(html)
|
||||||
|
@ -19,9 +19,6 @@ class Reader(FormatReader):
|
|||||||
self.stream = stream
|
self.stream = stream
|
||||||
self.log = log
|
self.log = log
|
||||||
self.options = options
|
self.options = options
|
||||||
setattr(self.options, 'new_pdf_engine', False)
|
|
||||||
setattr(self.options, 'no_images', False)
|
|
||||||
setattr(self.options, 'unwrap_factor', 0.45)
|
|
||||||
|
|
||||||
def extract_content(self, output_dir):
|
def extract_content(self, output_dir):
|
||||||
self.log.info('Extracting PDF...')
|
self.log.info('Extracting PDF...')
|
||||||
@ -31,7 +28,12 @@ class Reader(FormatReader):
|
|||||||
for x in xrange(self.header.section_count()):
|
for x in xrange(self.header.section_count()):
|
||||||
pdf.write(self.header.section_data(x))
|
pdf.write(self.header.section_data(x))
|
||||||
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
pdf.seek(0)
|
|
||||||
return plugin_for_input_format('pdf').convert(pdf, self.options,
|
pdf_plugin = plugin_for_input_format('pdf')
|
||||||
'pdf', self.log, [])
|
for option in pdf_plugin.options:
|
||||||
|
if not hasattr(self.options, option.option.name):
|
||||||
|
setattr(self.options, option.name, option.recommended_value)
|
||||||
|
|
||||||
|
pdf.seek(0)
|
||||||
|
return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})
|
||||||
|
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
|
||||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
from calibre.customize.conversion import InputFormatPlugin
|
||||||
from calibre.ebooks.compression.tcr import decompress
|
from calibre.ebooks.compression.tcr import decompress
|
||||||
|
|
||||||
class TCRInput(InputFormatPlugin):
|
class TCRInput(InputFormatPlugin):
|
||||||
@ -16,30 +16,6 @@ class TCRInput(InputFormatPlugin):
|
|||||||
description = 'Convert TCR files to HTML'
|
description = 'Convert TCR files to HTML'
|
||||||
file_types = set(['tcr'])
|
file_types = set(['tcr'])
|
||||||
|
|
||||||
options = set([
|
|
||||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
|
||||||
choices=['auto', 'block', 'single', 'print'],
|
|
||||||
help=_('Paragraph structure.\n'
|
|
||||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
|
||||||
'* auto: Try to auto detect paragraph type.\n'
|
|
||||||
'* block: Treat a blank line as a paragraph break.\n'
|
|
||||||
'* single: Assume every line is a paragraph.\n'
|
|
||||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
|
||||||
'starts a paragraph.')),
|
|
||||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
|
||||||
choices=['auto', 'none', 'markdown'],
|
|
||||||
help=_('Formatting used within the document.'
|
|
||||||
'* auto: Try to auto detect the document formatting.\n'
|
|
||||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
|
||||||
'* markdown: Run the input though the markdown pre-processor. '
|
|
||||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
|
||||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
|
||||||
help=_('Normally extra spaces are condensed into a single space. '
|
|
||||||
'With this option all spaces will be displayed.')),
|
|
||||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
|
||||||
help=_('Do not insert a Table of Contents into the output text.')),
|
|
||||||
])
|
|
||||||
|
|
||||||
def convert(self, stream, options, file_ext, log, accelerators):
|
def convert(self, stream, options, file_ext, log, accelerators):
|
||||||
log.info('Decompressing text...')
|
log.info('Decompressing text...')
|
||||||
raw_txt = decompress(stream)
|
raw_txt = decompress(stream)
|
||||||
|
@ -1,25 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
from calibre.gui2.convert.txt_input_ui import Ui_Form
|
|
||||||
from calibre.gui2.convert import Widget
|
|
||||||
|
|
||||||
class PluginWidget(Widget, Ui_Form):
|
|
||||||
|
|
||||||
TITLE = _('PDB Input')
|
|
||||||
HELP = _('Options specific to')+' PDB '+_('input')
|
|
||||||
COMMIT_NAME = 'pdb_input'
|
|
||||||
ICON = I('mimetypes/txt.png')
|
|
||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
|
||||||
Widget.__init__(self, parent,
|
|
||||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
|
||||||
self.db, self.book_id = db, book_id
|
|
||||||
for x in get_option('paragraph_type').option.choices:
|
|
||||||
self.opt_paragraph_type.addItem(x)
|
|
||||||
for x in get_option('formatting_type').option.choices:
|
|
||||||
self.opt_formatting_type.addItem(x)
|
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
|
@ -1,25 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
from calibre.gui2.convert.txt_input_ui import Ui_Form
|
|
||||||
from calibre.gui2.convert import Widget
|
|
||||||
|
|
||||||
class PluginWidget(Widget, Ui_Form):
|
|
||||||
|
|
||||||
TITLE = _('TCR Input')
|
|
||||||
HELP = _('Options specific to')+' TCR '+_('input')
|
|
||||||
COMMIT_NAME = 'tcr_input'
|
|
||||||
ICON = I('mimetypes/txt.png')
|
|
||||||
|
|
||||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
|
||||||
Widget.__init__(self, parent,
|
|
||||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
|
||||||
self.db, self.book_id = db, book_id
|
|
||||||
for x in get_option('paragraph_type').option.choices:
|
|
||||||
self.opt_paragraph_type.addItem(x)
|
|
||||||
for x in get_option('formatting_type').option.choices:
|
|
||||||
self.opt_formatting_type.addItem(x)
|
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
|
Loading…
x
Reference in New Issue
Block a user