mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merge italicize code, other GUI and defaults changes
This commit is contained in:
commit
313cec6699
@ -128,6 +128,36 @@ class PreProcessor(object):
|
|||||||
wordcount = get_wordcount_obj(word_count_text)
|
wordcount = get_wordcount_obj(word_count_text)
|
||||||
return wordcount.words
|
return wordcount.words
|
||||||
|
|
||||||
|
def markup_italicis(self, html):
    '''
    Wrap common abbreviations/phrases and plain-text emphasis markers
    in <i> tags.

    Two passes are made over ``html``:

    1. Every entry of ITALICIZE_WORDS that stands on its own (not glued
       to a neighbouring letter, digit or underscore) is wrapped in
       <i>...</i>.
    2. Text enclosed in the delimiters described by ITALICIZE_STYLE_PATS
       (``_x_``, ``/x/``, ``*x*``, ``~x~``, ``~~x~~`` and their
       combinations) is replaced by <i>x</i>, dropping the delimiters.

    :param html: The markup to process.
    :return: ``html`` with the <i> tags inserted.
    '''
    ITALICIZE_WORDS = [
        'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
        'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'et cetra', 'n.b.',
        'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
        'Mlle.', 'Mons.', 'PS.', 'PPS.',
    ]

    # Compound delimiters must be tried before the single character ones.
    # Otherwise '_x_', '/x/' or '*x*' consume part of the compound marker
    # first and the compound patterns can never match.
    ITALICIZE_STYLE_PATS = [
        r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
        r'(?msu)_/(?P<words>[^<>]+?)/_',
        r'(?msu)_\*(?P<words>.+?)\*_',
        r'(?msu)\*/(?P<words>[^<>]+?)/\*',
        r'(?msu)/:(?P<words>[^<>]+?):/',
        r'(?msu)_(?P<words>.+?)_',
        r'(?msu)/(?P<words>[^<>]+?)/',
        r'(?msu)~~(?P<words>.+?)~~',
        r'(?msu)\*(?P<words>.+?)\*',
        r'(?msu)~(?P<words>.+?)~',
        r'(?msu)\|:(?P<words>.+?):\|',
    ]

    # A plain str.replace() also hits the tail of longer words
    # ('movie.' -> 'mov<i>ie.</i>', 'PPS.' -> 'P<i>PS.</i>'). Match all
    # entries in a single pass and require that no word character touches
    # the match on either side.
    alternatives = '|'.join(
        re.escape(word)
        for word in sorted(ITALICIZE_WORDS, key=len, reverse=True))
    word_pat = r'(?u)(?<!\w)(?:' + alternatives + r')(?!\w)'
    html = re.sub(word_pat, lambda mo: '<i>%s</i>' % mo.group(0), html)

    for pat in ITALICIZE_STYLE_PATS:
        html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)

    return html
|
||||||
|
|
||||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
|
||||||
'''
|
'''
|
||||||
Searches for common chapter headings throughout the document
|
Searches for common chapter headings throughout the document
|
||||||
@ -360,7 +390,7 @@ class PreProcessor(object):
|
|||||||
html = self.markup_pre(html)
|
html = self.markup_pre(html)
|
||||||
|
|
||||||
# Replace series of non-breaking spaces with text-indent
|
# Replace series of non-breaking spaces with text-indent
|
||||||
if getattr(self.extra_opts, 'fix_indents', True):
|
if getattr(self.extra_opts, 'fix_indents', False):
|
||||||
html = self.fix_nbsp_indents(html)
|
html = self.fix_nbsp_indents(html)
|
||||||
|
|
||||||
if self.cleanup_required():
|
if self.cleanup_required():
|
||||||
@ -375,19 +405,21 @@ class PreProcessor(object):
|
|||||||
#self.dump(html, 'before_chapter_markup')
|
#self.dump(html, 'before_chapter_markup')
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'markup_chapter_headings', True):
|
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
||||||
|
|
||||||
|
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||||
|
html = self.markup_italicis(html)
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
|
||||||
# blank paragraphs then delete blank lines to clean up spacing
|
# blank paragraphs then delete blank lines to clean up spacing
|
||||||
if blanks_between_paragraphs and getattr(self.extra_opts,
|
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
|
||||||
'delete_blank_paragraphs', False):
|
|
||||||
self.log("deleting blank lines")
|
self.log("deleting blank lines")
|
||||||
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
|
||||||
html = self.blankreg.sub('', html)
|
html = self.blankreg.sub('', html)
|
||||||
|
|
||||||
###### Unwrap lines ######
|
###### Unwrap lines ######
|
||||||
if getattr(self.extra_opts, 'unwrap_lines', True):
|
if getattr(self.extra_opts, 'unwrap_lines', False):
|
||||||
# Determine line ending type
|
# Determine line ending type
|
||||||
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
|
||||||
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
|
||||||
|
@ -1,58 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
|
||||||
__docformat__ = 'restructuredtext en'
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import prepare_string_for_xml
|
|
||||||
|
|
||||||
class TXTHeuristicProcessor(object):
    '''
    Turn plain text into HTML, italicizing common abbreviations and
    plain-text emphasis markers along the way.
    '''

    def __init__(self):
        # Abbreviations and phrases that are wrapped in <i> when they
        # stand on their own.
        self.ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'et cetra', 'n.b.',
            'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]
        # Delimiter styles that mark emphasized text. Compound delimiters
        # come before the single character ones, which would otherwise
        # consume part of the compound marker and make those patterns
        # unreachable.
        self.ITALICIZE_STYLE_PATS = [
            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
            r'(?msu)_/(?P<words>[^<>]+?)/_',
            r'(?msu)_\*(?P<words>.+?)\*_',
            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
            r'(?msu)/:(?P<words>[^<>]+?):/',
            r'(?msu)_(?P<words>.+?)_',
            r'(?msu)/(?P<words>[^<>]+?)/',
            r'(?msu)~~(?P<words>.+?)~~',
            r'(?msu)\*(?P<words>.+?)\*',
            r'(?msu)~(?P<words>.+?)~',
            r'(?msu)\|:(?P<words>.+?):\|',
        ]

    def process_paragraph(self, paragraph):
        '''
        Return ``paragraph`` with the entries of ITALICIZE_WORDS and any
        text enclosed in the ITALICIZE_STYLE_PATS delimiters wrapped in
        <i> tags.
        '''
        # A plain str.replace() also hits the tail of longer words
        # ('movie.' -> 'mov<i>ie.</i>', 'PPS.' -> 'P<i>PS.</i>'). Match
        # all entries in one pass and require that no word character
        # touches the match on either side.
        if self.ITALICIZE_WORDS:
            alternatives = '|'.join(
                re.escape(word)
                for word in sorted(self.ITALICIZE_WORDS, key=len, reverse=True))
            word_pat = r'(?u)(?<!\w)(?:' + alternatives + r')(?!\w)'
            paragraph = re.sub(
                word_pat, lambda mo: '<i>%s</i>' % mo.group(0), paragraph)
        for pat in self.ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph

    def convert(self, txt, title='', epub_split_size_kb=0):
        '''
        Convert ``txt`` to an HTML document.

        The text is passed through clean_txt and split_txt, then split on
        blank lines. Each resulting chunk is XML-escaped, italicized with
        process_paragraph and wrapped in a <p> tag. The result is placed
        in HTML_TEMPLATE together with ``title`` and finally run through
        PreProcessor.markup_chapters.

        :param txt: The plain text to convert.
        :param title: Title substituted into HTML_TEMPLATE.
        :param epub_split_size_kb: Passed unchanged to split_txt.
        :return: The generated HTML.
        '''
        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
        txt = clean_txt(txt)
        txt = split_txt(txt, epub_split_size_kb)

        processed = []
        for line in txt.split('\n\n'):
            # Line breaks inside a paragraph become plain spaces.
            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))

        txt = u'\n'.join(processed)
        # Collapse runs of spaces into a single space.
        txt = re.sub('[ ]{2,}', ' ', txt)
        html = HTML_TEMPLATE % (title, txt)

        from calibre.ebooks.conversion.utils import PreProcessor
        pp = PreProcessor()
        html = pp.markup_chapters(html, pp.get_word_count(html), False)

        return html
|
|
@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
convert_heuristic, normalize_line_endings, convert_textile
|
normalize_line_endings, convert_textile
|
||||||
from calibre import _ent_pat, xml_entity_to_unicode
|
from calibre import _ent_pat, xml_entity_to_unicode
|
||||||
|
|
||||||
class TXTInput(InputFormatPlugin):
|
class TXTInput(InputFormatPlugin):
|
||||||
@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
|
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||||
|
|
||||||
if options.formatting_type == 'heuristic':
|
if options.formatting_type == 'heuristic':
|
||||||
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
setattr(options, 'enable_heuristics', True)
|
||||||
else:
|
setattr(options, 'markup_chapter_headings', True)
|
||||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
setattr(options, 'italicize_common_cases', True)
|
||||||
|
setattr(options, 'fix_indents', True)
|
||||||
|
setattr(options, 'delete_blank_paragraphs', True)
|
||||||
|
setattr(options, 'format_scene_breaks', True)
|
||||||
|
setattr(options, 'dehyphenate', True)
|
||||||
|
|
||||||
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
# Dehyphenate in cleanup mode for missed txt and markdown conversion
|
||||||
dehyphenator = Dehyphenator()
|
dehyphenator = Dehyphenator()
|
||||||
|
@ -12,7 +12,6 @@ import os, re
|
|||||||
|
|
||||||
from calibre import prepare_string_for_xml, isbytestring
|
from calibre import prepare_string_for_xml, isbytestring
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
|
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
from calibre.ebooks.conversion.preprocess import DocAnalysis
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
|
||||||
@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
|||||||
|
|
||||||
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
||||||
|
|
||||||
def convert_heuristic(txt, title='', epub_split_size_kb=0):
    '''
    Convert ``txt`` to HTML by delegating to a new TXTHeuristicProcessor.

    All three arguments are forwarded unchanged to its convert() method
    and that method's result is returned.
    '''
    return TXTHeuristicProcessor().convert(txt, title, epub_split_size_kb)
|
|
||||||
|
|
||||||
def convert_markdown(txt, title='', disable_toc=False):
|
def convert_markdown(txt, title='', disable_toc=False):
|
||||||
from calibre.ebooks.markdown import markdown
|
from calibre.ebooks.markdown import markdown
|
||||||
md = markdown.Markdown(
|
md = markdown.Markdown(
|
||||||
|
@ -24,25 +24,19 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
|
|||||||
)
|
)
|
||||||
self.db, self.book_id = db, book_id
|
self.db, self.book_id = db, book_id
|
||||||
self.initialize_options(get_option, get_help, db, book_id)
|
self.initialize_options(get_option, get_help, db, book_id)
|
||||||
self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
|
self.opt_sr1_search.set_msg(_('Regular Expression'))
|
||||||
self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
|
self.opt_sr2_search.set_msg(_('Regular Expression'))
|
||||||
self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
|
self.opt_sr3_search.set_msg(_('Regular Expression'))
|
||||||
self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
|
|
||||||
self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
|
|
||||||
self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
|
|
||||||
|
|
||||||
def break_cycles(self):
|
def break_cycles(self):
|
||||||
Widget.break_cycles(self)
|
Widget.break_cycles(self)
|
||||||
|
|
||||||
self.opt_sr1_search.break_cycles()
|
self.opt_sr1_search.break_cycles()
|
||||||
self.opt_sr1_replace.break_cycles()
|
|
||||||
self.opt_sr2_search.break_cycles()
|
self.opt_sr2_search.break_cycles()
|
||||||
self.opt_sr2_replace.break_cycles()
|
|
||||||
self.opt_sr3_search.break_cycles()
|
self.opt_sr3_search.break_cycles()
|
||||||
self.opt_sr3_replace.break_cycles()
|
|
||||||
|
|
||||||
def pre_commit_check(self):
|
def pre_commit_check(self):
|
||||||
for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
|
for x in ('sr1-search', 'sr2-search', 'sr3-search'):
|
||||||
x = getattr(self, 'opt_'+x)
|
x = getattr(self, 'opt_'+x)
|
||||||
try:
|
try:
|
||||||
pat = unicode(x.regex)
|
pat = unicode(x.regex)
|
||||||
|
@ -13,24 +13,72 @@
|
|||||||
<property name="windowTitle">
|
<property name="windowTitle">
|
||||||
<string>Form</string>
|
<string>Form</string>
|
||||||
</property>
|
</property>
|
||||||
<layout class="QVBoxLayout" name="verticalLayout">
|
<layout class="QGridLayout" name="gridLayout">
|
||||||
<item>
|
<item row="0" column="1">
|
||||||
|
<widget class="QLabel" name="label_4">
|
||||||
|
<property name="text">
|
||||||
|
<string>Search</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="0" column="2">
|
||||||
|
<widget class="QLabel" name="label_5">
|
||||||
|
<property name="text">
|
||||||
|
<string>Replace</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="0">
|
||||||
|
<widget class="QLabel" name="label">
|
||||||
|
<property name="text">
|
||||||
|
<string>1.</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="1" column="1">
|
||||||
<widget class="RegexEdit" name="opt_sr1_search" native="true"/>
|
<widget class="RegexEdit" name="opt_sr1_search" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item row="1" column="2">
|
||||||
<widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
|
<widget class="QLineEdit" name="opt_sr1_replace"/>
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item row="2" column="0">
|
||||||
|
<widget class="QLabel" name="label_2">
|
||||||
|
<property name="text">
|
||||||
|
<string>2.</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="2" column="1">
|
||||||
<widget class="RegexEdit" name="opt_sr2_search" native="true"/>
|
<widget class="RegexEdit" name="opt_sr2_search" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item row="2" column="2">
|
||||||
<widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
|
<widget class="QLineEdit" name="opt_sr2_replace"/>
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item row="3" column="0">
|
||||||
|
<widget class="QLabel" name="label_3">
|
||||||
|
<property name="text">
|
||||||
|
<string>3.</string>
|
||||||
|
</property>
|
||||||
|
</widget>
|
||||||
|
</item>
|
||||||
|
<item row="3" column="1">
|
||||||
<widget class="RegexEdit" name="opt_sr3_search" native="true"/>
|
<widget class="RegexEdit" name="opt_sr3_search" native="true"/>
|
||||||
</item>
|
</item>
|
||||||
<item>
|
<item row="3" column="2">
|
||||||
<widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
|
<widget class="QLineEdit" name="opt_sr3_replace"/>
|
||||||
|
</item>
|
||||||
|
<item row="4" column="1">
|
||||||
|
<spacer name="verticalSpacer">
|
||||||
|
<property name="orientation">
|
||||||
|
<enum>Qt::Vertical</enum>
|
||||||
|
</property>
|
||||||
|
<property name="sizeHint" stdset="0">
|
||||||
|
<size>
|
||||||
|
<width>20</width>
|
||||||
|
<height>330</height>
|
||||||
|
</size>
|
||||||
|
</property>
|
||||||
|
</spacer>
|
||||||
</item>
|
</item>
|
||||||
</layout>
|
</layout>
|
||||||
</widget>
|
</widget>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user