mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
doc tweaks, delete empty paragraphs during Heuristics
This commit is contained in:
parent
a53f1148c2
commit
6e64f5ec4e
@ -367,6 +367,8 @@ class HeuristicProcessor(object):
|
|||||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||||
# Delete microsoft 'smart' tags
|
# Delete microsoft 'smart' tags
|
||||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||||
|
# Delete self closing paragraph tags
|
||||||
|
html = re.sub('<p\s?/>', '', html)
|
||||||
# Get rid of empty span, bold, font, em, & italics tags
|
# Get rid of empty span, bold, font, em, & italics tags
|
||||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||||
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
|
||||||
|
@ -587,11 +587,11 @@ TXT input supports a number of options to differentiate how paragraphs are detec
|
|||||||
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
|
||||||
the next line that starts with an indent is reached::
|
the next line that starts with an indent is reached::
|
||||||
|
|
||||||
This is the
|
This is the
|
||||||
first.
|
first.
|
||||||
This is the second.
|
This is the second.
|
||||||
|
|
||||||
This is the
|
This is the
|
||||||
third.
|
third.
|
||||||
|
|
||||||
:guilabel:`Paragraph Style: Unformatted`
|
:guilabel:`Paragraph Style: Unformatted`
|
||||||
@ -603,7 +603,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec
|
|||||||
formatting will be applied.
|
formatting will be applied.
|
||||||
|
|
||||||
:guilabel:`Formatting Style: Heuristic`
|
:guilabel:`Formatting Style: Heuristic`
|
||||||
Analyses the document for common chapter headings, scene breaks, and italicized words and applies the
|
Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the
|
||||||
appropriate html markup during conversion.
|
appropriate html markup during conversion.
|
||||||
|
|
||||||
:guilabel:`Formatting Style: Markdown`
|
:guilabel:`Formatting Style: Markdown`
|
||||||
|
Loading…
x
Reference in New Issue
Block a user