mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Blockquote and paragraph tags also conflict in MOBI files: lxml allows <p> inside <blockquote>, but not the other way around.
This commit is contained in:
parent
8c15219933
commit
15e6c1d212
@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import re
|
import re
|
||||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
|
from calibre.utils.wordcount import get_wordcount_obj
|
||||||
|
|
||||||
class PreProcessor(object):
|
class PreProcessor(object):
|
||||||
|
|
||||||
@ -168,9 +169,21 @@ class PreProcessor(object):
|
|||||||
#print "blanks between paragraphs is marked True"
|
#print "blanks between paragraphs is marked True"
|
||||||
else:
|
else:
|
||||||
blanks_between_paragraphs = False
|
blanks_between_paragraphs = False
|
||||||
|
|
||||||
|
# Count the words in the document to estimate how many chapters to look for
|
||||||
|
word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
|
||||||
|
word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
|
||||||
|
wordcount = get_wordcount_obj(word_count_text)
|
||||||
|
|
||||||
|
|
||||||
#self.dump(html, 'before_chapter_markup')
|
#self.dump(html, 'before_chapter_markup')
|
||||||
# detect chapters/sections to match xpath or splitting logic
|
# detect chapters/sections to match xpath or splitting logic
|
||||||
#
|
#
|
||||||
|
min_chapters = 10
|
||||||
|
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||||
|
self.html_preprocess_sections = len(heading.findall(html))
|
||||||
|
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||||
|
|
||||||
# Build the Regular Expressions in pieces
|
# Build the Regular Expressions in pieces
|
||||||
init_lookahead = "(?=<(p|div))"
|
init_lookahead = "(?=<(p|div))"
|
||||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||||
@ -193,11 +206,6 @@ class PreProcessor(object):
|
|||||||
|
|
||||||
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
||||||
|
|
||||||
min_chapters = 10
|
|
||||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
|
||||||
self.html_preprocess_sections = len(heading.findall(html))
|
|
||||||
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
|
||||||
|
|
||||||
chapter_types = [
|
chapter_types = [
|
||||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||||
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||||
@ -222,6 +230,8 @@ class PreProcessor(object):
|
|||||||
|
|
||||||
html = chapdetect.sub(self.chapter_head, html)
|
html = chapdetect.sub(self.chapter_head, html)
|
||||||
|
|
||||||
|
words_per_chptr = wordcount.words / self.html_preprocess_sections
|
||||||
|
print "wordcount is: "+ str(wordcount.words)+", Average words per chapter is: "+str(words_per_chptr)+", Marked "+str(self.html_preprocess_sections)+" chapters"
|
||||||
|
|
||||||
###### Unwrap lines ######
|
###### Unwrap lines ######
|
||||||
#
|
#
|
||||||
|
@ -480,6 +480,9 @@ class MobiReader(object):
|
|||||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||||
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
||||||
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
|
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', '\g<styletags>'+'\g<para>', self.processed_html)
|
||||||
|
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</blockquote[^>]*>\s*){1,})(?P<para></p[^>]*>)', '\g<para>'+'\g<blockquote>', self.processed_html)
|
||||||
|
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<blockquote[^>]*>\s*){1,})', '\g<blockquote>'+'\g<para>', self.processed_html)
|
||||||
|
|
||||||
|
|
||||||
def remove_random_bytes(self, html):
|
def remove_random_bytes(self, html):
|
||||||
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
|
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08',
|
||||||
|
83
src/calibre/utils/wordcount.py
Normal file
83
src/calibre/utils/wordcount.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
"""
|
||||||
|
Get word, character, and Asian character counts
|
||||||
|
|
||||||
|
1. Get a word count as a dictionary:
|
||||||
|
wc = get_wordcount(text)
|
||||||
|
words = wc['words'] # etc.
|
||||||
|
|
||||||
|
2. Get a word count as an object
|
||||||
|
wc = get_wordcount_obj(text)
|
||||||
|
words = wc.words # etc.
|
||||||
|
|
||||||
|
properties counted:
|
||||||
|
* characters
|
||||||
|
* chars_no_spaces
|
||||||
|
* asian_chars
|
||||||
|
* non_asian_words
|
||||||
|
* words
|
||||||
|
|
||||||
|
Python License
|
||||||
|
"""
|
||||||
|
__version__ = 0.1
|
||||||
|
__author__ = "Ryan Ginstrom"
|
||||||
|
|
||||||
|
IDEOGRAPHIC_SPACE = 0x3000


def is_asian(char):
    """Report whether ``char`` sits above the ideographic space.

    U+3000 is the ideographic (i.e. double-byte) space; every code point
    strictly greater than it is treated as an Asian character.

    @param char: A single character
    """
    code_point = ord(char)
    return code_point > IDEOGRAPHIC_SPACE
|
||||||
|
|
||||||
|
def filter_jchars(c):
    """Replace an Asian character with a space; return anything else as is.

    @param c: A single character
    """
    return ' ' if is_asian(c) else c
|
||||||
|
|
||||||
|
def nonj_len(word):
    u"""Count the non-Asian words found in {word}
    - 日本語AアジアンB -> 2
    - hello -> 1
    @param word: A word, possibly containing Asian characters
    """
    # Walk-through for 本spam日eggs:
    #   1. every character goes through filter_jchars, so the Asian ones
    #      become spaces: [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
    #   2. the pieces are glued back together: ' spam eggs'
    #   3. whitespace splitting leaves ['spam', 'eggs'], i.e. 2 words
    blanked = u''.join(map(filter_jchars, word))
    remaining_words = blanked.split()
    return len(remaining_words)
|
||||||
|
|
||||||
|
def get_wordcount(text):
    """Compute the word and character statistics for text

    The returned dictionary has the keys ``characters``,
    ``chars_no_spaces``, ``asian_chars``, ``non_asian_words`` and ``words``.

    @param text: The text of the segment
    """
    counts = {}
    counts['characters'] = len(text)
    counts['chars_no_spaces'] = sum(1 for ch in text if not ch.isspace())
    counts['asian_chars'] = sum(1 for ch in text if is_asian(ch))
    counts['non_asian_words'] = nonj_len(text)
    # Each Asian character counts as one word of its own.
    counts['words'] = counts['non_asian_words'] + counts['asian_chars']
    return counts
|
||||||
|
|
||||||
|
def dict2obj(dictionary):
    """Build an object whose attributes mirror the entries of a dictionary

    @param dictionary: Mapping of attribute names to values
    """
    class Obj(object):
        def __init__(self, mapping):
            # Copy every key/value pair straight onto the instance.
            vars(self).update(mapping)

    wrapped = Obj(dictionary)
    return wrapped
|
||||||
|
|
||||||
|
def get_wordcount_obj(text):
    """Return the word count for text as an object instead of a dictionary

    @param text: The text of the segment
    """
    counts = get_wordcount(text)
    return dict2obj(counts)
|
Loading…
x
Reference in New Issue
Block a user