mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Replace all entities by their unicode equivalents in pre-processing stage.
This commit is contained in:
parent
893863a670
commit
0daa63e395
@ -22,7 +22,6 @@ and to Falstaff for pylrs.
|
||||
"""
|
||||
import os, re, sys, copy, glob, logging, tempfile
|
||||
from collections import deque
|
||||
from htmlentitydefs import name2codepoint
|
||||
from urllib import unquote
|
||||
from urlparse import urlparse
|
||||
from math import ceil, floor
|
||||
@ -38,7 +37,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream,
|
||||
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
|
||||
LrsError, Sup, Sub, properties_different, EmpLine
|
||||
from libprs500.ebooks.lrf.pylrs.pylrs import Span
|
||||
from libprs500.ebooks.lrf import Book
|
||||
from libprs500.ebooks.lrf import Book, entity_to_unicode
|
||||
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
|
||||
from libprs500.ebooks import ConversionError
|
||||
from libprs500.ebooks.lrf.html.table import Table
|
||||
@ -65,16 +64,10 @@ def munge_paths(basepath, url):
|
||||
return os.path.normpath(path), fragment
|
||||
|
||||
|
||||
|
||||
|
||||
class HTMLConverter(object):
|
||||
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
||||
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
|
||||
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
|
||||
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo']
|
||||
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
|
||||
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
|
||||
ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")]
|
||||
|
||||
|
||||
MARKUP_MASSAGE = [
|
||||
@ -89,8 +82,10 @@ class HTMLConverter(object):
|
||||
(re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE),
|
||||
lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())),
|
||||
# Workaround bug in BeautifulSoup &nbsp; handling
|
||||
(re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff')
|
||||
]
|
||||
(re.compile(u' | | |\xa0', re.IGNORECASE), lambda match : u'\uffff'),
|
||||
# Replace entities
|
||||
(re.compile(ur'&(\S+?);'), entity_to_unicode),
|
||||
]
|
||||
# Fix Baen markup
|
||||
BAEN = [
|
||||
(re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE),
|
||||
@ -523,9 +518,6 @@ class HTMLConverter(object):
|
||||
text += c['alt']
|
||||
return text
|
||||
text += self.get_text(c)
|
||||
if text:
|
||||
for rule, sub in self.__class__.ENTITY_RULES:
|
||||
text = rule.sub(sub, text)
|
||||
return text
|
||||
|
||||
def process_links(self):
|
||||
@ -740,8 +732,6 @@ class HTMLConverter(object):
|
||||
|
||||
def append_text(src):
|
||||
fp, key, variant = self.font_properties(css)
|
||||
for pat, repl in self.__class__.ENTITY_RULES:
|
||||
src = pat.sub(repl, src)
|
||||
src = src.replace(u'\uffff', ' ') # &nbsp; becomes u'\uffff'
|
||||
normal_font_size = int(fp['fontsize'])
|
||||
if variant == 'small-caps':
|
||||
|
Loading…
x
Reference in New Issue
Block a user