Mirror of https://github.com/kovidgoyal/calibre.git

commit 25e7a5bb74 (parent 142c2b0314)

    DRYer
@@ -46,6 +46,7 @@ from calibre.ebooks.oeb.base import (
 )
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html
 from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
+from calibre.ebooks.oeb.polish.parsing import decode_xml
 from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
 from calibre.ebooks.oeb.polish.utils import OEB_FONTS, CommentFinder, PositionFinder, adjust_mime_for_epub, guess_type, insert_self_closing, parse_css
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@@ -168,36 +169,10 @@ class ContainerBase: # {{{
         :param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats.
         '''
-        def fix_data(d):
-            return d.replace('\r\n', '\n').replace('\r', '\n')
-        if isinstance(data, str):
-            return fix_data(data)
-        bom_enc = None
-        if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
-            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
-                       b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
-            data = data[4:]
-        elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
-            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
-            data = data[2:]
-        elif data[:3] == b'\xef\xbb\xbf':
-            bom_enc = 'utf-8'
-            data = data[3:]
-        if bom_enc is not None:
-            try:
-                self.used_encoding = bom_enc
-                return fix_data(data.decode(bom_enc))
-            except UnicodeDecodeError:
-                pass
-        try:
-            self.used_encoding = 'utf-8'
-            return fix_data(data.decode('utf-8'))
-        except UnicodeDecodeError:
-            pass
-        data, self.used_encoding = xml_to_unicode(data)
-        if normalize_to_nfc:
-            data = unicodedata.normalize('NFC', data)
-        return fix_data(data)
+        html, used_encoding = decode_xml(data, normalize_to_nfc)
+        if used_encoding:
+            self.used_encoding = used_encoding
+        return html
 
     def parse_xml(self, data):
         data, self.used_encoding = xml_to_unicode(
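After the refactor, ContainerBase.decode() keeps its public behaviour (return unicode text and remember the encoding that was used) but delegates the BOM sniffing and decode fallbacks to the decode_xml() helper imported above. A minimal sketch of the call pattern, assuming `container` is an existing container instance; the sample bytes are illustrative only:

```python
# Illustrative only: decode() still returns text and records the detected
# encoding, but the detection itself now lives in decode_xml().
raw = b'\xff\xfe' + '<html/>'.encode('utf-16-le')  # UTF-16-LE BOM + markup
text = container.decode(raw)        # '<html/>'
enc = container.used_encoding       # 'utf-16-le'
```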
@@ -18,6 +18,7 @@ from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML, XPath
 from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
+from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tts import lang_for_elem
 from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
 from calibre.spell.break_iterator import sentence_positions
@@ -176,11 +177,17 @@ def remove_kobo_markup_from_html(root):
     unwrap_body_contents(body)
 
 
-def kepubify_html(root, metadata_lang='en'):
+def kepubify_parsed_html(root, metadata_lang: str = 'en'):
     remove_kobo_markup_from_html(root)
     merge_multiple_html_heads_and_bodies(root)
     add_kobo_markup_to_html(root, metadata_lang)
 
 
-def kepubify(container):
+def kepubify_html_data(raw: str | bytes, metadata_lang: str = 'en'):
+    root = parse(raw)
+    kepubify_parsed_html(root, metadata_lang)
+    return root
+
+
+def kepubify_container(container):
     lang = container.mi.language
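The kepub entry points are now split by input type: kepubify_parsed_html() mutates an already-parsed tree in place, kepubify_html_data() parses raw markup and returns the transformed tree, and kepubify_container() (the renamed kepubify()) drives a whole book. A hedged usage sketch of the data-level helper; the sample markup is illustrative only:

```python
from calibre.ebooks.oeb.base import serialize
from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data

# Parse the markup, inject the Kobo-specific markup, and serialize it back out.
root = kepubify_html_data('<html><body><p>Hello</p></body></html>', metadata_lang='en')
kepub_markup = serialize(root, 'text/html')  # bytes with the Kobo markup added
```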
@@ -5,6 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import re
+import unicodedata
 
 import html5_parser
 from lxml.etree import Element as LxmlElement
@@ -23,6 +24,40 @@ except ImportError:
 XHTML_NS = 'http://www.w3.org/1999/xhtml'
 
 
+def decode_xml(data: bytes, normalize_to_nfc: bool = True) -> tuple[str, str]:
+    used_encoding = ''
+
+    def fix_data(d):
+        return d.replace('\r\n', '\n').replace('\r', '\n'), used_encoding
+    if isinstance(data, str):
+        return fix_data(data)
+    bom_enc = None
+    if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
+        bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
+                   b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
+        data = data[4:]
+    elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
+        bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
+        data = data[2:]
+    elif data[:3] == b'\xef\xbb\xbf':
+        bom_enc = 'utf-8'
+        data = data[3:]
+    if bom_enc is not None:
+        try:
+            used_encoding = bom_enc
+            return fix_data(data.decode(bom_enc))
+        except UnicodeDecodeError:
+            pass
+    try:
+        used_encoding = 'utf-8'
+        return fix_data(data.decode('utf-8'))
+    except UnicodeDecodeError:
+        pass
+    data, used_encoding = xml_to_unicode(data)
+    if normalize_to_nfc:
+        data = unicodedata.normalize('NFC', data)
+    return fix_data(data)
+
+
 def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
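decode_xml() is the logic that previously lived inline in ContainerBase.decode(): it strips any Unicode BOM, tries the BOM encoding and then UTF-8, falls back to xml_to_unicode(), and normalizes newlines, returning both the text and the encoding it settled on (an empty string when the input was already a str). A minimal sketch; the sample bytes are illustrative only:

```python
from calibre.ebooks.oeb.polish.parsing import decode_xml

raw = b'\xef\xbb\xbf<?xml version="1.0"?><root/>'  # UTF-8 BOM followed by markup
text, used_encoding = decode_xml(raw)
# text: BOM stripped, '\r\n' and '\r' newlines normalized to '\n'
# used_encoding: 'utf-8' here; '' when a str is passed straight through
```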
@@ -3,8 +3,7 @@
 
 
 from calibre.ebooks.oeb.base import serialize
-from calibre.ebooks.oeb.polish.kepubify import kepubify_html
-from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
+from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 
 
@@ -38,8 +37,7 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             # skipped tags
         }.items():
             with self.subTest(src=src):
-                root = parse(src)
-                kepubify_html(root)
+                root = kepubify_html_data(src)
                 actual = serialize(root, 'text/html').decode('utf-8')
                 actual = actual[len(prefix):-len(suffix)]
                 self.assertEqual(expected, actual)