Mirror of https://github.com/kovidgoyal/calibre.git
Commit 7a0f6ec510: merge from trunk
@@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
     __author__ = 'Darko Miletic'
     description = 'Title says it all'
     publisher = "The Philosophers' Magazine"
+    recipe_disabled = ('This recipe has been disabled as the website has'
+            ' started providing articles only in PDF form')
     category = 'philosophy, news'
     oldest_article = 25
     max_articles_per_feed = 200
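Note: disabling a recipe is done declaratively, by assigning a message to the
recipe_disabled class attribute. A minimal sketch of the pattern (behaviour
assumed from this diff: calibre skips the download and shows the message when
the attribute is set):

    from calibre.web.feeds.news import BasicNewsRecipe

    class DisabledExample(BasicNewsRecipe):
        title = 'Example'
        # Any non-empty string marks the recipe as disabled; the text is
        # shown to the user instead of fetching articles.
        recipe_disabled = ('This recipe has been disabled as the website has'
                ' started providing articles only in PDF form')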
@@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en'
 
 import copy
 from functools import partial
-from collections import defaultdict
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack
 
 import cssutils
 from lxml import etree
 
 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
     extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
@@ -31,11 +34,14 @@ class KF8Writer(object):
 
     def __init__(self, oeb, opts, resources):
         self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.compress = not self.opts.dont_compress
         self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
         self.dup_data()
         self.flows = [None] # First flow item is reserved for the text
+        self.records = []
+        self.fdst_table = []
 
         self.replace_resource_links()
         self.extract_css_into_flows()
@@ -43,6 +49,10 @@ class KF8Writer(object):
         self.replace_internal_links_with_placeholders()
         self.insert_aid_attributes()
         self.chunk_it_up()
+        # Dump the cloned data as it is no longer needed
+        del self._data_cache
+        self.create_text_records()
+        self.create_fdst_table()
 
     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -165,7 +175,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for i, item in enumerate(self.oeb.spine):
+        for item in self.oeb.spine:
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -174,8 +184,7 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
-                            to_href(count))
+                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
 
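Note: internal links are first written as fixed-width placeholders of the form
kindle:pos:fid:FFFF:off:OOOOOOOOOO, so they can be patched in place later
without shifting any byte offsets. Both fields are base-32. A standalone
sketch of the encoding, assuming to_href behaves like
calibre.ebooks.mobi.utils.to_base with base=32, zero-padded to ten digits:

    import string
    from functools import partial

    def to_base(num, base=32, min_num_digits=None):
        # Upper-case base-32 digits: 0-9 then A-V
        digits = string.digits + string.ascii_uppercase
        ans = []
        while True:
            ans.append(digits[num % base])
            num //= base
            if num == 0:
                break
        if min_num_digits is not None:
            ans.extend('0' * (min_num_digits - len(ans)))
        return ''.join(reversed(ans))

    to_href = partial(to_base, base=32, min_num_digits=10)
    print('kindle:pos:fid:%04d:off:%s' % (0, to_href(1234)))
    # kindle:pos:fid:0000:off:000000016I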
@@ -191,9 +200,9 @@ class KF8Writer(object):
                 aid = aidbase + j
                 tag.attrib['aid'] = to_base(aid, base=32)
                 if tag.tag == XHTML('body'):
-                    self.id_map[(item.href, '')] = tag.attrib['aid']
+                    self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
                 if id_ is not None:
-                    self.id_map[(item.href, id_)] = tag.attrib['aid']
+                    self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
 
                 j += 1
 
@@ -205,12 +214,47 @@ class KF8Writer(object):
             if aid is None:
                 aid = self.id_map.get((href, ''))
             placeholder_map[placeholder] = aid
-        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
-                placeholder_map)
+        chunker = Chunker(self.oeb, self.data, placeholder_map)
 
-        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
-                'last_text_record_idx', 'first_non_text_record_idx',
-                'text_length'):
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
             setattr(self, x, getattr(chunker, x))
 
+        self.flows[0] = chunker.text
+
+    def create_text_records(self):
+        self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
+                in self.flows]
+        text = b''.join(self.flows)
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info(' Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next records starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
+
+    def create_fdst_table(self):
+        FDST = namedtuple('Flow', 'start end')
+        for i, flow in enumerate(self.flows):
+            start = 0 if i == 0 else self.fdst_table[-1].end
+            self.fdst_table.append(FDST(start, start + len(flow)))
 
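Note: create_text_records splits the concatenated flows into MOBI text
records. Each record is the (optionally PalmDoc-compressed) payload, followed
by the overlap bytes that complete a multibyte character split at the record
boundary, followed by a single byte giving the overlap length. A toy sketch of
that framing, with a stand-in for create_text_record (the real one lives in
calibre.ebooks.mobi.utils and computes a genuine overlap; the 4096-byte record
size is an assumption):

    from io import BytesIO
    from struct import pack

    RECORD_SIZE = 4096  # uncompressed bytes of text per record

    def split_into_records(raw, compress=None):
        text, records, size = BytesIO(raw), [], 0
        while text.tell() < len(raw):
            data = text.read(RECORD_SIZE)
            overlap = b''  # real code: trailing bytes of a split character
            if compress is not None:
                data = compress(data)
            # payload + overlap + one length byte, as in the method above
            records.append(data + overlap + pack(b'>B', len(overlap)))
            size += len(records[-1])
        return records, size

    records, size = split_into_records(b'x' * 10000)
    print(len(records), size)  # 3 records, 10003 bytes before padding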
@@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from collections import namedtuple
-from io import BytesIO
-from struct import pack
 from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
-from calibre.ebooks.mobi.utils import create_text_record, to_base
-from calibre.ebooks.compression.palmdoc import compress_doc
+from calibre.ebooks.mobi.utils import to_base
 
 CHUNK_SIZE = 8192
 
@@ -159,17 +156,16 @@ class Skeleton(object):
 
 class Chunker(object):
 
-    def __init__(self, oeb, data_func, compress, placeholder_map):
+    def __init__(self, oeb, data_func, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
-        self.compress = compress
         self.placeholder_map = placeholder_map
 
         self.skeletons = []
 
         # Set this to a list to enable dumping of the original and rebuilt
         # html files for debugging
-        self.orig_dumps = None
+        orig_dumps = None
 
         for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
@@ -197,8 +193,8 @@ class Chunker(object):
             # for all chunks
             self.skeletons.append(Skeleton(i, item, root, chunks))
 
-        if self.orig_dumps:
-            self.dump()
+        if orig_dumps:
+            self.dump(orig_dumps)
 
         # Create the SKEL and Chunk tables
         self.skel_table = []
@@ -207,11 +203,7 @@ class Chunker(object):
 
         # Set internal links
         text = b''.join(x.raw_text for x in self.skeletons)
-        text = self.set_internal_links(text)
-
-        # Create text records
-        self.records = []
-        self.create_text_records(text)
+        self.text = self.set_internal_links(text)
 
     def remove_namespaces(self, root):
         lang = None
@@ -349,7 +341,12 @@ class Chunker(object):
         for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
             aid_map[match.group(1)] = match.start()
         self.aid_offset_map = aid_map
-        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+
+        def to_placeholder(x):
+            file_number, aid = x
+            return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
+
+        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}
 
         # Now update the links
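Note: id_map now stores (file_number, aid) pairs, and to_placeholder turns
each pair into the 15-byte '%04d:%s' tail (4-digit file number, a colon, and
the 10-digit base-32 offset of the target tag) that overwrites the dummy
fid/offset written earlier. A hypothetical walk-through with made-up values:

    def to_base32(num, width):  # stand-in for to_base(..., base=32)
        digits = '0123456789ABCDEFGHIJKLMNOPQRSTUV'
        out = ''
        while True:
            out = digits[num % 32] + out
            num //= 32
            if num == 0:
                break
        return out.rjust(width, '0')

    aid_map = {b'C3': 8201}       # aid -> byte offset of its tag in the text
    file_number, aid = 18, b'C3'  # what id_map stores for one link target
    print('%04d:%s' % (file_number, to_base32(aid_map[aid], 10)))
    # 0018:0000000809   (8201 == 8*32**2 + 9)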
@@ -357,42 +354,14 @@ class Chunker(object):
             raw = match.group()
             pl = match.group(1)
             try:
-                return raw[:-10] + placeholder_map[pl]
+                return raw[:-15] + placeholder_map[pl]
             except KeyError:
                 pass
             return raw
 
         return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
 
-    def create_text_records(self, text):
-        self.text_length = len(text)
-        text = BytesIO(text)
-        nrecords = 0
-        records_size = 0
-
-        if self.compress:
-            self.oeb.logger.info(' Compressing markup content...')
-
-        while text.tell() < self.text_length:
-            data, overlap = create_text_record(text)
-            if self.compress:
-                data = compress_doc(data)
-
-            data += overlap
-            data += pack(b'>B', len(overlap))
-
-            self.records.append(data)
-            records_size += len(data)
-            nrecords += 1
-
-        self.last_text_record_idx = nrecords
-        self.first_non_text_record_idx = nrecords + 1
-        # Pad so that the next records starts at a 4 byte boundary
-        if records_size % 4 != 0:
-            self.records.append(b'\x00'*(records_size % 4))
-            self.first_non_text_record_idx += 1
-
-    def dump(self):
+    def dump(self, orig_dumps):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
         self.log('Skeletons dumped to:', tdir)
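Note on the slice change above: the substitution used to rewrite only the
last 10 bytes of each matched placeholder (raw[:-10], the offset digits); it
now rewrites the file-number field as well. The replaced tail is 4 digits, a
colon, and 10 digits, which is exactly the width of to_placeholder's
'%04d:%s' output:

    tail = '%04d:%s' % (18, '0000000809')
    assert len(tail) == 4 + 1 + 10 == 15  # hence raw[:-15]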
@@ -402,10 +371,19 @@ class Chunker(object):
         rebuilt = os.path.join(tdir, 'rebuilt')
         for x in (orig, rebuilt):
             os.makedirs(x)
+        error = False
         for i, skeleton in enumerate(self.skeletons):
+            oraw, rraw = orig_dumps[i], skeleton.rebuild()
             with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
-                f.write(self.orig_dumps[i])
+                f.write(oraw)
             with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
-                f.write(skeleton.rebuild())
+                f.write(rraw)
+            if oraw != rraw:
+                error = True
+        if error:
+            raise ValueError('The before and after HTML differs. Run a diff '
+                    'tool on the orig and rebuilt directories')
+        else:
+            self.log('Skeleton HTML before and after is identical.')
 