merge from trunk

Lee 2012-04-21 00:23:20 +08:00
commit 7a0f6ec510
3 changed files with 83 additions and 59 deletions

View File

@@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
     __author__ = 'Darko Miletic'
     description = 'Title says it all'
     publisher = "The Philosophers' Magazine"
+    recipe_disabled = ('This recipe has been disabled as the website has'
+        ' started providing articles only in PDF form')
     category = 'philosophy, news'
     oldest_article = 25
     max_articles_per_feed = 200
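For context, `recipe_disabled` is the conventional way to retire a builtin news recipe: when it is set to a string, calibre declines to run the download and shows the message to the user instead. A minimal sketch of the pattern (hypothetical recipe name; the exact user-facing handling depends on the calibre version):

    from calibre.web.feeds.news import BasicNewsRecipe

    class DisabledExample(BasicNewsRecipe):
        # Hypothetical recipe showing the disable pattern used above; the
        # message is what the user sees when attempting a download.
        title = 'Disabled Example'
        recipe_disabled = ('This recipe has been disabled as the website has'
            ' started providing articles only in PDF form')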

View File

@@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en'
 import copy
 from functools import partial
-from collections import defaultdict
+from collections import defaultdict, namedtuple
+from io import BytesIO
+from struct import pack
 
 import cssutils
 from lxml import etree
 
 from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
@@ -31,11 +34,14 @@ class KF8Writer(object):
 
     def __init__(self, oeb, opts, resources):
         self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.compress = not self.opts.dont_compress
         self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
         self.dup_data()
         self.flows = [None] # First flow item is reserved for the text
+        self.records = []
+        self.fdst_table = []
 
         self.replace_resource_links()
         self.extract_css_into_flows()
@@ -43,6 +49,10 @@ class KF8Writer(object):
         self.replace_internal_links_with_placeholders()
         self.insert_aid_attributes()
         self.chunk_it_up()
+        # Dump the cloned data as it is no longer needed
+        del self._data_cache
+        self.create_text_records()
+        self.create_fdst_table()
 
     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -165,7 +175,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for i, item in enumerate(self.oeb.spine):
+        for item in self.oeb.spine:
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -174,8 +184,7 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
-                            to_href(count))
+                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
 
@@ -191,9 +200,9 @@ class KF8Writer(object):
                     aid = aidbase + j
                     tag.attrib['aid'] = to_base(aid, base=32)
                     if tag.tag == XHTML('body'):
-                        self.id_map[(item.href, '')] = tag.attrib['aid']
+                        self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
                     if id_ is not None:
-                        self.id_map[(item.href, id_)] = tag.attrib['aid']
+                        self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
                     j += 1
 
@@ -205,12 +214,47 @@ class KF8Writer(object):
             if aid is None:
                 aid = self.id_map.get((href, ''))
             placeholder_map[placeholder] = aid
-        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
-                placeholder_map)
+        chunker = Chunker(self.oeb, self.data, placeholder_map)
 
-        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
-                'last_text_record_idx', 'first_non_text_record_idx',
-                'text_length'):
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
             setattr(self, x, getattr(chunker, x))
 
+        self.flows[0] = chunker.text
+
+    def create_text_records(self):
+        self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
+                in self.flows]
+        text = b''.join(self.flows)
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info(' Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next record starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
+
+    def create_fdst_table(self):
+        FDST = namedtuple('Flow', 'start end')
+        for i, flow in enumerate(self.flows):
+            start = 0 if i == 0 else self.fdst_table[-1].end
+            self.fdst_table.append(FDST(start, start + len(flow)))
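The two methods added above take over work that previously lived in the Chunker: create_text_records slices the concatenated flows into MOBI text records (PalmDoc-compressing each one when enabled, then appending the multibyte overlap plus a trailing byte holding its length), and create_fdst_table records the (start, end) byte span of every flow in the uncompressed text. A self-contained sketch of the FDST bookkeeping, using hypothetical flow data:

    from collections import namedtuple

    # Sketch of the offset bookkeeping in create_fdst_table() above: each
    # flow occupies a half-open [start, end) byte range in the concatenated
    # text, and consecutive entries abut exactly.
    FDST = namedtuple('Flow', 'start end')

    def fdst_entries(flows):
        table = []
        for flow in flows:
            start = 0 if not table else table[-1].end
            table.append(FDST(start, start + len(flow)))
        return table

    # Hypothetical flows: flow 0 is the markup text, later flows hold
    # things like extracted CSS.
    flows = [b'<html>...</html>', b'body { color: black }', b'<svg/>']
    print(fdst_entries(flows))
    # [Flow(start=0, end=16), Flow(start=16, end=37), Flow(start=37, end=43)]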

View File

@@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en'
 import re
 from collections import namedtuple
-from io import BytesIO
-from struct import pack
 from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
-from calibre.ebooks.mobi.utils import create_text_record, to_base
-from calibre.ebooks.compression.palmdoc import compress_doc
+from calibre.ebooks.mobi.utils import to_base
 
 CHUNK_SIZE = 8192
@@ -159,17 +156,16 @@ class Skeleton(object):
 
 class Chunker(object):
 
-    def __init__(self, oeb, data_func, compress, placeholder_map):
+    def __init__(self, oeb, data_func, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
-        self.compress = compress
         self.placeholder_map = placeholder_map
 
         self.skeletons = []
 
         # Set this to a list to enable dumping of the original and rebuilt
         # html files for debugging
-        self.orig_dumps = None
+        orig_dumps = None
 
         for i, item in enumerate(self.oeb.spine):
             root = self.remove_namespaces(self.data(item))
@@ -197,8 +193,8 @@ class Chunker(object):
             # for all chunks
             self.skeletons.append(Skeleton(i, item, root, chunks))
 
-        if self.orig_dumps:
-            self.dump()
+        if orig_dumps:
+            self.dump(orig_dumps)
 
         # Create the SKEL and Chunk tables
         self.skel_table = []
@@ -207,11 +203,7 @@ class Chunker(object):
 
         # Set internal links
         text = b''.join(x.raw_text for x in self.skeletons)
-        text = self.set_internal_links(text)
-
-        # Create text records
-        self.records = []
-        self.create_text_records(text)
+        self.text = self.set_internal_links(text)
 
     def remove_namespaces(self, root):
         lang = None
@@ -349,7 +341,12 @@ class Chunker(object):
         for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
             aid_map[match.group(1)] = match.start()
         self.aid_offset_map = aid_map
-        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+
+        def to_placeholder(x):
+            file_number, aid = x
+            return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
+
+        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
                 self.placeholder_map.iteritems()}
 
         # Now update the links
@@ -357,42 +354,14 @@ class Chunker(object):
             raw = match.group()
             pl = match.group(1)
             try:
-                return raw[:-10] + placeholder_map[pl]
+                return raw[:-15] + placeholder_map[pl]
             except KeyError:
                 pass
             return raw
 
         return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
 
-    def create_text_records(self, text):
-        self.text_length = len(text)
-        text = BytesIO(text)
-        nrecords = 0
-        records_size = 0
-
-        if self.compress:
-            self.oeb.logger.info(' Compressing markup content...')
-
-        while text.tell() < self.text_length:
-            data, overlap = create_text_record(text)
-            if self.compress:
-                data = compress_doc(data)
-
-            data += overlap
-            data += pack(b'>B', len(overlap))
-
-            self.records.append(data)
-            records_size += len(data)
-            nrecords += 1
-
-        self.last_text_record_idx = nrecords
-        self.first_non_text_record_idx = nrecords + 1
-        # Pad so that the next record starts at a 4 byte boundary
-        if records_size % 4 != 0:
-            self.records.append(b'\x00'*(records_size % 4))
-            self.first_non_text_record_idx += 1
-
-    def dump(self):
+    def dump(self, orig_dumps):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
         self.log('Skeletons dumped to:', tdir)
@@ -402,10 +371,19 @@ class Chunker(object):
         rebuilt = os.path.join(tdir, 'rebuilt')
         for x in (orig, rebuilt):
             os.makedirs(x)
+        error = False
         for i, skeleton in enumerate(self.skeletons):
+            oraw, rraw = orig_dumps[i], skeleton.rebuild()
             with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
-                f.write(self.orig_dumps[i])
+                f.write(oraw)
             with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
-                f.write(skeleton.rebuild())
+                f.write(rraw)
+            if oraw != rraw:
+                error = True
+        if error:
+            raise ValueError('The before and after HTML differs. Run a diff '
+                    'tool on the orig and rebuilt directories')
+        else:
+            self.log('Skeleton HTML before and after is identical.')
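A note on the raw[:-10] to raw[:-15] change above: to_placeholder now emits '%04d:%s', i.e. a four-digit file number, a colon, and the ten-character offset, fifteen characters in all, so the substitution rewrites fifteen characters of the matched link instead of only the ten-character offset. The scheme relies on every placeholder being fixed-width: links are first written with a dummy fid and a zero-padded base-32 offset, then patched in the serialized bytes without shifting any offset computed beforehand. A minimal sketch of that fixed-width encoding (an illustrative stand-in, not calibre's actual to_base):

    import string

    # Digit table; base 32 only ever indexes the first 32 entries (0-9, A-V).
    DIGITS = string.digits + string.ascii_uppercase

    def to_base32(num, min_num_digits=10):
        # Illustrative stand-in for calibre.ebooks.mobi.utils.to_base(num,
        # base=32, min_num_digits=...): render num in base 32, zero-padded
        # to a fixed width so that later in-place patching never changes
        # the length of the surrounding markup.
        out = []
        while num:
            num, rem = divmod(num, 32)
            out.append(DIGITS[rem])
        return ''.join(reversed(out)).rjust(min_num_digits, '0')

    placeholder = 'kindle:pos:fid:0000:off:%s' % to_base32(1)
    print(placeholder)        # kindle:pos:fid:0000:off:0000000001
    print(len(to_base32(1)))  # 10, the same width for every link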