merge from trunk

This commit is contained in:
Lee 2012-04-21 00:23:20 +08:00
commit 7a0f6ec510
3 changed files with 83 additions and 59 deletions

View File

@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe):
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Title says it all' description = 'Title says it all'
publisher = "The Philosophers' Magazine" publisher = "The Philosophers' Magazine"
recipe_disabled = ('This recipe has been disabled as the website has'
' started providing articles only in PDF form')
category = 'philosophy, news' category = 'philosophy, news'
oldest_article = 25 oldest_article = 25
max_articles_per_feed = 200 max_articles_per_feed = 200

View File

@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en'
import copy import copy
from functools import partial from functools import partial
from collections import defaultdict from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack
import cssutils import cssutils
from lxml import etree from lxml import etree
from calibre import isbytestring, force_unicode from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.utils import create_text_record, to_base
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize) extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.oeb.parse_utils import barename
@ -31,11 +34,14 @@ class KF8Writer(object):
def __init__(self, oeb, opts, resources): def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.compress = not self.opts.dont_compress
self.log.info('Creating KF8 output') self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
self.dup_data() self.dup_data()
self.flows = [None] # First flow item is reserved for the text self.flows = [None] # First flow item is reserved for the text
self.records = []
self.fdst_table = []
self.replace_resource_links() self.replace_resource_links()
self.extract_css_into_flows() self.extract_css_into_flows()
@ -43,6 +49,10 @@ class KF8Writer(object):
self.replace_internal_links_with_placeholders() self.replace_internal_links_with_placeholders()
self.insert_aid_attributes() self.insert_aid_attributes()
self.chunk_it_up() self.chunk_it_up()
# Dump the cloned data as it is no longer needed
del self._data_cache
self.create_text_records()
self.create_fdst_table()
def dup_data(self): def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only ''' Duplicate data so that any changes we make to markup/CSS only
@ -165,7 +175,7 @@ class KF8Writer(object):
self.link_map = {} self.link_map = {}
count = 0 count = 0
hrefs = {item.href for item in self.oeb.spine} hrefs = {item.href for item in self.oeb.spine}
for i, item in enumerate(self.oeb.spine): for item in self.oeb.spine:
root = self.data(item) root = self.data(item)
for a in XPath('//h:a[@href]')(root): for a in XPath('//h:a[@href]')(root):
@ -174,8 +184,7 @@ class KF8Writer(object):
href, _, frag = ref.partition('#') href, _, frag = ref.partition('#')
href = urlnormalize(href) href = urlnormalize(href)
if href in hrefs: if href in hrefs:
placeholder = 'kindle:pos:fid:%04d:off:%s'%(i, placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
to_href(count))
self.link_map[placeholder] = (href, frag) self.link_map[placeholder] = (href, frag)
a.set('href', placeholder) a.set('href', placeholder)
@ -191,9 +200,9 @@ class KF8Writer(object):
aid = aidbase + j aid = aidbase + j
tag.attrib['aid'] = to_base(aid, base=32) tag.attrib['aid'] = to_base(aid, base=32)
if tag.tag == XHTML('body'): if tag.tag == XHTML('body'):
self.id_map[(item.href, '')] = tag.attrib['aid'] self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
if id_ is not None: if id_ is not None:
self.id_map[(item.href, id_)] = tag.attrib['aid'] self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
j += 1 j += 1
@ -205,12 +214,47 @@ class KF8Writer(object):
if aid is None: if aid is None:
aid = self.id_map.get((href, '')) aid = self.id_map.get((href, ''))
placeholder_map[placeholder] = aid placeholder_map[placeholder] = aid
chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, chunker = Chunker(self.oeb, self.data, placeholder_map)
placeholder_map)
for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
'last_text_record_idx', 'first_non_text_record_idx',
'text_length'):
setattr(self, x, getattr(chunker, x)) setattr(self, x, getattr(chunker, x))
self.flows[0] = chunker.text
def create_text_records(self):
    ''' Split the concatenated flows into text records, appending each
    record to self.records.

    Sets self.text_length to the total byte length of all flows and
    records the index bookkeeping in self.last_text_record_idx and
    self.first_non_text_record_idx. Unicode flows are encoded as UTF-8
    in place (self.flows is replaced with the encoded list). '''
    self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
            in self.flows]
    text = b''.join(self.flows)
    self.text_length = len(text)
    text = BytesIO(text)
    nrecords = 0
    records_size = 0

    if self.compress:
        self.oeb.logger.info('  Compressing markup content...')

    while text.tell() < self.text_length:
        data, overlap = create_text_record(text)
        if self.compress:
            data = compress_doc(data)

        # The overlap bytes are stored uncompressed after the record
        # data, followed by a single byte holding the overlap length
        data += overlap
        data += pack(b'>B', len(overlap))

        self.records.append(data)
        records_size += len(data)
        nrecords += 1

    self.last_text_record_idx = nrecords
    self.first_non_text_record_idx = nrecords + 1
    # Pad so that the next record starts at a 4 byte boundary. The number
    # of padding bytes needed is the distance to the next multiple of 4,
    # not the remainder itself (remainders of 1 and 3 would otherwise
    # leave the total misaligned).
    remainder = records_size % 4
    if remainder != 0:
        self.records.append(b'\x00'*(4 - remainder))
        self.first_non_text_record_idx += 1
def create_fdst_table(self):
    ''' Append one (start, end) entry per flow to self.fdst_table.

    Entries are contiguous: the first flow starts at offset 0 and each
    later flow starts where the previous one ended. The end offset is
    the start plus the length of the flow. '''
    FDST = namedtuple('Flow', 'start end')
    offset = 0
    for flow in self.flows:
        end = offset + len(flow)
        self.fdst_table.append(FDST(offset, end))
        # The next flow begins exactly where this one stops
        offset = end

View File

@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en'
import re import re
from collections import namedtuple from collections import namedtuple
from io import BytesIO
from struct import pack
from functools import partial from functools import partial
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS from calibre.ebooks.oeb.base import XHTML_NS
from calibre.constants import ispy3 from calibre.constants import ispy3
from calibre.ebooks.mobi.utils import create_text_record, to_base from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.compression.palmdoc import compress_doc
CHUNK_SIZE = 8192 CHUNK_SIZE = 8192
@ -159,17 +156,16 @@ class Skeleton(object):
class Chunker(object): class Chunker(object):
def __init__(self, oeb, data_func, compress, placeholder_map): def __init__(self, oeb, data_func, placeholder_map):
self.oeb, self.log = oeb, oeb.log self.oeb, self.log = oeb, oeb.log
self.data = data_func self.data = data_func
self.compress = compress
self.placeholder_map = placeholder_map self.placeholder_map = placeholder_map
self.skeletons = [] self.skeletons = []
# Set this to a list to enable dumping of the original and rebuilt # Set this to a list to enable dumping of the original and rebuilt
# html files for debugging # html files for debugging
self.orig_dumps = None orig_dumps = None
for i, item in enumerate(self.oeb.spine): for i, item in enumerate(self.oeb.spine):
root = self.remove_namespaces(self.data(item)) root = self.remove_namespaces(self.data(item))
@ -197,8 +193,8 @@ class Chunker(object):
# for all chunks # for all chunks
self.skeletons.append(Skeleton(i, item, root, chunks)) self.skeletons.append(Skeleton(i, item, root, chunks))
if self.orig_dumps: if orig_dumps:
self.dump() self.dump(orig_dumps)
# Create the SKEL and Chunk tables # Create the SKEL and Chunk tables
self.skel_table = [] self.skel_table = []
@ -207,11 +203,7 @@ class Chunker(object):
# Set internal links # Set internal links
text = b''.join(x.raw_text for x in self.skeletons) text = b''.join(x.raw_text for x in self.skeletons)
text = self.set_internal_links(text) self.text = self.set_internal_links(text)
# Create text records
self.records = []
self.create_text_records(text)
def remove_namespaces(self, root): def remove_namespaces(self, root):
lang = None lang = None
@ -349,7 +341,12 @@ class Chunker(object):
for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text): for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
aid_map[match.group(1)] = match.start() aid_map[match.group(1)] = match.start()
self.aid_offset_map = aid_map self.aid_offset_map = aid_map
placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
def to_placeholder(x):
file_number, aid = x
return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
placeholder_map = {bytes(k):to_placeholder(v) for k, v in
self.placeholder_map.iteritems()} self.placeholder_map.iteritems()}
# Now update the links # Now update the links
@ -357,42 +354,14 @@ class Chunker(object):
raw = match.group() raw = match.group()
pl = match.group(1) pl = match.group(1)
try: try:
return raw[:-10] + placeholder_map[pl] return raw[:-15] + placeholder_map[pl]
except KeyError: except KeyError:
pass pass
return raw return raw
return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
def create_text_records(self, text): def dump(self, orig_dumps):
self.text_length = len(text)
text = BytesIO(text)
nrecords = 0
records_size = 0
if self.compress:
self.oeb.logger.info(' Compressing markup content...')
while text.tell() < self.text_length:
data, overlap = create_text_record(text)
if self.compress:
data = compress_doc(data)
data += overlap
data += pack(b'>B', len(overlap))
self.records.append(data)
records_size += len(data)
nrecords += 1
self.last_text_record_idx = nrecords
self.first_non_text_record_idx = nrecords + 1
# Pad so that the next records starts at a 4 byte boundary
if records_size % 4 != 0:
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
def dump(self):
import tempfile, shutil, os import tempfile, shutil, os
tdir = os.path.join(tempfile.gettempdir(), 'skeleton') tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
self.log('Skeletons dumped to:', tdir) self.log('Skeletons dumped to:', tdir)
@ -402,10 +371,19 @@ class Chunker(object):
rebuilt = os.path.join(tdir, 'rebuilt') rebuilt = os.path.join(tdir, 'rebuilt')
for x in (orig, rebuilt): for x in (orig, rebuilt):
os.makedirs(x) os.makedirs(x)
error = False
for i, skeleton in enumerate(self.skeletons): for i, skeleton in enumerate(self.skeletons):
oraw, rraw = orig_dumps[i], skeleton.rebuild()
with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: with open(os.path.join(orig, '%04d.html'%i), 'wb') as f:
f.write(self.orig_dumps[i]) f.write(oraw)
with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
f.write(skeleton.rebuild()) f.write(rraw)
if oraw != rraw:
error = True
if error:
raise ValueError('The before and after HTML differs. Run a diff '
'tool on the orig and rebuilt directories')
else:
self.log('Skeleton HTML before and after is identical.')