Date: Sat, 21 Apr 2012 00:59:30 +0800
Subject: [PATCH 25/37] fix the pattern in preprocess
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c526cba8a9..16acaad383 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
From 081897ae5723958830db099240dd461c521b822f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 20 Apr 2012 22:39:32 +0530
Subject: [PATCH 26/37] KF8 Output: Start work on the index layer
---
src/calibre/ebooks/mobi/utils.py | 46 +++++++++++++
src/calibre/ebooks/mobi/writer2/indexer.py | 49 +++-----------
src/calibre/ebooks/mobi/writer8/index.py | 78 ++++++++++++++++++++++
3 files changed, 132 insertions(+), 41 deletions(-)
create mode 100644 src/calibre/ebooks/mobi/writer8/index.py
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index fe5cd7eaf2..319af30f86 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import struct, string, imghdr, zlib, os
from collections import OrderedDict
+from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize
@@ -549,3 +550,48 @@ def create_text_record(text):
return data, overlap
+class CNCX(object): # {{{
+
+ '''
+ Create the CNCX records. These are records containing all the strings from
+ an index. Each record is of the form:
+ '''
+
+ MAX_STRING_LENGTH = 500
+
+ def __init__(self, strings=()):
+ self.strings = OrderedDict((s, 0) for s in strings)
+
+ self.records = []
+ offset = 0
+ buf = BytesIO()
+ for key in tuple(self.strings.iterkeys()):
+ utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
+ l = len(utf8)
+ sz_bytes = encint(l)
+ raw = sz_bytes + utf8
+ if 0xfbf8 - buf.tell() < 6 + len(raw):
+ # Records in PDB files cannot be larger than 0x10000, so we
+ # stop well before that.
+ pad = 0xfbf8 - buf.tell()
+ buf.write(b'\0' * pad)
+ self.records.append(buf.getvalue())
+ buf.truncate(0)
+ offset = len(self.records) * 0x10000
+ buf.write(raw)
+ self.strings[key] = offset
+ offset += len(raw)
+
+ self.records.append(align_block(buf.getvalue()))
+
+ def __getitem__(self, string):
+ return self.strings[string]
+
+ def __bool__(self):
+ return bool(self.records)
+ __nonzero__ = __bool__
+
+# }}}
+
+
diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py
index 134fbadc60..be926a80a0 100644
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -13,54 +13,21 @@ from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
- encode_tbs, align_block, utf8_text, RECORD_SIZE)
+ encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
-class CNCX(object): # {{{
-
- '''
- Create the CNCX records. These are records containing all the strings from
- the NCX. Each record is of the form:
- '''
-
- MAX_STRING_LENGTH = 500
+class CNCX(CNCX_): # {{{
def __init__(self, toc, is_periodical):
- self.strings = OrderedDict()
-
+ strings = []
for item in toc.iterdescendants(breadth_first=True):
- self.strings[item.title] = 0
+ strings.append(item.title)
if is_periodical:
- self.strings[item.klass] = 0
+ strings.append(item.klass)
if item.author:
- self.strings[item.author] = 0
+ strings.append(item.author)
if item.description:
- self.strings[item.description] = 0
-
- self.records = []
- offset = 0
- buf = StringIO()
- for key in tuple(self.strings.iterkeys()):
- utf8 = utf8_text(key[:self.MAX_STRING_LENGTH])
- l = len(utf8)
- sz_bytes = encint(l)
- raw = sz_bytes + utf8
- if 0xfbf8 - buf.tell() < 6 + len(raw):
- # Records in PDB files cannot be larger than 0x10000, so we
- # stop well before that.
- pad = 0xfbf8 - buf.tell()
- buf.write(b'\0' * pad)
- self.records.append(buf.getvalue())
- buf.truncate(0)
- offset = len(self.records) * 0x10000
- buf.write(raw)
- self.strings[key] = offset
- offset += len(raw)
-
- self.records.append(align_block(buf.getvalue()))
-
- def __getitem__(self, string):
- return self.strings[string]
+ strings.append(item.description)
+ CNCX_.__init__(self, strings)
# }}}
class TAGX(object): # {{{
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
new file mode 100644
index 0000000000..a2b991a612
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+from future_builtins import map
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from collections import namedtuple
+from struct import pack
+
+from calibre.ebooks.mobi.utils import CNCX
+
+TagMeta = namedtuple('TagMeta',
+ 'name number values_per_entry bitmask end_flag')
+EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+
+class Index(object):
+
+ control_byte_count = 1
+ cncx = CNCX()
+ tag_types = (EndTagTable,)
+
+ @classmethod
+ def generate_tagx(cls):
+ header = b'TAGX'
+ byts = bytearray()
+ for tag_meta in cls.tag_types:
+ byts.extend(tag_meta[1:])
+ # table length, control byte count
+ header += pack(b'>II', 12+len(byts), cls.control_byte_count)
+ return header + bytes(byts)
+
+class SkelIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('chunk_count', 1, 1, 3, 0),
+ ('geometry', 6, 2, 12, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, skel_table):
+ self.entries = [
+ (s.name, {
+ # Dont ask me why these entries have to be repeated twice
+ 'chunk_count':(s.chunk_count, s.chunk_count),
+ 'geometry':(s.start_pos, s.length, s.start_pos, s.length),
+ }) for s in skel_table
+ ]
+
+
+class ChunkIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('cncx_offset', 2, 1, 1, 0),
+ ('file_number', 3, 1, 2, 0),
+ ('sequence_number', 4, 1, 4, 0),
+ ('geometry', 6, 2, 8, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, chunk_table):
+ self.cncx = CNCX(c.selector for c in chunk_table)
+
+ self.entries = [
+ ('%010d'%c.insert_pos, {
+
+ 'cncx_offset':self.cncx[c.selector],
+ 'file_number':c.file_number,
+ 'sequence_number':c.sequence_number,
+ 'geometry':(c.start_pos, c.length),
+ }) for s in chunk_table
+ ]
+
+
+
From 22ee4152416a98e84a587f9fcf1a1f5aa52f4960 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 20 Apr 2012 23:32:53 +0530
Subject: [PATCH 27/37] ...
---
src/calibre/ebooks/mobi/writer8/index.py | 55 ++++++++++++++++++++++--
1 file changed, 52 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index a2b991a612..1ee20857fb 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -10,13 +10,20 @@ __docformat__ = 'restructuredtext en'
from collections import namedtuple
from struct import pack
+from io import BytesIO
-from calibre.ebooks.mobi.utils import CNCX
+from calibre.ebooks.mobi.utils import CNCX, encint
TagMeta = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag')
EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
+# could also be extended to 4 bit wide ones as well
+mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
+ 128:7, 192: 6 }
+
+
class Index(object):
control_byte_count = 1
@@ -33,6 +40,50 @@ class Index(object):
header += pack(b'>II', 12+len(byts), cls.control_byte_count)
return header + bytes(byts)
+ @classmethod
+ def calculate_control_bytes_for_each_entry(cls, entries):
+ control_bytes = []
+ for lead_text, tags in entries:
+ cbs = []
+ ans = 0
+ for (name, number, vpe, mask, endi) in cls.tag_types:
+ if endi == 1:
+ cbs.append(ans)
+ ans = 0
+ continue
+ nvals = len(tags.get(name, ()))
+ nentries = nvals // vpe
+ shifts = mask_to_bit_shifts[mask]
+ ans |= mask & (nentries << shifts)
+ if len(cbs) != cls.control_byte_count:
+ raise ValueError('The entry %r is invalid'%[lead_text, tags])
+ control_bytes.append(cbs)
+ return control_bytes
+
+ def build_records(self):
+ self.control_bytes = self.calculate_control_bytes_for_each_entry(
+ self.entries)
+
+ self.rendered_entries = []
+ offset = 0
+ IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+ for i, x in enumerate(self.entries):
+ control_bytes = self.control_bytes[i]
+ leading_text, tags = x
+ buf = BytesIO()
+ raw = bytearray(leading_text)
+ raw.insert(0, len(leading_text))
+ buf.write(bytes(raw))
+ buf.write(control_bytes)
+ for tag in self.tag_types:
+ values = tags.get(tag.name, None)
+ if values:
+ for val in values:
+ buf.write(encint(val))
+ raw = buf.getvalue()
+ self.rendered_entries.append(IndexEntry(offset, len(raw), raw))
+ offset += len(raw)
+
class SkelIndex(Index):
tag_types = tuple(map(TagMeta, (
@@ -74,5 +125,3 @@ class ChunkIndex(Index):
}) for s in chunk_table
]
-
-
From 5d3e24e1053e6078dfe3a7e9a0fe135baeb69286 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Apr 2012 07:50:27 +0530
Subject: [PATCH 28/37] Fix set_internal_links()
---
src/calibre/ebooks/mobi/writer8/main.py | 4 +-
src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +++++++++++++++------
2 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 430d695fd1..955fbab460 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -200,9 +200,9 @@ class KF8Writer(object):
aid = aidbase + j
tag.attrib['aid'] = to_base(aid, base=32)
if tag.tag == XHTML('body'):
- self.id_map[(item.href, '')] = (i, tag.attrib['aid'])
+ self.id_map[(item.href, '')] = tag.attrib['aid']
if id_ is not None:
- self.id_map[(item.href, id_)] = (i, tag.attrib['aid'])
+ self.id_map[(item.href, id_)] = tag.attrib['aid']
j += 1
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index d04f119316..4da540cac6 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -172,11 +172,11 @@ class Chunker(object):
body = root.xpath('//body')[0]
body.tail = '\n'
- if self.orig_dumps is not None:
- self.orig_dumps.append(tostring(root, xml_declaration=True,
+ if orig_dumps is not None:
+ orig_dumps.append(tostring(root, xml_declaration=True,
with_tail=True))
- self.orig_dumps[-1] = close_self_closing_tags(
- self.orig_dumps[-1].replace(b']+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
- aid_map[match.group(1)] = match.start()
+ offset = match.start()
+ pos_fid = None
+ for chunk in self.chunk_table:
+ if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length:
+ pos_fid = (chunk.sequence_number, offset-chunk.insert_pos)
+ break
+ if chunk.insert_pos > offset:
+ # This aid is in the skeleton, not in a chunk, so we use
+ # the chunk immediately after
+ pos_fid = (chunk.sequence_number, 0)
+ break
+ if pos_fid is None:
+ raise ValueError('Could not find chunk for aid: %r'%
+ match.group(1))
+ aid_map[match.group(1)] = (to_base(chunk.sequence_number,
+ base=32, min_num_digits=4),
+ to_href(offset-chunk.insert_pos))
+
self.aid_offset_map = aid_map
- def to_placeholder(x):
- file_number, aid = x
- return bytes('%04d:%s'%(file_number, to_href(aid_map[aid])))
+ def to_placeholder(aid):
+ return bytes(':'.join(aid_map[aid]))
placeholder_map = {bytes(k):to_placeholder(v) for k, v in
self.placeholder_map.iteritems()}
@@ -359,7 +379,7 @@ class Chunker(object):
pass
return raw
- return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
+ return re.sub(br'<[^>]+(kindle:pos:fid:0000:\d{10})', sub, text)
def dump(self, orig_dumps):
import tempfile, shutil, os
From 5c72ad513b982741b6dc0777d89cda837f9566a8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Apr 2012 07:52:27 +0530
Subject: [PATCH 29/37] ...
---
src/calibre/ebooks/mobi/writer8/skeleton.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 4da540cac6..8f0a3795db 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -336,7 +336,9 @@ class Chunker(object):
num += 1
def set_internal_links(self, text):
- # A kindle pos:fid link contains two base 32 numbers of the form
+ ''' Update the internal link placeholders to point to the correct
+ location, based on the chunk table.'''
+ # A kindle:pos:fid link contains two base 32 numbers of the form
# XXXX:YYYYYYYYYY
# The first number is an index into the chunk table and the second is
# an offset from the start of the chunk to the start of the tag pointed
From 9ab4ff1840a7b3735a6e94e4c1465295285bfc4f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Apr 2012 11:15:31 +0530
Subject: [PATCH 30/37] A nice framework for generating MOBI header records
---
.../ebooks/conversion/plugins/mobi_output.py | 2 +-
src/calibre/ebooks/mobi/debug/index.py | 4 +-
src/calibre/ebooks/mobi/debug/mobi8.py | 2 +-
src/calibre/ebooks/mobi/utils.py | 7 +-
src/calibre/ebooks/mobi/writer8/header.py | 77 +++++++++++
src/calibre/ebooks/mobi/writer8/index.py | 125 +++++++++++++++++-
6 files changed, 206 insertions(+), 11 deletions(-)
create mode 100644 src/calibre/ebooks/mobi/writer8/header.py
diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py
index 89ab91f8eb..971d11df3b 100644
--- a/src/calibre/ebooks/conversion/plugins/mobi_output.py
+++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py
@@ -169,6 +169,7 @@ class MOBIOutput(OutputFormatPlugin):
self.remove_html_cover()
resources = Resources(oeb, opts, self.is_periodical,
add_fonts=create_kf8)
+ self.check_for_periodical()
kf8 = self.create_kf8(resources) if create_kf8 else None
@@ -203,7 +204,6 @@ class MOBIOutput(OutputFormatPlugin):
resources.add_extra_images()
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
- self.check_for_periodical()
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer2.main import MobiWriter
writer = MobiWriter(opts, resources, kf8,
diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py
index 1af1611918..94f252e231 100644
--- a/src/calibre/ebooks/mobi/debug/index.py
+++ b/src/calibre/ebooks/mobi/debug/index.py
@@ -17,7 +17,7 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry)
File = namedtuple('File',
'file_number name divtbl_count start_position length')
-Elem = namedtuple('Elem',
+Elem = namedtuple('Chunk',
'insert_pos toc_text file_number sequence_number start_pos '
'length')
@@ -110,7 +110,7 @@ class SECTIndex(Index):
for i, text in enumerate(self.table.iterkeys()):
tag_map = self.table[text]
if set(tag_map.iterkeys()) != {2, 3, 4, 6}:
- raise ValueError('SECT Index has unknown tags: %s'%
+ raise ValueError('Chunk Index has unknown tags: %s'%
(set(tag_map.iterkeys())-{2, 3, 4, 6}))
toc_text = self.cncx[tag_map[2][0]]
diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py
index 1c61690d42..e3e26af0b1 100644
--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -198,7 +198,7 @@ def inspect_mobi(mobi_file, ddir):
with open(os.path.join(ddir, 'skel.record'), 'wb') as fo:
fo.write(str(f.skel_index).encode('utf-8'))
- with open(os.path.join(ddir, 'sect.record'), 'wb') as fo:
+ with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo:
fo.write(str(f.sect_index).encode('utf-8'))
with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo:
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index 319af30f86..aa59ee2217 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -583,7 +583,9 @@ class CNCX(object): # {{{
self.strings[key] = offset
offset += len(raw)
- self.records.append(align_block(buf.getvalue()))
+ val = buf.getvalue()
+ if val:
+ self.records.append(align_block(val))
def __getitem__(self, string):
return self.strings[string]
@@ -592,6 +594,9 @@ class CNCX(object): # {{{
return bool(self.records)
__nonzero__ = __bool__
+ def __len__(self):
+ return len(self.records)
+
# }}}
diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py
new file mode 100644
index 0000000000..31571d0f5f
--- /dev/null
+++ b/src/calibre/ebooks/mobi/writer8/header.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+from io import BytesIO
+from collections import OrderedDict
+from struct import pack
+
+from calibre.ebooks.mobi.utils import align_block
+
+NULL = 0xffffffff
+zeroes = lambda x: b'\0'*x
+nulls = lambda x: b'\xff'*x
+
+class Header(OrderedDict):
+
+ HEADER_NAME = b''
+
+ DEFINITION = '''
+ '''
+
+ ALIGN_BLOCK = False
+ POSITIONS = {}
+
+ def __init__(self):
+ OrderedDict.__init__(self)
+
+ for line in self.DEFINITION.splitlines():
+ line = line.strip()
+ if not line or line.startswith('#'): continue
+ name, val = [x.strip() for x in line.partition('=')[0::2]]
+ if val:
+ val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None,
+ 'nulls':nulls})
+ else:
+ val = 0
+ if name in self:
+ raise ValueError('Duplicate field in definition: %r'%name)
+ self[name] = val
+
+ def __call__(self, **kwargs):
+ positions = {}
+ for name, val in kwargs.iteritems():
+ if name not in self:
+ raise KeyError('Not a valid header field: %r'%name)
+ self[name] = val
+
+ buf = BytesIO()
+ buf.write(bytes(self.HEADER_NAME))
+ for name, val in self.iteritems():
+ val = self.format_value(name, val)
+ positions[name] = buf.tell()
+ if val is None:
+ raise ValueError('Dynamic field %r not set'%name)
+ if isinstance(val, (int, long)):
+ val = pack(b'>I', val)
+ buf.write(val)
+
+ for pos_field, field in self.POSITIONS.iteritems():
+ buf.seek(positions[pos_field])
+ buf.write(pack(b'>I', positions[field]))
+
+ ans = buf.getvalue()
+ if self.ALIGN_BLOCK:
+ ans = align_block(ans)
+ return ans
+
+
+ def format_value(self, name, val):
+ return val
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index 1ee20857fb..153e140b06 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -12,7 +12,8 @@ from collections import namedtuple
from struct import pack
from io import BytesIO
-from calibre.ebooks.mobi.utils import CNCX, encint
+from calibre.ebooks.mobi.utils import CNCX, encint, align_block
+from calibre.ebooks.mobi.writer8.header import Header
TagMeta = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag')
@@ -23,13 +24,79 @@ EndTagTable = TagMeta('eof', 0, 0, 0, 1)
mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
128:7, 192: 6 }
+class IndexHeader(Header): # {{{
-class Index(object):
+ HEADER_NAME = b'INDX'
+ ALIGN_BLOCK = True
+ HEADER_LENGTH = 192
+
+ DEFINITION = '''
+ # 4 - 8: Header Length
+ header_length = {header_length}
+
+ # 8 - 16: Unknown
+ unknown1 = zeroes(8)
+
+ # 16 - 20: Index type: 0 - normal 2 - inflection
+ type = 2
+
+ # 20 - 24: IDXT offset (filled in later)
+ idxt_offset
+
+ # 24 - 28: Number of index records
+ num_of_records = 1
+
+ # 28 - 32: Index encoding (65001 = utf-8)
+ encoding = 65001
+
+ # 32 - 36: Unknown
+ unknown2 = NULL
+
+ # 36 - 40: Number of Index entries
+ num_of_entries = DYN
+
+ # 40 - 44: ORDT offset
+ ordt_offset
+
+ # 44 - 48: LIGT offset
+ ligt_offset
+
+ # 48 - 52: Number of ORDT/LIGT? entries
+ num_of_ordt_entries
+
+ # 52 - 56: Number of CNCX records
+ num_of_cncx = DYN
+
+ # 56 - 180: Unknown
+ unknown3 = zeroes(124)
+
+ # 180 - 184: TAGX offset
+ tagx_offset = {header_length}
+
+ # 184 - 192: Unknown
+ unknown4 = zeroes(8)
+
+ # TAGX
+ tagx = DYN
+
+ # Last Index entry
+ last_index = DYN
+
+ # IDXT
+ idxt = DYN
+ '''.format(header_length=HEADER_LENGTH)
+
+ POSITIONS = {'idxt_offset':'idxt'}
+# }}}
+
+class Index(object): # {{{
control_byte_count = 1
cncx = CNCX()
tag_types = (EndTagTable,)
+ HEADER_LENGTH = IndexHeader.HEADER_LENGTH
+
@classmethod
def generate_tagx(cls):
header = b'TAGX'
@@ -60,17 +127,18 @@ class Index(object):
control_bytes.append(cbs)
return control_bytes
- def build_records(self):
+ def __call__(self):
self.control_bytes = self.calculate_control_bytes_for_each_entry(
self.entries)
- self.rendered_entries = []
+ rendered_entries = []
offset = 0
+ index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
IndexEntry = namedtuple('IndexEntry', 'offset length raw')
for i, x in enumerate(self.entries):
control_bytes = self.control_bytes[i]
leading_text, tags = x
- buf = BytesIO()
+ buf.truncate(0)
raw = bytearray(leading_text)
raw.insert(0, len(leading_text))
buf.write(bytes(raw))
@@ -81,8 +149,53 @@ class Index(object):
for val in values:
buf.write(encint(val))
raw = buf.getvalue()
- self.rendered_entries.append(IndexEntry(offset, len(raw), raw))
+ rendered_entries.append(IndexEntry(offset, len(raw), raw))
+ idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
offset += len(raw)
+ index.write(raw)
+
+ index_block = align_block(index.getvalue())
+ idxt_block = align_block(b'IDXT' + idxt.getvalue())
+ body = index_block + idxt_block
+ if len(body) + self.HEADER_LENGTH >= 0x10000:
+ raise ValueError('Index has too many entries, calibre does not'
+ ' support generating multiple index records at this'
+ ' time.')
+
+ header = b'INDX'
+ buf.truncate(0)
+ buf.write(pack(b'>I', self.HEADER_LENGTH))
+ buf.write(b'\0'*4) # Unknown
+ buf.write(pack(b'>I', 1)) # Header type? Or index record number?
+ buf.write(b'\0'*4) # Unknown
+
+ # IDXT block offset
+ buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
+
+ # Number of index entries
+ buf.write(pack(b'>I', len(rendered_entries)))
+
+ buf.write(b'\xff'*8) # Unknown
+
+ buf.write(b'\0'*156) # Unknown
+
+ header += buf.getvalue()
+ index_record = header + body
+
+ tagx = self.generate_tagx()
+ idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
+ b'\0')
+ header = {
+ 'num_of_entries': len(rendered_entries),
+ 'num_of_cncx': len(self.cncx),
+ 'tagx':tagx,
+ 'idxt':idxt
+ }
+ header = IndexHeader()(**header)
+ self.records = [header, index_record]
+ self.records.extend(self.cncx.records)
+ return self.records
+# }}}
class SkelIndex(Index):
From 687586f9a16f55b7c675690e130c7a61be145f7e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Apr 2012 13:20:14 +0530
Subject: [PATCH 31/37] ...
---
src/calibre/manual/faq.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index a248962abd..f0d9aa8bd3 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -29,7 +29,7 @@ It can convert every input format in the following list, to every output format.
PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers.
PDB is also a generic format. |app| supports eReder, Plucker, PML and zTxt PDB files.
DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software.
- MOBI books can be of two types Mobi6 and KF8. |app| currently fully supports Mobi6 and supports conversion from, but not to, KF8
+ MOBI books can be of two types Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions
.. _best-source-formats:
From 0db1fcb10396f81e7a1bbf13e7900d125eeeb88d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 21 Apr 2012 23:38:52 +0530
Subject: [PATCH 32/37] Fix #986598 (New recipe for News agency Telam)
---
recipes/icons/telam.png | Bin 0 -> 1992 bytes
recipes/telam.recipe | 62 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+)
create mode 100644 recipes/icons/telam.png
create mode 100644 recipes/telam.recipe
diff --git a/recipes/icons/telam.png b/recipes/icons/telam.png
new file mode 100644
index 0000000000000000000000000000000000000000..f86dcc1dbf16db5e41411f0dd20c4af478336877
GIT binary patch
literal 1992
zcmV;(2RHbMP)t)QrX=s)%^L5e6e
zVnBo6IDE|Jqk9#@q
ze9mwCo*xuKfd8ez|35%3NC03A$aN4ffK&|u1mS-HkO<)gc)GSQ>-pCz*kqUjTGT+^
zNfom8X#Y@mT7&l^-5Vf3x<38mU(dZ&ZO?~3@x;}b7P0i8j4lTMMWmfJI2cyik(
zyBzD|0p!8ooGHKk?upY;He}73z6wHbkoq@G-HLD%lb)#IA(rN#Cl1tn?nj?~Y|F&D
zk8<<}|{$bB4NPdHK)04ZgD<3TN)+`04V
z&+ND-V+9}(;*GP{UYeS6oq-ucje~H^Bzt=LH#&MpE~6^8TuK?f+$>)y-8eUO`GSBv
zR1D;$HSGtV+V<7QxBn|+1>oFF>G@+lYCbq|1izW(?5A0IjP)Wpzt
zKJl>5S;-|=IH`r&g}5~Bb;jje9b7~_R-?89Svn2BIJ*Av@nP8zK^W?ev;X&R{qoqU
zuRXT)+1*dv9jIkMsU80O;L!&k=$hQV@o#hWw_B~tHF$pDmhgix*7c)&H
z>wEQv7lME0brC>0Z#2BlV($xoIs9@fnpgBST4XW9r|m>rXn2aEatr1x#3IQbTZ^J2&~hySDiqJ;xhz(QXa_!uTTy?r;wrHS
zSgzy?jan^_K|6A(9a2ptgvvxn@3GD!K(zvpG7bR>7DXK;Ntm${S;KG?nP`8b|C|U@BNJ
za&TNHWKk-FDC@k}P|}IK>y%UFifX_n*P$WzOBQ>VsYAI3G?}C&fDF}0PuF0^(J5^s
zkd9p@i%G;hL|MOjk?Ng(o9b$aF#!ZqT4NBpVfWy$scM{&<1;fkq6;S-Lk=PXYH*z$
zj~8tPe$Z$#6~p<|qr<&^yOP7a0aOBTJ7glJ6%mb*+JffO5QOqtmOTlhEQ_wvWUN;T
zIGQJLwpyu~S!fhg#R5ndf&aM;gQFd(CAC=AUfVKuZC;;AXMh`;j8UlcvbutOf}KjF
zNTKfYJdq?axFC?330L`Z3GnMh;YS{tRfQQZWWq$yxXyweH6GcpK7wHX$xBU?13aC`aDl);
z!802QJ9-N%#BRwdtYa&L~2#lg4XC#@FPm
z>5DfGTx}GyIWX7_U65D3@AlGX)(xntvO?_cSYM&|{m(x++R>#(F$^e^V%OpbL@{Ht
zROump8)|Bjs5tz>mI(xuz5Cyw+^%Me02ro)ck~(Gd}K=|W!$~z3V`%$)9C77P43#X
zaZKoh6YiO^sWjPQ86e6q?-;le`CGe&C&tEqaq!4%wOYGx807~?66<@T-TQVtG?G=o
zO1sC{BEOW}@X+y-uS`vyC{-M!6^P!yC=L%-@w|5Z9SPU
zjr8vt$msh0rivAqL|mFLpS^J5otgQoR~pmYrKCNIw2gzqnS5uRwE8>x)~+63GniRt
zTb+*npUy=eGR~M!Xb_Q*l7(iVB(%39qnf6nDO}aFx+bgf-%pPpW4gYxa%o5Tdf&lR
a+CKo7=wztDq%i#e0000'
+'''
+www.telam.com.ar
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Telam(BasicNewsRecipe):
+ title = 'Telam'
+ __author__ = 'Darko Miletic'
+ description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
+ publisher = 'Telam S.E.'
+ category = 'news, politics, Argentina'
+ oldest_article = 2
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'es_AR'
+ remove_empty_feeds = True
+ publication_type = 'newsportal'
+ masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags = [dict(name=['meta','link'])]
+ remove_tags_before = dict(attrs={'class':'nota_fecha'})
+ remove_tags_after = dict(attrs={'class':'nota_completa'})
+ remove_attributes = ['lang']
+
+
+ feeds = [
+ (u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
+ ,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
+ ,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
+ ,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
+ ,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
+ ,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
+ ,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
+ ,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
+ ,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
+ ,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
+ ]
+
+ def print_version(self, url):
+ artid = url.rpartition('/')[2]
+ return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
From e4a55aae564adfa92bcef668f020982b82a38aab Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 10:17:06 +0530
Subject: [PATCH 33/37] KF8 Output: Create NCX and Guide records
---
src/calibre/ebooks/mobi/utils.py | 4 +
src/calibre/ebooks/mobi/writer2/serializer.py | 5 +-
src/calibre/ebooks/mobi/writer8/index.py | 118 ++++++++++++++++--
src/calibre/ebooks/mobi/writer8/main.py | 99 ++++++++++++++-
src/calibre/ebooks/mobi/writer8/skeleton.py | 8 +-
5 files changed, 208 insertions(+), 26 deletions(-)
diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py
index aa59ee2217..3b8ce61ba8 100644
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -599,4 +599,8 @@ class CNCX(object): # {{{
# }}}
+def is_guide_ref_start(ref):
+ return (ref.title.lower() == 'start' or
+ (ref.type and ref.type.lower() in {'start',
+ 'other.start', 'text'}))
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index d8d63bcff4..2dda657a93 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -12,6 +12,7 @@ import re
from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS,
namespace, prefixname, urlnormalize)
from calibre.ebooks.mobi.mobiml import MBP_NS
+from calibre.ebooks.mobi.utils import is_guide_ref_start
from collections import defaultdict
from urlparse import urldefrag
@@ -161,9 +162,7 @@ class Serializer(object):
buf.write(b'title="')
self.serialize_text(ref.title, quot=True)
buf.write(b'" ')
- if (ref.title.lower() == 'start' or
- (ref.type and ref.type.lower() in {'start',
- 'other.start', 'text'})):
+ if is_guide_ref_start(ref):
self._start_href = ref.href
self.serialize_href(ref.href)
# Space required or won't work, I kid you not
diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py
index 153e140b06..1cf9f02d4b 100644
--- a/src/calibre/ebooks/mobi/writer8/index.py
+++ b/src/calibre/ebooks/mobi/writer8/index.py
@@ -15,9 +15,10 @@ from io import BytesIO
from calibre.ebooks.mobi.utils import CNCX, encint, align_block
from calibre.ebooks.mobi.writer8.header import Header
-TagMeta = namedtuple('TagMeta',
+TagMeta_ = namedtuple('TagMeta',
'name number values_per_entry bitmask end_flag')
-EndTagTable = TagMeta('eof', 0, 0, 0, 1)
+TagMeta = lambda x:TagMeta_(*x)
+EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
# could also be extended to 4 bit wide ones as well
@@ -118,7 +119,10 @@ class Index(object): # {{{
cbs.append(ans)
ans = 0
continue
- nvals = len(tags.get(name, ()))
+ try:
+ nvals = len(tags.get(name, ()))
+ except TypeError:
+ nvals = 1
nentries = nvals // vpe
shifts = mask_to_bit_shifts[mask]
ans |= mask & (nentries << shifts)
@@ -132,36 +136,51 @@ class Index(object): # {{{
self.entries)
rendered_entries = []
- offset = 0
index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
IndexEntry = namedtuple('IndexEntry', 'offset length raw')
+ last_lead_text = b''
+ too_large = ValueError('Index has too many entries, calibre does not'
+ ' support generating multiple index records at this'
+ ' time.')
+
for i, x in enumerate(self.entries):
control_bytes = self.control_bytes[i]
leading_text, tags = x
- buf.truncate(0)
+ buf.seek(0), buf.truncate(0)
+ leading_text = (leading_text.encode('utf-8') if
+ isinstance(leading_text, unicode) else leading_text)
raw = bytearray(leading_text)
raw.insert(0, len(leading_text))
buf.write(bytes(raw))
- buf.write(control_bytes)
+ buf.write(bytes(bytearray(control_bytes)))
for tag in self.tag_types:
values = tags.get(tag.name, None)
+ if values is None: continue
+ try:
+ len(values)
+ except TypeError:
+ values = [values]
if values:
for val in values:
- buf.write(encint(val))
+ try:
+ buf.write(encint(val))
+ except ValueError:
+ raise ValueError('Invalid values for %r: %r'%(
+ tag, values))
raw = buf.getvalue()
+ offset = index.tell()
+ if offset + self.HEADER_LENGTH >= 0x10000:
+ raise too_large
rendered_entries.append(IndexEntry(offset, len(raw), raw))
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
- offset += len(raw)
index.write(raw)
+ last_lead_text = leading_text
index_block = align_block(index.getvalue())
idxt_block = align_block(b'IDXT' + idxt.getvalue())
body = index_block + idxt_block
if len(body) + self.HEADER_LENGTH >= 0x10000:
- raise ValueError('Index has too many entries, calibre does not'
- ' support generating multiple index records at this'
- ' time.')
-
+ raise too_large
header = b'INDX'
buf.truncate(0)
buf.write(pack(b'>I', self.HEADER_LENGTH))
@@ -185,10 +204,15 @@ class Index(object): # {{{
tagx = self.generate_tagx()
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
b'\0')
+ # Last index
+ idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
+ idx += pack(b'>H', len(rendered_entries))
+
header = {
'num_of_entries': len(rendered_entries),
'num_of_cncx': len(self.cncx),
'tagx':tagx,
+ 'last_index':align_block(idx),
'idxt':idxt
}
header = IndexHeader()(**header)
@@ -235,6 +259,74 @@ class ChunkIndex(Index):
'file_number':c.file_number,
'sequence_number':c.sequence_number,
'geometry':(c.start_pos, c.length),
- }) for s in chunk_table
+ }) for c in chunk_table
]
+class GuideIndex(Index):
+
+ tag_types = tuple(map(TagMeta, (
+ ('title', 1, 1, 1, 0),
+ ('pos_fid', 6, 2, 2, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, guide_table):
+ self.cncx = CNCX(c.title for c in guide_table)
+
+ self.entries = [
+ (r.type, {
+
+ 'title':self.cncx[r.title],
+ 'pos_fid':r.pos_fid,
+ }) for r in guide_table
+ ]
+
+
+class NCXIndex(Index):
+
+ control_byte_count = 2
+ tag_types = tuple(map(TagMeta, (
+ ('offset', 1, 1, 1, 0),
+ ('length', 2, 1, 2, 0),
+ ('label', 3, 1, 4, 0),
+ ('depth', 4, 1, 8, 0),
+ ('parent', 21, 1, 16, 0),
+ ('first_child', 22, 1, 32, 0),
+ ('last_child', 23, 1, 64, 0),
+ ('pos_fid', 6, 2, 128, 0),
+ EndTagTable,
+ ('image', 69, 1, 1, 0),
+ ('description', 70, 1, 2, 0),
+ ('author', 71, 1, 4, 0),
+ ('caption', 72, 1, 8, 0),
+ ('attribution', 73, 1, 16, 0),
+ EndTagTable
+ )))
+
+ def __init__(self, toc_table):
+ strings = []
+ for entry in toc_table:
+ strings.append(entry['label'])
+ aut = entry.get('author', None)
+ if aut:
+ strings.append(aut)
+ desc = entry.get('description', None)
+ if desc:
+ strings.append(desc)
+ self.cncx = CNCX(strings)
+
+ def to_entry(x):
+ ans = {}
+ for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
+ 'first_child', 'last_child'):
+ if f in x:
+ ans[f] = x[f]
+ for f in ('label', 'description', 'author'):
+ if f in x:
+ ans[f] = self.cncx[x[f]]
+ return ('%02x'%x['index'], ans)
+
+ self.entries = list(map(to_entry, toc_table))
+
+
+
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 955fbab460..76492cb9a9 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -17,12 +17,15 @@ import cssutils
from lxml import etree
from calibre import isbytestring, force_unicode
-from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.mobi.utils import (create_text_record, to_base,
+ is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
+from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
+ ChunkIndex, GuideIndex)
XML_DOCS = OEB_DOCS | {SVG_MIME}
@@ -38,11 +41,11 @@ class KF8Writer(object):
self.log.info('Creating KF8 output')
self.used_images = set()
self.resources = resources
- self.dup_data()
self.flows = [None] # First flow item is reserved for the text
self.records = []
- self.fdst_table = []
+ self.log('\tGenerating KF8 markup...')
+ self.dup_data()
self.replace_resource_links()
self.extract_css_into_flows()
self.extract_svg_into_flows()
@@ -52,7 +55,10 @@ class KF8Writer(object):
# Dump the cloned data as it is no longer needed
del self._data_cache
self.create_text_records()
- self.create_fdst_table()
+ self.log('\tCreating indices...')
+ self.create_fdst_records()
+ self.create_indices()
+ self.create_guide()
def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only
@@ -231,7 +237,7 @@ class KF8Writer(object):
records_size = 0
if self.compress:
- self.oeb.logger.info(' Compressing markup content...')
+ self.oeb.logger.info('\tCompressing markup...')
while text.tell() < self.text_length:
data, overlap = create_text_record(text)
@@ -252,9 +258,90 @@ class KF8Writer(object):
self.records.append(b'\x00'*(records_size % 4))
self.first_non_text_record_idx += 1
- def create_fdst_table(self):
+ def create_fdst_records(self):
FDST = namedtuple('Flow', 'start end')
+ entries = []
+ self.fdst_table = []
for i, flow in enumerate(self.flows):
start = 0 if i == 0 else self.fdst_table[-1].end
self.fdst_table.append(FDST(start, start + len(flow)))
+ entries.extend(self.fdst_table[-1])
+ rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) +
+ pack(b'>%dL'%len(entries), *entries))
+ self.fdst_records = [rec]
+
+ def create_indices(self):
+ self.skel_records = SkelIndex(self.skel_table)()
+ self.chunk_records = ChunkIndex(self.chunk_table)()
+ self.ncx_records = []
+ toc = self.oeb.toc
+ max_depth = toc.depth()
+ entries = []
+ is_periodical = self.opts.mobi_periodical
+ if toc.count() < 2:
+ self.log.warn('Document has no ToC, MOBI will have no NCX index')
+ return
+
+ # Flatten the ToC into a depth first list
+ fl = toc.iter() if is_periodical else toc.iterdescendants()
+ for i, item in enumerate(fl):
+ entry = {'index':i, 'depth': max_depth - item.depth() - (0 if
+ is_periodical else 1), 'href':item.href, 'label':(item.title or
+ _('Unknown'))}
+ entries.append(entry)
+ for child in item:
+ child.ncx_parent = entry
+ p = getattr(item, 'ncx_parent', None)
+ if p is not None:
+ entry['parent'] = p['index']
+ if is_periodical:
+ if item.author:
+ entry['author'] = item.author
+ if item.description:
+ entry['description'] = item.description
+
+ for entry in entries:
+ children = [e for e in entries if e.get('parent', -1) == entry['index']]
+ if children:
+ entry['first_child'] = children[0]['index']
+ entry['last_child'] = children[-1]['index']
+ href = entry.pop('href')
+ href, frag = href.partition('#')[0::2]
+ aid = self.id_map.get((href, frag), None)
+ if aid is None:
+ aid = self.id_map.get((href, ''), None)
+ if aid is None:
+ pos, fid = 0, 0
+ else:
+ pos, fid = self.aid_offset_map[aid]
+ chunk = self.chunk_table[pos]
+ offset = chunk.insert_pos + fid
+ length = chunk.length
+ entry['pos_fid'] = (pos, fid)
+ entry['offset'] = offset
+ entry['length'] = length
+
+ self.ncx_records = NCXIndex(entries)()
+
+ def create_guide(self):
+ self.start_offset = None
+ self.guide_table = []
+ self.guide_records = []
+ GuideRef = namedtuple('GuideRef', 'title type pos_fid')
+ for ref in self.oeb.guide:
+ ref = self.oeb.guide[ref]
+ href, frag = ref.href.partition('#')[0::2]
+ aid = self.id_map.get((href, frag), None)
+ if aid is None:
+ aid = self.id_map.get((href, ''))
+ if aid is None:
+ continue
+ pos, fid = self.aid_offset_map[aid]
+ if is_guide_ref_start(ref):
+ self.start_offset = pos
+ self.guide_table.append(GuideRef(ref.title or
+ _('Unknown'), ref.type, (pos, fid)))
+
+ if self.guide_table:
+ self.guide_records = GuideIndex(self.guide_table)()
diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py
index 8f0a3795db..398c684e43 100644
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -359,14 +359,14 @@ class Chunker(object):
if pos_fid is None:
raise ValueError('Could not find chunk for aid: %r'%
match.group(1))
- aid_map[match.group(1)] = (to_base(chunk.sequence_number,
- base=32, min_num_digits=4),
- to_href(offset-chunk.insert_pos))
+ aid_map[match.group(1)] = pos_fid
self.aid_offset_map = aid_map
def to_placeholder(aid):
- return bytes(':'.join(aid_map[aid]))
+ pos, fid = aid_map[aid]
+ pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
+ return bytes(':'.join((pos, fid)))
placeholder_map = {bytes(k):to_placeholder(v) for k, v in
self.placeholder_map.iteritems()}
From 3269b8c3611ec68855f60c46c675cde2a4e3dc5e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 10:20:47 +0530
Subject: [PATCH 34/37] ...
---
src/calibre/ebooks/mobi/writer8/main.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index 76492cb9a9..ffc806cb5a 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -337,7 +337,10 @@ class KF8Writer(object):
if aid is None:
continue
pos, fid = self.aid_offset_map[aid]
- if is_guide_ref_start(ref):
+ if is_guide_ref_start(ref) and fid == 0:
+ # If fid != 0 then we cannot represent the start position as a
+ # single number in the EXTH header, so we do not write it to
+ # EXTH
self.start_offset = pos
self.guide_table.append(GuideRef(ref.title or
_('Unknown'), ref.type, (pos, fid)))
From e5e2bfd8f359df52428d000662613bab89b1a621 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 10:34:28 +0530
Subject: [PATCH 35/37] ...
---
src/calibre/ebooks/mobi/writer8/main.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py
index ffc806cb5a..e061da7df6 100644
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -328,8 +328,7 @@ class KF8Writer(object):
self.guide_table = []
self.guide_records = []
GuideRef = namedtuple('GuideRef', 'title type pos_fid')
- for ref in self.oeb.guide:
- ref = self.oeb.guide[ref]
+ for ref in self.oeb.guide.values():
href, frag = ref.href.partition('#')[0::2]
aid = self.id_map.get((href, frag), None)
if aid is None:
From e0002deb1fba920695c88147b415d583ac79f517 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 12:48:29 +0530
Subject: [PATCH 36/37] Sol Haber by Onur Gungor
---
recipes/sol_haber.recipe | 141 +++++++++++++++++++++++++++++++++++++++
1 file changed, 141 insertions(+)
create mode 100644 recipes/sol_haber.recipe
diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe
new file mode 100644
index 0000000000..29db88019c
--- /dev/null
+++ b/recipes/sol_haber.recipe
@@ -0,0 +1,141 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+www.sol.org.tr
+'''
+
+import datetime
+
+import re
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class SolHaberRecipe(BasicNewsRecipe):
+ title = u'soL Haber'
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ language = 'tr'
+ __author__ = 'Onur Güngör'
+ description = 'Hayata soL''dan bakın..'
+ publisher = 'soL Haber'
+ tags = 'news, haberler, siyaset, türkiye, turkey, politics'
+
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : tags
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
+ 'devlet-ve-siyaset':'Devlet ve Siyaset',
+ 'ekonomi':'Ekonomi',
+ 'enternasyonal-gundem':'Enternasyonel Gündem',
+ 'kent-gundemleri':'Kent Gündemleri',
+ 'kultur-sanat':'Kültür Sanat',
+ 'dunyadan':'Dünyadan',
+ 'serbest-kursu':'Serbest Kürsü',
+ 'medya':'Medya',
+ 'liseliler':'Liseliler',
+ 'yazarlar':'Köşe Yazıları'}
+
+ end_date = datetime.date.today().isoformat()
+ start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
+
+
+ section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
+
+
+ # Disable stylesheets from site.
+ no_stylesheets = True
+
+ cover_margins = (20, 20, '#ffffff')
+
+ storybody_reg_exp = '^\s*(haber|kose)\s*$'
+
+ comments_reg_exp = '^\s*makale-elestiri\s*$'
+
+ remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
+
+ keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
+
+ def get_masthead_title(self):
+ return self.title + "(" + self.end_date + ")"
+
+ def parse_index(self):
+
+ result = []
+ articles_dict = dict()
+
+ author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
+ category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+
+ for section_tuple in self.section_tuples:
+
+ section_title = section_tuple[0]
+ section_index_url = section_tuple[1]
+
+ self.log('Bölüm:', section_title, 'URL:', section_index_url)
+
+ soup = self.index_to_soup(section_index_url)
+
+ logo = soup.find('div', id='logo').find('img', src=True)
+ if logo is not None:
+ self.cover_url = logo['src']
+ if self.cover_url.startswith('/'):
+ self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
+
+ view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
+ if view_content == None:
+ break
+ rows = view_content.find('tbody').findAll('tr')
+
+ self.log('Row sayısı', len(rows))
+ for row in rows:
+ cells = row.findAll('td')
+
+ a = cells[1].find('a', href=True)
+
+ url = a['href']
+ title = self.tag_to_string(a)
+
+ if url.startswith('/'):
+ url = 'http://haber.sol.org.tr'+url
+
+ category = section_title
+ category_match_result = category_regexp.match(url)
+ if category_match_result:
+ category = category_match_result.group(1)
+
+ date = self.tag_to_string(cells[2])
+
+ author = 'soL haber'
+
+ author_match_result = author_regexp.match(url)
+ if author_match_result:
+ author = author_match_result.group(1)
+
+ self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
+ article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
+ if category in articles_dict:
+ articles_dict[category].append(article)
+ else:
+ articles_dict[category] = [article]
+
+ for category in articles_dict.keys():
+ if category in self.category_dict:
+ result.append((self.category_dict[category], articles_dict[category]))
+ else:
+ result.append((category, articles_dict[category]))
+
+ return result
From fe1e29082003058efbcdaf4f8610021bc3b393f1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 22 Apr 2012 15:52:12 +0530
Subject: [PATCH 37/37] Bash completion for ebook-viewer should complete all
file types for which calibre has an input plugin
---
src/calibre/linux.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 64bc9a5a0b..e3bfe04e75 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -228,8 +228,8 @@ class PostInstall:
from calibre.utils.smtp import option_parser as smtp_op
from calibre.library.server.main import option_parser as serv_op
from calibre.ebooks.epub.fix.main import option_parser as fix_op
- any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
- 'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt', 'lrf', 'snb']
+ from calibre.ebooks import BOOK_EXTENSIONS
+ input_formats = sorted(all_input_formats())
bc = os.path.join(os.path.dirname(self.opts.staging_sharedir),
'bash-completion')
if os.path.exists(bc):
@@ -249,11 +249,11 @@ class PostInstall:
self.info('Installing bash completion to', f)
with open(f, 'wb') as f:
f.write('# calibre Bash Shell Completion\n')
- f.write(opts_and_exts('calibre', guiop, any_formats))
+ f.write(opts_and_exts('calibre', guiop, BOOK_EXTENSIONS))
f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf']))
f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes())))
f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
- f.write(opts_and_exts('ebook-viewer', viewer_op, any_formats))
+ f.write(opts_and_exts('ebook-viewer', viewer_op, input_formats))
f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
f.write(opts_and_words('calibre-smtp', smtp_op, []))
f.write(opts_and_words('calibre-server', serv_op, []))