sync to trunk

Alex Stanev 2011-07-22 21:01:20 +03:00
commit 1d55467a48
12 changed files with 690 additions and 103 deletions

recipes/idg_se.recipe (new file, 33 lines):

__license__ = 'GPLv3'

from calibre.web.feeds.news import BasicNewsRecipe

class IDGse(BasicNewsRecipe):
    title = 'IDG'
    description = 'IDG.se'
    language = 'se'
    __author__ = 'zapt0'
    oldest_article = 1
    max_articles_per_feed = 40
    no_stylesheets = True
    encoding = 'ISO-8859-1'
    remove_javascript = True

    feeds = [(u'Senaste nytt',u'http://feeds.idg.se/idg/vzzs')]

    def print_version(self,url):
        return url + '?articleRenderMode=print&m=print'

    def get_cover_url(this):
        return 'http://idgmedia.idg.se/polopoly_fs/2.3275!images/idgmedia_logo_75.jpg'

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class':['divColumn1Article']}),
    ]
    #remove ads
    remove_tags = [
        dict(name='div', attrs={'id':['preamble_ad']}),
        dict(name='ul', attrs={'class':['share']})
    ]

@@ -64,7 +64,7 @@ class UnitedDaily(BasicNewsRecipe):
     __author__ = 'Eddie Lau'
     __version__ = '1.1'
-    language = 'zh-TW'
+    language = 'zh_TW'
     publisher = 'United Daily News Group'
     description = 'United Daily (Taiwan)'
     category = 'News, Chinese, Taiwan'

recipes/utrinski.recipe (new file, 71 lines):

#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
utrinski.com.mk
'''

import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe

class UtrinskiVesnik(BasicNewsRecipe):

    __author__ = 'Darko Spasovski'
    INDEX = 'http://www.utrinski.com.mk/'
    title = 'Utrinski Vesnik'
    description = 'Daily Macedonian newspaper'
    masthead_url = 'http://www.utrinski.com.mk/images/LogoTop.jpg'
    language = 'mk'
    remove_javascript = True
    publication_type = 'newspaper'
    category = 'news, Macedonia'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the start of the article.
            (r'<body.*?Article start-->', lambda match: '<body>'),
            ## Remove anything after the end of the article.
            (r'<!--Article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif}
        .WB_UTRINSKIVESNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
    """

    conversion_options = {
        'comment'          : description,
        'tags'             : category,
        'language'         : language,
        'linearize_tables' : True
    }

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('a', attrs={'class':'WB_UTRINSKIVESNIK_TOCTitleBig'}):
            sectionTitle = section.contents[0].string
            tocItemTable = section.findAllPrevious('table')[1]
            if tocItemTable is None: continue
            articles = []
            while True:
                tocItemTable = tocItemTable.nextSibling
                if tocItemTable is None: break
                article = tocItemTable.findAll('a', attrs={'class': 'WB_UTRINSKIVESNIK_TocItem'})
                if len(article)==0: break
                title = self.tag_to_string(article[0], use_alt=True).strip()
                articles.append({'title': title, 'url':'http://www.utrinski.com.mk/' + article[0]['href'], 'description':'', 'date':''})
            if articles:
                feeds.append((sectionTitle, articles))
        return feeds

    def get_cover_url(self):
        datum = datetime.datetime.today().strftime('%d_%m_%Y')
        return 'http://www.utrinski.com.mk/WBStorage/Files/' + datum + '.jpg'

@@ -47,10 +47,12 @@ class ANDROID(USBMS):
             # Google
             0x18d1 : {
-                0x4e11 : [0x0100, 0x226, 0x227],
-                0x4e12: [0x0100, 0x226, 0x227],
-                0x4e21: [0x0100, 0x226, 0x227],
-                0xb058: [0x0222, 0x226, 0x227]},
+                0x0001 : [0x0223],
+                0x4e11 : [0x0100, 0x226, 0x227],
+                0x4e12 : [0x0100, 0x226, 0x227],
+                0x4e21 : [0x0100, 0x226, 0x227],
+                0xb058 : [0x0222, 0x226, 0x227]
+                },

             # Samsung
             0x04e8 : { 0x681d : [0x0222, 0x0223, 0x0224, 0x0400],

@@ -35,9 +35,9 @@ class EB600(USBMS):
     PRODUCT_ID = [0x1688]
     BCD        = [0x110]

-    VENDOR_NAME      = ['NETRONIX', 'WOLDER']
-    WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2']
-    WINDOWS_CARD_A_MEM = 'EBOOK'
+    VENDOR_NAME      = ['NETRONIX', 'WOLDER', 'MD86371']
+    WINDOWS_MAIN_MEM = ['EBOOK', 'MIBUK_GAMMA_6.2', 'MD86371']
+    WINDOWS_CARD_A_MEM = ['EBOOK', 'MD86371']

     OSX_MAIN_MEM = 'EB600 Internal Storage Media'
     OSX_CARD_A_MEM = 'EB600 Card Storage Media'

@@ -8,10 +8,10 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import struct, datetime, sys, os, shutil

-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
-from calibre.ebooks.mobi.writer2.utils import (decode_hex_number, decint,
+from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
     get_trailing_data)
 from calibre.utils.magick.draw import identify_data
@@ -530,21 +530,21 @@ class Tag(object): # {{{
         },

         'chapter_with_subchapters' : {
-                22 : ('First subchapter index', 'first_subchapter_index'),
-                23 : ('Last subchapter index', 'last_subchapter_index'),
+                22 : ('First subchapter index', 'first_child_index'),
+                23 : ('Last subchapter index', 'last_child_index'),
         },

         'periodical' : {
                 5 : ('Class offset in cncx', 'class_offset'),
-                22 : ('First section index', 'first_section_index'),
-                23 : ('Last section index', 'last_section_index'),
+                22 : ('First section index', 'first_child_index'),
+                23 : ('Last section index', 'last_child_index'),
         },

         'section' : {
                 5 : ('Class offset in cncx', 'class_offset'),
-                21 : ('Periodical index', 'periodical_index'),
-                22 : ('First article index', 'first_article_index'),
-                23 : ('Last article index', 'last_article_index'),
+                21 : ('Periodical index', 'parent_index'),
+                22 : ('First article index', 'first_child_index'),
+                23 : ('Last article index', 'last_child_index'),
         },

     }
@@ -625,11 +625,56 @@ class IndexEntry(object): # {{{
                 return tag.cncx_value
         return ''

+    @property
+    def offset(self):
+        for tag in self.tags:
+            if tag.attr == 'offset':
+                return tag.value
+        return 0
+
+    @property
+    def size(self):
+        for tag in self.tags:
+            if tag.attr == 'size':
+                return tag.value
+        return 0
+
+    @property
+    def depth(self):
+        for tag in self.tags:
+            if tag.attr == 'depth':
+                return tag.value
+        return 0
+
+    @property
+    def parent_index(self):
+        for tag in self.tags:
+            if tag.attr == 'parent_index':
+                return tag.value
+        return -1
+
+    @property
+    def first_child_index(self):
+        for tag in self.tags:
+            if tag.attr == 'first_child_index':
+                return tag.value
+        return -1
+
+    @property
+    def last_child_index(self):
+        for tag in self.tags:
+            if tag.attr == 'last_child_index':
+                return tag.value
+        return -1
+
     def __str__(self):
         ans = ['Index Entry(index=%s, entry_type=%s, length=%d)'%(
             self.index, self.entry_type, len(self.tags))]
         for tag in self.tags:
             ans.append('\t'+str(tag))
+        if self.first_child_index != -1:
+            ans.append('\tNumber of children: %d'%(self.last_child_index -
+                self.first_child_index + 1))
         return '\n'.join(ans)

     # }}}
@@ -679,6 +724,15 @@ class IndexRecord(object): # {{{
             entry_type = ord(indxt[off+consumed])
             self.indices.append(IndexEntry(index, entry_type,
                 indxt[off+consumed+1:next_off], cncx, index_header.tagx_entries))
+            index = self.indices[-1]
+
+    def get_parent(self, index):
+        if index.depth < 1:
+            return None
+        parent_depth = index.depth - 1
+        for p in self.indices:
+            if p.depth != parent_depth:
+                continue

     def __str__(self):
@@ -738,8 +792,7 @@ class CNCX(object) : # {{{

 class TextRecord(object): # {{{

-    def __init__(self, idx, record, extra_data_flags, decompress, index_record,
-            doc_type):
+    def __init__(self, idx, record, extra_data_flags, decompress):
         self.trailing_data, self.raw = get_trailing_data(record.raw, extra_data_flags)
         self.raw = decompress(self.raw)
         if 0 in self.trailing_data:
@@ -751,60 +804,6 @@ class TextRecord(object): # {{{
         self.idx = idx

-        if 'indexing' in self.trailing_data and index_record is not None:
-            self.interpret_indexing(doc_type, index_record.indices)
-
-    def interpret_indexing(self, doc_type, indices):
-        raw = self.trailing_data['indexing']
-        ident, consumed = decint(raw)
-        raw = raw[consumed:]
-        entry_type = ident & 0b111
-        index_entry_idx = ident >> 3
-        index_entry = None
-        for i in indices:
-            if i.index == index_entry_idx:
-                index_entry = i.label
-                break
-        self.trailing_data['interpreted_indexing'] = (
-            'Type: %s, Index Entry: %s'%(entry_type, index_entry))
-        if doc_type == 2: # Book
-            self.interpret_book_indexing(raw, entry_type)
-
-    def interpret_book_indexing(self, raw, entry_type):
-        arg1, consumed = decint(raw)
-        raw = raw[consumed:]
-        if arg1 != 0:
-            raise ValueError('TBS index entry has unknown arg1: %d'%
-                    arg1)
-        if entry_type == 2:
-            desc = ('This record has only a single starting or a single'
-                    ' ending point')
-            if raw:
-                raise ValueError('TBS index entry has unknown extra bytes:'
-                        ' %r'%raw)
-        elif entry_type == 3:
-            desc = ('This record is spanned by a single node (i.e. it'
-                    ' has no start or end points)')
-            arg2, consumed = decint(raw)
-            if arg2 != 0:
-                raise ValueError('TBS index entry has unknown arg2: %d'%
-                        arg2)
-        elif entry_type == 6:
-            if len(raw) != 1:
-                raise ValueError('TBS index entry has unknown extra bytes:'
-                        ' %r'%raw)
-            num = ord(raw[0])
-            # An unmatched starting or ending point each contributes 1 to
-            # this count. A matched pair of starting and ending points
-            # together contribute 1 to this count. Note that you can only
-            # ever have either 1 unmatched start point or 1 unmatched end
-            # point, never both (logically impossible).
-            desc = ('This record has %d starting/ending points and/or complete'
-                    ' nodes.')%num
-        else:
-            raise ValueError('Unknown TBS index entry type: %d for book'%entry_type)
-        self.trailing_data['interpreted_indexing'] += ' :: ' + desc
-
     def dump(self, folder):
         name = '%06d'%self.idx
         with open(os.path.join(folder, name+'.txt'), 'wb') as f:
@@ -848,6 +847,231 @@ class BinaryRecord(object): # {{{

 # }}}

+class TBSIndexing(object): # {{{
+
+    def __init__(self, text_records, indices, doc_type):
+        self.record_indices = OrderedDict()
+        self.doc_type = doc_type
+        self.indices = indices
+        pos = 0
+        for r in text_records:
+            start = pos
+            pos += len(r.raw)
+            end = pos - 1
+            self.record_indices[r] = x = {'starts':[], 'ends':[],
+                    'complete':[], 'geom': (start, end)}
+            for entry in indices:
+                istart, sz = entry.offset, entry.size
+                iend = istart + sz - 1
+                has_start = istart >= start and istart <= end
+                has_end = iend >= start and iend <= end
+                rec = None
+                if has_start and has_end:
+                    rec = 'complete'
+                elif has_start and not has_end:
+                    rec = 'starts'
+                elif not has_start and has_end:
+                    rec = 'ends'
+                if rec:
+                    x[rec].append(entry)
+
+    def get_index(self, idx):
+        for i in self.indices:
+            if i.index == idx: return i
+        raise IndexError('Index %d not found'%idx)
+
+    def __str__(self):
+        ans = ['*'*20 + ' TBS Indexing (%d records) '%len(self.record_indices)+ '*'*20]
+        for r, dat in self.record_indices.iteritems():
+            ans += self.dump_record(r, dat)[-1]
+        return '\n'.join(ans)
+
+    def dump(self, bdir):
+        types = defaultdict(list)
+        for r, dat in self.record_indices.iteritems():
+            tbs_type, strings = self.dump_record(r, dat)
+            if tbs_type == 0: continue
+            types[tbs_type] += strings
+        for typ, strings in types.iteritems():
+            with open(os.path.join(bdir, 'tbs_type_%d.txt'%typ), 'wb') as f:
+                f.write('\n'.join(strings))
+
+    def dump_record(self, r, dat):
+        ans = []
+        ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx,
+            dat['geom'][0], dat['geom'][1]))
+        s, e, c = dat['starts'], dat['ends'], dat['complete']
+        ans.append(('\tContains: %d index entries '
+            '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e,
+                c, s))))
+        byts = bytearray(r.trailing_data.get('indexing', b''))
+        sbyts = tuple(hex(b)[2:] for b in byts)
+        ans.append('TBS bytes: %s'%(' '.join(sbyts)))
+        for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
+            if entries:
+                ans.append('\t%s:'%typ)
+                for x in entries:
+                    ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
+                        'Depth: %d, Offset: %d, Size: %d) [%s]')%(
+                        x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
+        def bin3(num):
+            ans = bin(num)[2:]
+            return '0'*(3-len(ans)) + ans
+        tbs_type = 0
+        if len(byts):
+            outer, consumed = decint(byts)
+            byts = byts[consumed:]
+            tbs_type = outer & 0b111
+            ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
+            ans.append('Outer Index entry: %d'%(outer >> 3))
+            arg1, consumed = decint(byts)
+            byts = byts[consumed:]
+            ans.append('Unknown (vwi: always 0?): %d'%arg1)
+            if self.doc_type in (257, 259): # Hierarchical periodical
+                byts, a = self.interpret_periodical(tbs_type, byts)
+                ans += a
+            if byts:
+                sbyts = tuple(hex(b)[2:] for b in byts)
+                ans.append('Remaining bytes: %s'%' '.join(sbyts))
+
+        ans.append('')
+        return tbs_type, ans
+
+    def interpret_periodical(self, tbs_type, byts):
+        ans = []
+
+        def tbs_type_6(byts, psi=None, msg=None): # {{{
+            if psi is None:
+                # Assume parent section is 1
+                psi = self.get_index(1)
+            if msg is None:
+                msg = ('Article index at start of record or first article'
+                        ' index, relative to parent section')
+            if byts:
+                # byts could be empty
+                arg, consumed = decint(byts)
+                byts = byts[consumed:]
+                flags = (arg & 0b1111)
+                ai = (arg >> 4)
+                ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai,
+                    ai+psi.index))
+                if flags == 1:
+                    arg, consumed = decint(byts)
+                    byts = byts[consumed:]
+                    ans.append('EOF (vwi: should be 0): %d'%arg)
+                elif flags in (4, 5):
+                    num = byts[0]
+                    byts = byts[1:]
+                    ans.append('Number of article nodes in the record (byte): %d'%num)
+                    if flags == 5:
+                        arg, consumed = decint(byts)
+                        byts = byts[consumed:]
+                        ans.append('Unknown ??? (vwi)): %d'%(arg))
+                elif flags == 0:
+                    pass
+                else:
+                    raise ValueError('Unknown flags: %d'%flags)
+            return byts
+        # }}}
+
+        if tbs_type == 3: # {{{
+            arg2, consumed = decint(byts)
+            byts = byts[consumed:]
+            ans.append('Unknown (vwi: always 0?): %d'%arg2)
+
+            arg3, consumed = decint(byts)
+            byts = byts[consumed:]
+            fsi = arg3 >> 4
+            extra = arg3 & 0b1111
+            ans.append('First section index (fvwi): %d'%fsi)
+            psi = self.get_index(fsi)
+            ans.append('Extra bits (flag: always 0?): %d'%extra)
+
+            byts = tbs_type_6(byts, psi=psi,
+                    msg=('First article of ending section, relative to its'
+                        ' parent\'s index'))
+
+            if byts:
+                # We have a transition not just an opening first section
+                psi = self.get_index(psi.index+1)
+                arg, consumed = decint(byts)
+                off = arg >> 4
+                byts = byts[consumed:]
+                flags = arg & 0b1111
+                ans.append('Last article of ending section w.r.t. starting'
+                        ' section offset (fvwi): %d [%d absolute]'%(off,
+                            psi.index+off))
+                ans.append('Flags (always 8?): %d'%flags)
+                byts = tbs_type_6(byts, psi=psi)
+        # }}}
+
+        elif tbs_type == 7: # {{{
+            # This occurs for records that have no section nodes and
+            # whose parent section's index == 1
+            ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
+            byts = byts[2:]
+            arg, consumed = decint(byts)
+            byts = byts[consumed:]
+            ai = arg >> 4
+            flags = arg & 0b1111
+            ans.append('Article at start of record (fvwi): %d'%ai)
+            if flags == 4:
+                num = byts[0]
+                byts = byts[1:]
+                ans.append('Number of articles in record (byte): %d'%num)
+            elif flags == 0:
+                pass
+            elif flags == 1:
+                arg, consumed = decint(byts)
+                byts = byts[consumed:]
+                ans.append('EOF (vwi: should be 0): %d'%arg)
+            else:
+                raise ValueError('Unknown flags value: %d'%flags)
+        # }}}
+
+        elif tbs_type == 6: # {{{
+            # This is used for records spanned by an article whose parent
+            # section's index == 1 or for the opening record if it contains the
+            # periodical start, section 1 start and at least one article. The
+            # two cases are distinguished by the flags on the article index
+            # vwi.
+            unk = byts[0]
+            byts = byts[1:]
+            ans.append('Unknown (byte: always 2?): %d'%unk)
+            byts = tbs_type_6(byts)
+        # }}}
+
+        elif tbs_type == 2: # {{{
+            # This occurs for records with no section nodes and whose parent
+            # section's index != 1 (undefined (records before the first
+            # section) or > 1)
+            # This is also used for records that are spanned by an article
+            # whose parent section index > 1. In this case the flags of the
+            # vwi referring to the article at the start
+            # of the record are set to 1 instead of 4.
+            arg, consumed = decint(byts)
+            byts = byts[consumed:]
+            flags = (arg & 0b1111)
+            psi = (arg >> 4)
+            ans.append('Parent section index (fvwi): %d'%psi)
+            psi = self.get_index(psi)
+            ans.append('Flags: %d'%flags)
+            if flags == 1:
+                arg, consumed = decint(byts)
+                byts = byts[consumed:]
+                ans.append('Unknown (vwi?: always 0?): %d'%arg)
+                byts = tbs_type_6(byts, psi=psi)
+            elif flags == 0:
+                byts = tbs_type_6(byts, psi=psi)
+            else:
+                raise ValueError('Unkown flags: %d'%flags)
+        # }}}
+
+        return byts, ans
+
+# }}}
+
 class MOBIFile(object): # {{{

     def __init__(self, stream):
@@ -910,8 +1134,7 @@ class MOBIFile(object): # {{{
         if fntbr == 0xffffffff:
             fntbr = len(self.records)
         self.text_records = [TextRecord(r, self.records[r],
-            self.mobi_header.extra_data_flags, decompress, self.index_record,
-            self.mobi_header.type_raw) for r in xrange(1,
+            self.mobi_header.extra_data_flags, decompress) for r in xrange(1,
             min(len(self.records), ntr+1))]
         self.image_records, self.binary_records = [], []
         for i in xrange(fntbr, len(self.records)):
@@ -930,6 +1153,9 @@ class MOBIFile(object): # {{{
             else:
                 self.binary_records.append(BinaryRecord(i, r))

+        if self.index_record is not None:
+            self.tbs_indexing = TBSIndexing(self.text_records,
+                    self.index_record.indices, self.mobi_header.type_raw)

     def print_header(self, f=sys.stdout):
         print (str(self.palmdb).encode('utf-8'), file=f)
@@ -961,6 +1187,9 @@ def inspect_mobi(path_or_stream, prefix='decompiled'):
         print(str(f.cncx).encode('utf-8'), file=out)
         print('\n\n', file=out)
         print(str(f.index_record), file=out)
+    with open(os.path.join(ddir, 'tbs_indexing.txt'), 'wb') as out:
+        print(str(f.tbs_indexing), file=out)
+    f.tbs_indexing.dump(ddir)
     for tdir, attr in [('text', 'text_records'), ('images', 'image_records'),
             ('binary', 'binary_records')]:

New file (189 lines added):
Reverse engineering the trailing byte sequences for hierarchical periodicals
===============================================================================
In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag.
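
For orientation, here is a minimal sketch (not the calibre implementation, though it mirrors the decint helper in calibre.ebooks.mobi.utils) of how a forward vwi and the fvwi flag split are read::

    def read_vwi(byts):
        # 7 data bits per byte; the byte whose high bit is set ends the value
        val, consumed = 0, 0
        for b in bytearray(byts):
            consumed += 1
            val = (val << 7) | (b & 0x7f)
            if b & 0x80:
                break
        return val, consumed

    def read_fvwi(byts):
        # an fvwi is a vwi whose lowest four bits are treated as flags
        val, consumed = read_vwi(byts)
        return val >> 4, val & 0b1111, consumed

    # 0xc4 decodes to 0x44: index 4 with flag bits 0b0100
    print(read_fvwi(b'\xc4'))   # -> (4, 4, 1)
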
Opening record
----------------
The text record that contains the opening node for the periodical (depth=0 node in the NCX) can have TBS of 3 different forms:
1. If it has only the periodical node and no section/article nodes, TBS of type 2, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 1 index entries (0 ends, 0 complete, 1 starts)
TBS bytes: 82 80
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
2. A periodical and a section node, but no article nodes, TBS type of 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 2 index entries (0 ends, 0 complete, 2 starts)
TBS bytes: 86 80 2
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 93254) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 49280) [Ars Technica]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
3. If it has both the section 1 node and at least one article node, TBS of type 6, like this::
Record #1: Starts at: 0 Ends at: 4095
Contains: 4 index entries (0 ends, 1 complete, 3 starts)
TBS bytes: 86 80 2 c4 2
Complete:
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 549, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz]
Starts:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 79253) [j_x's Google reader]
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 541, Size: 35279) [Ars Technica]
Index Entry: 6 (Parent index: 1, Depth: 2, Offset: 2415, Size: 2764) [Week in Apple: ZFS on Mac OS X, rogue tethering, DUI apps, and more]
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
Number of article nodes in the record (byte): 2
If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record.
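
Decoding the case 3 bytes by hand makes the structure concrete (a sketch, not part of the committed file; it reuses the simple vwi reader from the introduction)::

    def read_vwi(byts):
        val, consumed = 0, 0
        for b in bytearray(byts):
            consumed += 1
            val = (val << 7) | (b & 0x7f)
            if b & 0x80:
                break
        return val, consumed

    tbs = bytearray(b'\x86\x80\x02\xc4\x02')
    outer, n = read_vwi(tbs); tbs = tbs[n:]
    print('TBS type: %d, outer index entry: %d' % (outer & 0b111, outer >> 3))  # 6, 0
    zero, n = read_vwi(tbs); tbs = tbs[n:]       # the "always 0?" vwi
    unknown_byte, tbs = tbs[0], tbs[1:]          # the "always 2?" byte
    art, n = read_vwi(tbs); tbs = tbs[n:]
    print('first article (fvwi): %d, flags: %d' % (art >> 4, art & 0b1111))     # 4, flags 4
    print('article nodes in record: %d' % tbs[0])                               # 2
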
Records with no nodes
------------------------
These records are spanned by a single article. They are of two types:
1. If the parent section index is 1, TBS type of 6, like this::
Record #4: Starts at: 12288 Ends at: 16383
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 86 80 2 c1 80
TBS Type: 110 (6)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (byte: always 2?): 2
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
EOF (vwi: should be 0): 0
If the record is before the first article, the TBS bytes would be: 86 80 2
2. If the parent section index is > 1, TBS type of 2, like this::
Record #14: Starts at: 53248 Ends at: 57343
Contains: 0 index entries (0 ends, 0 complete, 0 starts)
TBS bytes: 82 80 a0 1 e1 80
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 2
Flags: 0
Article index at start of record or first article index, relative to parent section (fvwi): 14 [16 absolute]
EOF (vwi: should be 0): 0
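
The same hand decoding applied to the type 2 sequence above, 82 80 a0 1 e1 80, also shows an fvwi that spans two bytes (again a sketch, not part of the committed file)::

    def read_vwi(byts):
        val, consumed = 0, 0
        for b in bytearray(byts):
            consumed += 1
            val = (val << 7) | (b & 0x7f)
            if b & 0x80:
                break
        return val, consumed

    tbs = bytearray(b'\x82\x80\xa0\x01\xe1\x80')
    outer, n = read_vwi(tbs); tbs = tbs[n:]      # type 2, outer index entry 0
    zero, n = read_vwi(tbs); tbs = tbs[n:]       # the "always 0?" vwi
    psi, n = read_vwi(tbs); tbs = tbs[n:]
    print('parent section (fvwi): %d, flags: %d' % (psi >> 4, psi & 0b1111))    # 2, 0
    art, n = read_vwi(tbs); tbs = tbs[n:]        # 0x01 0xe1 -> 225, a two byte vwi
    print('first article (fvwi): %d, flags: %d' % (art >> 4, art & 0b1111))     # 14, 1
    eof, n = read_vwi(tbs)
    print('EOF marker (should be 0): %d' % eof)                                 # 0
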
Records with only article nodes
-----------------------------------
Such records have no section transitions (i.e. a section end/section start pair). They have only one or more article nodes. They are of two types:
1. If the parent section index is 1, TBS type of 7, like this::
Record #6: Starts at: 20480 Ends at: 24575
Contains: 2 index entries (1 ends, 0 complete, 1 starts)
TBS bytes: 87 80 2 80 1 84 2
Ends:
Index Entry: 9 (Parent index: 1, Depth: 2, Offset: 16453, Size: 4199) [Vaccine's success spurs whooping cough comeback]
Starts:
Index Entry: 10 (Parent index: 1, Depth: 2, Offset: 20652, Size: 4246) [Apple's mobile products do not violate Nokia patents, says ITC]
TBS Type: 111 (7)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown: '\x02\x80' (vwi?: Always 256)
Article at start of record (fvwi): 8
Number of articles in record (byte): 2
If there was only one article in the record, the last two bytes would be replaced by a single byte: 80
If this record is the first record with an article, then the article at the start of the record should be the last section index. At least, that's what kindlegen does, though if you ask me, it should be the first section index.
2. If the parent section index is > 1, TBS type of 2, like this::
Record #16: Starts at: 61440 Ends at: 65535
Contains: 5 index entries (1 ends, 3 complete, 1 starts)
TBS bytes: 82 80 a1 80 1 f4 5
Ends:
Index Entry: 17 (Parent index: 2, Depth: 2, Offset: 60920, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
Complete:
Index Entry: 18 (Parent index: 2, Depth: 2, Offset: 62002, Size: 1016) [Rumour: OS X Lion nearing Golden Master stage]
Index Entry: 19 (Parent index: 2, Depth: 2, Offset: 63018, Size: 1045) [iOS 4.3.1 released]
Index Entry: 20 (Parent index: 2, Depth: 2, Offset: 64063, Size: 972) [Windows 8 'system reset' image leaks]
Starts:
Index Entry: 21 (Parent index: 2, Depth: 2, Offset: 65035, Size: 1057) [Windows Phone 7: Why it's failing]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi) : 2
Flags: 1
Unknown (vwi: always 0?): 0
Article index at start of record or first article index, relative to parent section (fvwi): 15 [17 absolute]
Number of article nodes in the record (byte): 5
If there was only one article in the record, the last two bytes would be replaced by a single byte: f0
Records with a section transition
-----------------------------------
In such a record there is a transition from one section to the next. As such the record must have at least one article ending and one article starting, except in the case of the first section.
TODO: Note you have to test the cases of first section, a single transition and multiple transitions.
1. The first section::
Record #2: Starts at: 4096 Ends at: 8191
Contains: 2 index entries (0 ends, 0 complete, 2 starts)
TBS bytes: 83 80 80 90 c0
Starts:
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 7766, Size: 1866) [Week in gaming: 3DS review, Crysis 2, George Hotz]
TBS Type: 011 (3)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Unknown (vwi: always 0?): 0
First section index (fvwi) : 1
Extra bits: 0
First section starts
Article at start of block as offset from parent index (fvwi): 4 [5 absolute]
Flags: 0
If there was more than one article at the start then the last byte would be replaced by: c4 n where n is the number of articles
Ending record
----------------
Logically, ending records must have at least one article ending, one section ending and the periodical ending. They are of TBS type 2, like this::
Record #17: Starts at: 65536 Ends at: 68684
Contains: 4 index entries (3 ends, 1 complete, 0 starts)
TBS bytes: 82 80 c0 4 f4 2
Ends:
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 68470) [j_x's Google reader]
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 51234, Size: 17451) [Slashdot]
Index Entry: 43 (Parent index: 4, Depth: 2, Offset: 65422, Size: 1717) [US ITC May Reverse Judge&#39;s Ruling In Kodak vs. Apple]
Complete:
Index Entry: 44 (Parent index: 4, Depth: 2, Offset: 67139, Size: 1546) [Google Starts Testing Google Music Internally]
TBS Type: 010 (2)
Outer Index entry: 0
Unknown (vwi: always 0?): 0
Parent section index (fvwi): 4
Flags: 0
Article at start of block as offset from parent index (fvwi): 39 [43 absolute]
Number of nodes (byte): 2
If the record had only a single article end, the last two bytes would be replaced with: f0

@@ -79,7 +79,7 @@ def encint(value, forward=True):

 def decint(raw, forward=True):
     '''
-    Read a variable width integer from the bytestring raw and return the
+    Read a variable width integer from the bytestring or bytearray raw and return the
     integer and the number of bytes read. If forward is True bytes are read
     from the start of raw, otherwise from the end of raw.

@@ -88,8 +88,10 @@ def decint(raw, forward=True):
     '''
     val = 0
     byts = bytearray()
-    for byte in raw if forward else reversed(raw):
-        bnum = ord(byte)
+    src = bytearray(raw)
+    if not forward:
+        src.reverse()
+    for bnum in src:
         byts.append(bnum & 0b01111111)
         if bnum & 0b10000000:
             break
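
For context, a standalone sketch of the round trip this forward encoding implies (my own reimplementation, not an import of calibre.ebooks.mobi.utils; the real encint/decint additionally support reading backwards from the end of raw):

    def encode_fwd_vwi(value):
        # 7 bits per byte, most significant group first; the final byte
        # carries the high bit as a terminator.
        byts = bytearray()
        while True:
            byts.append(value & 0x7f)
            value >>= 7
            if value == 0:
                break
        byts.reverse()
        byts[-1] |= 0x80
        return bytes(byts)

    def decode_fwd_vwi(raw):
        # same loop shape as the decint hunk above
        val, consumed = 0, 0
        for bnum in bytearray(raw):
            consumed += 1
            val = (val << 7) | (bnum & 0x7f)
            if bnum & 0x80:
                break
        return val, consumed

    for num in (0, 5, 127, 128, 225, 100000):
        enc = encode_fwd_vwi(num)
        assert decode_fwd_vwi(enc) == (num, len(enc))
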
@@ -161,7 +163,7 @@ def get_trailing_data(record, extra_data_flags):
     '''
     data = OrderedDict()
     for i in xrange(16, -1, -1):
-        flag = 2**i
+        flag = 1 << i # 2**i
         if flag & extra_data_flags:
             if i == 0:
                 # Only the first two bits are used for the size since there can

@@ -18,7 +18,7 @@ from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
 from calibre.ebooks.mobi.writer2 import PALMDOC, UNCOMPRESSED
-from calibre.ebooks.mobi.writer2.utils import (rescale_image, encint)
+from calibre.ebooks.mobi.utils import (rescale_image, encint)

 EXTH_CODES = {
     'creator': 100,

@@ -1,20 +1,39 @@
 #! /usr/bin/env python
 # Written by Martin v. Loewis <loewis@informatik.hu-berlin.de>
-# Modified by Kovid Goyal <kovid@kovidgoyal.net>

 """Generate binary message catalog from textual translation description.

 This program converts a textual Uniforum-style message catalog (.po file) into
 a binary GNU catalog (.mo file). This is essentially the same function as the
 GNU msgfmt program, however, it is a simpler implementation.

+Usage: msgfmt.py [OPTIONS] filename.po
+
+Options:
+    -o file
+    --output-file=file
+        Specify the output file to write to. If omitted, output will go to a
+        file named filename.mo (based off the input file name).
+
+    -h
+    --help
+        Print this message and exit.
+
+    -V
+    --version
+        Display version information and exit.
+
 """

 import sys
 import os
+import getopt
 import struct
 import array

-__version__ = "1.2"
+__version__ = "1.1"
+
+MESSAGES = {}

 def usage(code, msg=''):
     print >> sys.stderr, __doc__
@@ -23,16 +42,16 @@ def usage(code, msg=''):
     sys.exit(code)


-def add(id, str, fuzzy, MESSAGES):
+def add(id, str, fuzzy):
     "Add a non-fuzzy translation to the dictionary."
+    global MESSAGES
     if not fuzzy and str:
         MESSAGES[id] = str


-def generate(MESSAGES):
+def generate():
     "Return the generated output."
+    global MESSAGES
     keys = MESSAGES.keys()
     # the keys are sorted in the .mo file
     keys.sort()
@@ -44,6 +63,7 @@ def generate(MESSAGES):
         offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
         ids += id + '\0'
         strs += MESSAGES[id] + '\0'
+    output = ''
     # The header is 7 32-bit unsigned integers. We don't use hash tables, so
     # the keys start right after the index tables.
     # translated string.
@@ -71,9 +91,7 @@ def generate(MESSAGES):
     return output


 def make(filename, outfile):
-    MESSAGES = {}
-
     ID = 1
     STR = 2
@@ -101,7 +119,7 @@ def make(filename, outfile):
         lno += 1
         # If we get a comment line after a msgstr, this is a new entry
         if l[0] == '#' and section == STR:
-            add(msgid, msgstr, fuzzy, MESSAGES)
+            add(msgid, msgstr, fuzzy)
             section = None
             fuzzy = 0
         # Record a fuzzy mark
@@ -111,16 +129,39 @@ def make(filename, outfile):
         if l[0] == '#':
             continue
         # Now we are in a msgid section, output previous section
-        if l.startswith('msgid'):
+        if l.startswith('msgid') and not l.startswith('msgid_plural'):
             if section == STR:
-                add(msgid, msgstr, fuzzy, MESSAGES)
+                add(msgid, msgstr, fuzzy)
             section = ID
             l = l[5:]
             msgid = msgstr = ''
+            is_plural = False
+        # This is a message with plural forms
+        elif l.startswith('msgid_plural'):
+            if section != ID:
+                print >> sys.stderr, 'msgid_plural not preceeded by msgid on %s:%d' %\
+                    (infile, lno)
+                sys.exit(1)
+            l = l[12:]
+            msgid += '\0' # separator of singular and plural
+            is_plural = True
         # Now we are in a msgstr section
         elif l.startswith('msgstr'):
             section = STR
-            l = l[6:]
+            if l.startswith('msgstr['):
+                if not is_plural:
+                    print >> sys.stderr, 'plural without msgid_plural on %s:%d' %\
+                        (infile, lno)
+                    sys.exit(1)
+                l = l.split(']', 1)[1]
+                if msgstr:
+                    msgstr += '\0' # Separator of the various plural forms
+            else:
+                if is_plural:
+                    print >> sys.stderr, 'indexed msgstr required for plural on %s:%d' %\
+                        (infile, lno)
+                    sys.exit(1)
+                l = l[6:]
         # Skip empty lines
         l = l.strip()
         if not l:
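
To illustrate what the new plural branches accumulate (a sketch with invented strings, not taken from any real .po file): an entry with msgid, msgid_plural and indexed msgstr lines is stored as NUL-joined strings, which is the layout GNU gettext expects in the .mo catalog:

    # Hypothetical .po fragment:
    #   msgid "%d book"
    #   msgid_plural "%d books"
    #   msgstr[0] "%d kniga"
    #   msgstr[1] "%d knigi"
    #
    # msgid_plural appends '\0' plus the plural form to msgid; every
    # msgstr[n] after the first appends '\0' plus that form to msgstr.
    msgid = '%d book' + '\0' + '%d books'
    msgstr = '%d kniga' + '\0' + '%d knigi'
    MESSAGES = {msgid: msgstr}
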
@@ -138,22 +179,40 @@ def make(filename, outfile):
             sys.exit(1)
     # Add last entry
     if section == STR:
-        add(msgid, msgstr, fuzzy, MESSAGES)
+        add(msgid, msgstr, fuzzy)

     # Compute output
-    output = generate(MESSAGES)
+    output = generate()
+    outfile.write(output)
+
+
+def main():
     try:
-        outfile.write(output)
-    except IOError,msg:
-        print >> sys.stderr, msg
+        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
+                                   ['help', 'version', 'output-file='])
+    except getopt.error, msg:
+        usage(1, msg)
+
+    outfile = None
+    # parse options
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-V', '--version'):
+            print >> sys.stderr, "msgfmt.py", __version__
+            sys.exit(0)
+        elif opt in ('-o', '--output-file'):
+            outfile = arg
+    # do it
+    if not args:
+        print >> sys.stderr, 'No input file given'
+        print >> sys.stderr, "Try `msgfmt --help' for more information."
+        return

-def main(outfile, args=sys.argv[1:]):
     for filename in args:
         make(filename, outfile)
-    return 0


 if __name__ == '__main__':
-    sys.exit(main(sys.stdout))
+    main()

@@ -71,13 +71,13 @@ def set_translators():
     lang = get_lang()
     if lang:
         buf = iso639 = None
-        if os.access(lang+'.po', os.R_OK):
+        mpath = get_lc_messages_path(lang)
+        if mpath and os.access(mpath+'.po', os.R_OK):
             from calibre.translations.msgfmt import make
             buf = cStringIO.StringIO()
-            make(lang+'.po', buf)
+            make(mpath+'.po', buf)
             buf = cStringIO.StringIO(buf.getvalue())

-        mpath = get_lc_messages_path(lang)
         if mpath is not None:
             with ZipFile(P('localization/locales.zip',
                 allow_user_override=False), 'r') as zf:

@@ -217,6 +217,8 @@ class RecipeModel(QAbstractItemModel, SearchQueryParser):
             self.all_urns.add(urn)
             if ok(urn):
                 lang = x.get('language', 'und')
+                if lang:
+                    lang = lang.replace('-', '_')
                 if lang not in lang_map:
                     lang_map[lang] = factory(NewsCategory, new_root, lang)
                 factory(NewsItem, lang_map[lang], urn, x.get('title'))