mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
lrs2lrf: Handle missing style labels gracefully
This commit is contained in:
parent
2c5d951139
commit
a80d705049
@ -18,38 +18,38 @@ from calibre.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \
|
|||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
class LrsParser(object):
|
class LrsParser(object):
|
||||||
|
|
||||||
SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
|
SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
|
||||||
'PutObj', 'RuledLine',
|
'PutObj', 'RuledLine',
|
||||||
'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
|
'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
|
||||||
'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
|
'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
|
||||||
'ImageStream', 'Image']]
|
'ImageStream', 'Image']]
|
||||||
|
|
||||||
def __init__(self, stream, logger):
    '''
    Parse the LRS markup read from `stream` and run the conversion passes.

    :param stream: Object whose ``read()`` returns the LRS source.
    :param logger: Stored on the instance as ``self.logger``.
    '''
    self.logger = logger
    raw = stream.read()
    markup = xml_to_unicode(raw)[0]
    self.soup = BeautifulStoneSoup(
            markup,
            convertEntities=BeautifulStoneSoup.XML_ENTITIES,
            selfClosingTags=self.SELF_CLOSING_TAGS)

    # Index every tag that carries an objid attribute by that id.
    self.objects = {}
    for node in self.soup.findAll(objid=True):
        self.objects[node['objid']] = node

    self.parsed_objects = {}
    # The passes are run in this fixed order, first to fifth.
    for run_pass in (self.first_pass, self.second_pass, self.third_pass,
                     self.fourth_pass, self.fifth_pass):
        run_pass()
|
||||||
|
|
||||||
def fifth_pass(self):
    '''
    Apply the putobj tags found inside canvas, header and footer tags.

    For each such tag, the object stored in ``self.parsed_objects`` under
    the tag's objid receives one ``put_object`` call per nested putobj
    tag, with the object stored under the putobj's refobj and the
    putobj's x1 and y1 attribute values.
    '''
    for container_tag in self.soup.findAll(['canvas', 'header', 'footer']):
        container = self.parsed_objects[container_tag.get('objid')]
        for put in container_tag.findAll('putobj'):
            target = self.parsed_objects[put.get('refobj')]
            container.put_object(target, put.get('x1'), put.get('y1'))
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def attrs_to_dict(cls, tag, exclude=('objid',)):
|
def attrs_to_dict(cls, tag, exclude=('objid',)):
|
||||||
result = {}
|
result = {}
|
||||||
@ -58,7 +58,7 @@ class LrsParser(object):
|
|||||||
continue
|
continue
|
||||||
result[str(key)] = val
|
result[str(key)] = val
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def text_tag_to_element(self, tag):
|
def text_tag_to_element(self, tag):
|
||||||
map = {
|
map = {
|
||||||
'span' : Span,
|
'span' : Span,
|
||||||
@ -77,7 +77,7 @@ class LrsParser(object):
|
|||||||
settings = self.attrs_to_dict(tag)
|
settings = self.attrs_to_dict(tag)
|
||||||
settings.pop('spanstyle', '')
|
settings.pop('spanstyle', '')
|
||||||
return map[tag.name](**settings)
|
return map[tag.name](**settings)
|
||||||
|
|
||||||
def process_text_element(self, tag, elem):
|
def process_text_element(self, tag, elem):
|
||||||
for item in tag.contents:
|
for item in tag.contents:
|
||||||
if isinstance(item, NavigableString):
|
if isinstance(item, NavigableString):
|
||||||
@ -86,8 +86,8 @@ class LrsParser(object):
|
|||||||
subelem = self.text_tag_to_element(item)
|
subelem = self.text_tag_to_element(item)
|
||||||
elem.append(subelem)
|
elem.append(subelem)
|
||||||
self.process_text_element(item, subelem)
|
self.process_text_element(item, subelem)
|
||||||
|
|
||||||
|
|
||||||
def process_paragraph(self, tag):
|
def process_paragraph(self, tag):
|
||||||
p = Paragraph()
|
p = Paragraph()
|
||||||
contents = [i for i in tag.contents]
|
contents = [i for i in tag.contents]
|
||||||
@ -104,7 +104,7 @@ class LrsParser(object):
|
|||||||
p.append(elem)
|
p.append(elem)
|
||||||
self.process_text_element(item, elem)
|
self.process_text_element(item, elem)
|
||||||
return p
|
return p
|
||||||
|
|
||||||
def process_text_block(self, tag):
|
def process_text_block(self, tag):
|
||||||
tb = self.parsed_objects[tag.get('objid')]
|
tb = self.parsed_objects[tag.get('objid')]
|
||||||
for item in tag.contents:
|
for item in tag.contents:
|
||||||
@ -119,25 +119,25 @@ class LrsParser(object):
|
|||||||
elem = self.text_tag_to_element(item)
|
elem = self.text_tag_to_element(item)
|
||||||
self.process_text_element(item, elem)
|
self.process_text_element(item, elem)
|
||||||
p.append(elem)
|
p.append(elem)
|
||||||
|
|
||||||
def fourth_pass(self):
|
def fourth_pass(self):
|
||||||
for tag in self.soup.findAll('page'):
|
for tag in self.soup.findAll('page'):
|
||||||
page = self.parsed_objects[tag.get('objid')]
|
page = self.parsed_objects[tag.get('objid')]
|
||||||
self.book.append(page)
|
self.book.append(page)
|
||||||
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
|
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
|
||||||
'ruledline', 'simpletextblock']):
|
'ruledline', 'simpletextblock']):
|
||||||
if block_tag.name == 'ruledline':
|
if block_tag.name == 'ruledline':
|
||||||
page.append(RuledLine(**self.attrs_to_dict(block_tag)))
|
page.append(RuledLine(**self.attrs_to_dict(block_tag)))
|
||||||
else:
|
else:
|
||||||
page.append(self.parsed_objects[block_tag.get('objid')])
|
page.append(self.parsed_objects[block_tag.get('objid')])
|
||||||
|
|
||||||
for tag in self.soup.find('objects').findAll('button'):
|
for tag in self.soup.find('objects').findAll('button'):
|
||||||
jt = tag.find('jumpto')
|
jt = tag.find('jumpto')
|
||||||
tb = self.parsed_objects[jt.get('refobj')]
|
tb = self.parsed_objects[jt.get('refobj')]
|
||||||
jb = JumpButton(tb)
|
jb = JumpButton(tb)
|
||||||
self.book.append(jb)
|
self.book.append(jb)
|
||||||
self.parsed_objects[tag.get('objid')] = jb
|
self.parsed_objects[tag.get('objid')] = jb
|
||||||
|
|
||||||
for tag in self.soup.findAll(['textblock', 'simpletextblock']):
|
for tag in self.soup.findAll(['textblock', 'simpletextblock']):
|
||||||
self.process_text_block(tag)
|
self.process_text_block(tag)
|
||||||
toc = self.soup.find('toc')
|
toc = self.soup.find('toc')
|
||||||
@ -145,11 +145,11 @@ class LrsParser(object):
|
|||||||
for tag in toc.findAll('toclabel'):
|
for tag in toc.findAll('toclabel'):
|
||||||
label = self.tag_to_string(tag)
|
label = self.tag_to_string(tag)
|
||||||
self.book.addTocEntry(label, self.parsed_objects[tag.get('refobj')])
|
self.book.addTocEntry(label, self.parsed_objects[tag.get('refobj')])
|
||||||
|
|
||||||
|
|
||||||
def third_pass(self):
|
def third_pass(self):
|
||||||
map = {
|
map = {
|
||||||
'page' : (Page, ['pagestyle', 'evenfooterid',
|
'page' : (Page, ['pagestyle', 'evenfooterid',
|
||||||
'oddfooterid', 'evenheaderid', 'oddheaderid']),
|
'oddfooterid', 'evenheaderid', 'oddheaderid']),
|
||||||
'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
||||||
'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
||||||
@ -167,7 +167,7 @@ class LrsParser(object):
|
|||||||
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
|
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
|
||||||
for a in ('pagestyle', 'blockstyle', 'textstyle'):
|
for a in ('pagestyle', 'blockstyle', 'textstyle'):
|
||||||
label = tag.get(a, False)
|
label = tag.get(a, False)
|
||||||
if label:
|
if label and label in self._style_labels:
|
||||||
_obj = self.parsed_objects[label] if \
|
_obj = self.parsed_objects[label] if \
|
||||||
self.parsed_objects.has_key(label) else \
|
self.parsed_objects.has_key(label) else \
|
||||||
self._style_labels[label]
|
self._style_labels[label]
|
||||||
@ -181,9 +181,9 @@ class LrsParser(object):
|
|||||||
if tag.has_key('canvaswidth'):
|
if tag.has_key('canvaswidth'):
|
||||||
args += [tag.get('canvaswidth'), tag.get('canvasheight')]
|
args += [tag.get('canvaswidth'), tag.get('canvasheight')]
|
||||||
self.parsed_objects[id] = map[tag.name][0](*args, **settings)
|
self.parsed_objects[id] = map[tag.name][0](*args, **settings)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def second_pass(self):
|
def second_pass(self):
|
||||||
map = {
|
map = {
|
||||||
'pagestyle' : (PageStyle, ['stylelabel', 'evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid']),
|
'pagestyle' : (PageStyle, ['stylelabel', 'evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid']),
|
||||||
@ -207,8 +207,8 @@ class LrsParser(object):
|
|||||||
self._style_labels[x] = self.parsed_objects[id]
|
self._style_labels[x] = self.parsed_objects[id]
|
||||||
if tag.name == 'registfont':
|
if tag.name == 'registfont':
|
||||||
self.book.append(self.parsed_objects[id])
|
self.book.append(self.parsed_objects[id])
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tag_to_string(cls, tag):
|
def tag_to_string(cls, tag):
|
||||||
'''
|
'''
|
||||||
@ -226,20 +226,20 @@ class LrsParser(object):
|
|||||||
res = cls.tag_to_string(item)
|
res = cls.tag_to_string(item)
|
||||||
if res:
|
if res:
|
||||||
strings.append(res)
|
strings.append(res)
|
||||||
return u''.join(strings)
|
return u''.join(strings)
|
||||||
|
|
||||||
def first_pass(self):
|
def first_pass(self):
|
||||||
info = self.soup.find('bbebxylog').find('bookinformation').find('info')
|
info = self.soup.find('bbebxylog').find('bookinformation').find('info')
|
||||||
bookinfo = info.find('bookinfo')
|
bookinfo = info.find('bookinfo')
|
||||||
docinfo = info.find('docinfo')
|
docinfo = info.find('docinfo')
|
||||||
|
|
||||||
def me(base, tagname):
|
def me(base, tagname):
|
||||||
tag = base.find(tagname.lower())
|
tag = base.find(tagname.lower())
|
||||||
if tag is None:
|
if tag is None:
|
||||||
return ('', '', '')
|
return ('', '', '')
|
||||||
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
|
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
title = me(bookinfo, 'Title')
|
title = me(bookinfo, 'Title')
|
||||||
author = me(bookinfo, 'Author')
|
author = me(bookinfo, 'Author')
|
||||||
publisher = me(bookinfo, 'Publisher')
|
publisher = me(bookinfo, 'Publisher')
|
||||||
@ -250,12 +250,12 @@ class LrsParser(object):
|
|||||||
creator = me(docinfo, 'Creator')[0]
|
creator = me(docinfo, 'Creator')[0]
|
||||||
producer = me(docinfo, 'Producer')[0]
|
producer = me(docinfo, 'Producer')[0]
|
||||||
bookid = me(bookinfo, 'BookID')[0]
|
bookid = me(bookinfo, 'BookID')[0]
|
||||||
|
|
||||||
sd = self.soup.find('setdefault')
|
sd = self.soup.find('setdefault')
|
||||||
sd = StyleDefault(**self.attrs_to_dict(sd, ['page_tree_id', 'rubyalignandadjust']))
|
sd = StyleDefault(**self.attrs_to_dict(sd, ['page_tree_id', 'rubyalignandadjust']))
|
||||||
bs = self.soup.find('booksetting')
|
bs = self.soup.find('booksetting')
|
||||||
bs = BookSetting(**self.attrs_to_dict(bs, []))
|
bs = BookSetting(**self.attrs_to_dict(bs, []))
|
||||||
|
|
||||||
settings = {}
|
settings = {}
|
||||||
thumbnail = self.soup.find('cthumbnail')
|
thumbnail = self.soup.find('cthumbnail')
|
||||||
if thumbnail is not None:
|
if thumbnail is not None:
|
||||||
@ -264,23 +264,23 @@ class LrsParser(object):
|
|||||||
settings['thumbnail'] = f
|
settings['thumbnail'] = f
|
||||||
else:
|
else:
|
||||||
print _('Could not read from thumbnail file:'), f
|
print _('Could not read from thumbnail file:'), f
|
||||||
|
|
||||||
self.book = Book(title=title, author=author, publisher=publisher,
|
self.book = Book(title=title, author=author, publisher=publisher,
|
||||||
category=category, classification=classification,
|
category=category, classification=classification,
|
||||||
freetext=freetext, language=language, creator=creator,
|
freetext=freetext, language=language, creator=creator,
|
||||||
producer=producer, bookid=bookid, setdefault=sd,
|
producer=producer, bookid=bookid, setdefault=sd,
|
||||||
booksetting=bs, **settings)
|
booksetting=bs, **settings)
|
||||||
|
|
||||||
for hdr in self.soup.findAll(['header', 'footer']):
|
for hdr in self.soup.findAll(['header', 'footer']):
|
||||||
elem = Header if hdr.name == 'header' else Footer
|
elem = Header if hdr.name == 'header' else Footer
|
||||||
self.parsed_objects[hdr.get('objid')] = elem(**self.attrs_to_dict(hdr))
|
self.parsed_objects[hdr.get('objid')] = elem(**self.attrs_to_dict(hdr))
|
||||||
|
|
||||||
def render(self, file, to_lrs=False):
    '''
    Write the parsed book to `file`.

    :param file: Passed through unchanged to the book's render method.
    :param to_lrs: If true, call ``self.book.renderLrs(file, 'utf-8')``;
                   otherwise call ``self.book.renderLrf(file)``.
    '''
    if not to_lrs:
        self.book.renderLrf(file)
        return
    self.book.renderLrs(file, 'utf-8')
|
||||||
|
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
parser = OptionParser(usage=_('%prog [options] file.lrs\nCompile an LRS file into an LRF file.'))
|
parser = OptionParser(usage=_('%prog [options] file.lrs\nCompile an LRS file into an LRF file.'))
|
||||||
@ -299,7 +299,7 @@ def main(args=sys.argv, logger=None):
|
|||||||
level = logging.DEBUG if opts.verbose else logging.INFO
|
level = logging.DEBUG if opts.verbose else logging.INFO
|
||||||
logger = logging.getLogger('lrs2lrf')
|
logger = logging.getLogger('lrs2lrf')
|
||||||
setup_cli_handlers(logger, level)
|
setup_cli_handlers(logger, level)
|
||||||
|
|
||||||
if len(args) != 2:
|
if len(args) != 2:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
return 1
|
return 1
|
||||||
@ -310,7 +310,7 @@ def main(args=sys.argv, logger=None):
|
|||||||
if opts.verbose:
|
if opts.verbose:
|
||||||
import warnings
|
import warnings
|
||||||
warnings.defaultaction = 'error'
|
warnings.defaultaction = 'error'
|
||||||
|
|
||||||
logger.info('Parsing LRS file...')
|
logger.info('Parsing LRS file...')
|
||||||
converter = LrsParser(open(args[1], 'rb'), logger)
|
converter = LrsParser(open(args[1], 'rb'), logger)
|
||||||
logger.info('Writing to output file...')
|
logger.info('Writing to output file...')
|
||||||
@ -320,4 +320,4 @@ def main(args=sys.argv, logger=None):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
@ -5,7 +5,19 @@ meaning as possible.
|
|||||||
|
|
||||||
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
from calibre.ebooks.unidecode.unidecoder import Unidecoder
|
||||||
from calibre import sanitize_file_name
|
from calibre import sanitize_file_name
|
||||||
|
from calibre.constants import preferred_encoding
|
||||||
udc = Unidecoder()
|
udc = Unidecoder()
|
||||||
|
|
||||||
|
def ascii_text(orig):
    '''
    Return an ASCII rendering of `orig`.

    ``udc.decode`` is tried first. If it raises, a lossy fallback is used:
    a unicode object is encoded straight to ASCII, while a byte string is
    first decoded with ``preferred_encoding``. In both cases the
    ``'replace'`` error handler is used, so the fallback itself does not
    fail on unrepresentable characters.

    :param orig: The text to convert (unicode or byte string).
    :return: The converted text.
    '''
    try:
        return udc.decode(orig)
    except Exception:
        # Deliberately best-effort: any failure in the transliterator
        # falls through to the lossy conversion below.
        if isinstance(orig, unicode):
            # Must not fall through to the byte-string branch: calling
            # .decode() on a unicode object implicitly ASCII-encodes it
            # first and raises UnicodeEncodeError on non-ASCII input.
            return orig.encode('ascii', 'replace')
        return orig.decode(preferred_encoding,
                'replace').encode('ascii', 'replace')
|
||||||
|
|
||||||
|
|
||||||
def ascii_filename(orig):
    '''
    Return a sanitized, ASCII version of the file name `orig`.

    `orig` is converted with ``ascii_text``, every ``'?'`` in the result
    is replaced by ``'_'``, and the outcome is passed through
    ``sanitize_file_name``.
    '''
    text = ascii_text(orig)
    return sanitize_file_name(text.replace('?', '_'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user