Fix #1938 (Error in lrs2lrf)

This commit is contained in:
Kovid Goyal 2009-02-26 12:12:00 -08:00
parent b197a6c86f
commit 786c0c9cae
6 changed files with 141 additions and 48 deletions

View File

@ -30,12 +30,50 @@ def detect(aBuf):
# Added by Kovid
ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>',
re.IGNORECASE)
]
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False):
def strip_encoding_declarations(raw):
    '''
    Return `raw` with every match of each pattern in ENCODING_PATS removed.
    '''
    cleaned = raw
    for declaration_pat in ENCODING_PATS:
        # Each pattern is applied to the output of the previous one.
        cleaned = declaration_pat.sub('', cleaned)
    return cleaned
def substitute_entites(raw):
    '''
    Replace every match of ENTITY_PATTERN in `raw` with the value returned by
    calibre's entity_to_unicode for that match.

    The names amp, apos, quot, lt and gt are handed to entity_to_unicode as
    its `exceptions` argument.
    '''
    from calibre import entity_to_unicode
    from functools import partial
    exceptions = ['amp', 'apos', 'quot', 'lt', 'gt']
    converter = partial(entity_to_unicode, exceptions=exceptions)
    return ENTITY_PATTERN.sub(converter, raw)
# Encoding names that force_encoding() rewrites before returning them.
_CHARSET_ALIASES = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}


def force_encoding(raw, verbose):
    '''
    Guess the encoding of `raw` with detect().

    If detection raises, or reports no encoding, calibre's
    preferred_encoding is used instead. The chosen name is lower-cased,
    translated through _CHARSET_ALIASES and, when it is 'ascii', replaced
    by 'utf-8'.

    :param raw: The data handed to detect().
    :param verbose: If true, print a warning when the reported detection
                    confidence is below 1.
    :return: The name of the encoding to use.
    '''
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw)
    except Exception:
        # Detection is best effort, so any ordinary failure falls back to the
        # preferred encoding. Unlike a bare ``except:`` this no longer
        # swallows KeyboardInterrupt or SystemExit.
        chardet = {'encoding': preferred_encoding, 'confidence': 0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and verbose:
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print('WARNING: Encoding detection confidence %d%%' % (
            chardet['confidence'] * 100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    # One lookup instead of has_key() followed by indexing.
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # ASCII is a subset of UTF-8, so decoding as UTF-8 is never worse.
        encoding = 'utf-8'
    return encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False):
'''
Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and
@ -45,44 +83,27 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
encoding = None
if not raw:
return u'', encoding
if isinstance(raw, unicode):
return raw, encoding
if not isinstance(raw, unicode):
if raw.startswith('\xff\xfe'):
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
elif raw.startswith('\xfe\xff'):
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
if not isinstance(raw, unicode):
for pat in ENCODING_PATS:
match = pat.search(raw)
if match:
encoding = match.group(1)
break
if strip_encoding_pats:
for pat in ENCODING_PATS:
raw = pat.sub('', raw)
if encoding is None:
try:
chardet = detect(raw)
except:
chardet = {'encoding':'utf-8', 'confidence':0}
encoding = chardet['encoding']
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
if not encoding:
from calibre import preferred_encoding
encoding = preferred_encoding
if encoding:
encoding = encoding.lower()
if CHARSET_ALIASES.has_key(encoding):
encoding = CHARSET_ALIASES[encoding]
if encoding == 'ascii':
encoding = 'utf-8'
encoding = force_encoding(raw, verbose)
try:
raw = raw.decode(encoding, 'replace')
except LookupError:
raw = raw.decode('utf-8', 'replace')
if strip_encoding_pats:
raw = strip_encoding_declarations(raw)
if resolve_entities:
from calibre import entity_to_unicode
from functools import partial
f = partial(entity_to_unicode, exceptions=['amp', 'apos', 'quot', 'lt', 'gt'])
raw = ENTITY_PATTERN.sub(f, raw)
raw = substitute_entites(raw)
return raw, encoding

View File

@ -73,7 +73,9 @@ class LrsParser(object):
return CharButton(self.parsed_objects[tag.get('refobj')], None)
if tag.name == 'plot':
return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj']))
return map[tag.name](**self.attrs_to_dict(tag))
settings = self.attrs_to_dict(tag)
settings.pop('spanstyle', '')
return map[tag.name](**settings)
def process_text_element(self, tag, elem):
for item in tag.contents:
@ -121,7 +123,8 @@ class LrsParser(object):
for tag in self.soup.findAll('page'):
page = self.parsed_objects[tag.get('objid')]
self.book.append(page)
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', 'ruledline']):
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
'ruledline', 'simpletextblock']):
if block_tag.name == 'ruledline':
page.append(RuledLine(**self.attrs_to_dict(block_tag)))
else:
@ -134,7 +137,7 @@ class LrsParser(object):
self.book.append(jb)
self.parsed_objects[tag.get('objid')] = jb
for tag in self.soup.findAll('textblock'):
for tag in self.soup.findAll(['textblock', 'simpletextblock']):
self.process_text_block(tag)
toc = self.soup.find('toc')
if toc:
@ -145,8 +148,10 @@ class LrsParser(object):
def third_pass(self):
map = {
'page' : (Page, ['pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid']),
'page' : (Page, ['pagestyle', 'evenfooterid',
'oddfooterid', 'evenheaderid', 'oddheaderid']),
'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']),
'imageblock' : (ImageBlock, ['blockstyle', 'refstream']),
'image' : (Image, ['refstream']),
'canvas' : (Canvas, ['canvaswidth', 'canvasheight']),
@ -160,8 +165,12 @@ class LrsParser(object):
if tag.name in map.keys():
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
for a in ('pagestyle', 'blockstyle', 'textstyle'):
if tag.has_key(a):
settings[attrmap[a]] = self.parsed_objects[tag.get(a)]
label = tag.get(a, False)
if label:
_obj = self.parsed_objects[label] if \
self.parsed_objects.has_key(label) else \
self._style_labels[label]
settings[attrmap[a]] = _obj
for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'):
if tag.has_key(a):
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
@ -182,6 +191,7 @@ class LrsParser(object):
'imagestream': (ImageStream, ['imagestreamlabel']),
'registfont' : (Font, [])
}
self._style_labels = {}
for id, tag in self.objects.items():
if tag.name in map.keys():
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid'])
@ -189,7 +199,11 @@ class LrsParser(object):
for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'):
if tag.has_key(a):
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
settings.pop('autoindex', '')
self.parsed_objects[id] = map[tag.name][0](**settings)
x = tag.get('stylelabel', False)
if x:
self._style_labels[x] = self.parsed_objects[id]
if tag.name == 'registfont':
self.book.append(self.parsed_objects[id])
@ -220,6 +234,8 @@ class LrsParser(object):
# Find the child of ``base`` whose tag name is ``tagname`` lower-cased and
# return a tuple of (self.tag_to_string(tag), value of its 'reading'
# attribute, or '' when that attribute is absent).
def me(base, tagname):
tag = base.find(tagname.lower())
if tag is None:
# NOTE(review): this fallback is a 3-tuple while the normal return below is
# a 2-tuple -- confirm which arity the callers expect.
return ('', '', '')
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
return tag

View File

@ -255,7 +255,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
self.gui_headerformat.setDisabled(True)
self.gui_header_separation.setDisabled(True)
self.gui_use_metadata_cover.setCheckState(Qt.Checked)
self.preprocess.addItem('No preprocessing')
self.preprocess.addItem(_('No preprocessing'))
for opt in self.PREPROCESS_OPTIONS:
self.preprocess.addItem(opt.get_opt_string()[2:])
ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:')
@ -338,7 +338,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
cmd.append(opt)
text = qstring_to_unicode(self.preprocess.currentText())
if text != 'No preprocessing':
if text != _('No preprocessing'):
cmd.append(u'--'+text)
cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())])

View File

@ -19,5 +19,4 @@ class Config(_Config):
self.opt_dont_split_on_page_breaks.setVisible(False)
self.opt_preserve_tag_structure.setVisible(False)
self.opt_linearize_tables.setVisible(False)
self.opt_no_justification.setVisible(False)
self.page_map_box.setVisible(False)

Binary file not shown.

After

Width:  |  Height:  |  Size: 455 B

View File

@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.army.mil/soldiers/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Soldiers(BasicNewsRecipe):
    # Recipe for www.army.mil/soldiers/

    # Descriptive metadata
    title = 'Soldiers'
    __author__ = 'Darko Miletic'
    description = 'The Official U.S. Army Magazine'
    publisher = 'U.S. Army'
    category = 'news, politics, war, weapons'
    language = _('English')

    # Download settings
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True
    simultaneous_downloads = 1
    delay = 4
    max_connections = 1
    encoding = 'utf-8'

    # Page that get_cover_url() searches for the cover image
    INDEX = 'http://www.army.mil/soldiers/'

    # Conversion options built from the metadata above
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    # Tag filters
    keep_only_tags = [dict(name='div', attrs={'id': 'rightCol'})]

    remove_tags = [
        dict(name='div', attrs={'id': ['addThis', 'comment', 'articleFooter']}),
        dict(name=['object', 'link']),
    ]

    feeds = [
        (u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml'),
    ]

    def preprocess_html(self, soup):
        '''Delete the ``style`` attribute from every tag that has one.'''
        for styled_tag in soup.findAll(style=True):
            del styled_tag['style']
        return soup

    def get_cover_url(self):
        '''
        Return the ``src`` of the image on the INDEX page whose alt text is
        'Current Magazine Cover', or None when no such image is found.
        '''
        index_soup = self.index_to_soup(self.INDEX)
        cover_item = index_soup.find(
            'img', attrs={'alt': 'Current Magazine Cover'})
        if not cover_item:
            return None
        return cover_item['src']