Fix #1938 (Error in lrs2lrf)

This commit is contained in:
Kovid Goyal 2009-02-26 12:12:00 -08:00
parent b197a6c86f
commit 786c0c9cae
6 changed files with 141 additions and 48 deletions

View File

@ -30,12 +30,50 @@ def detect(aBuf):
# Added by Kovid # Added by Kovid
ENCODING_PATS = [ ENCODING_PATS = [
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE) re.IGNORECASE),
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>',
re.IGNORECASE)
] ]
ENTITY_PATTERN = re.compile(r'&(\S+?);') ENTITY_PATTERN = re.compile(r'&(\S+?);')
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False): def strip_encoding_declarations(raw):
for pat in ENCODING_PATS:
raw = pat.sub('', raw)
return raw
def substitute_entites(raw):
    '''
    Run every match of ENTITY_PATTERN (``&name;``) in `raw` through
    calibre's entity_to_unicode and return the resulting string.

    The names amp, apos, quot, lt and gt are handed to entity_to_unicode
    as its `exceptions` argument (presumably so that these markup-significant
    entities are left untouched -- confirm against entity_to_unicode).
    '''
    from calibre import entity_to_unicode

    # Built once per call and shared by every substitution in that call.
    untouched = ['amp', 'apos', 'quot', 'lt', 'gt']

    def _replace(match):
        return entity_to_unicode(match, exceptions=untouched)

    return ENTITY_PATTERN.sub(_replace, raw)
# Charset names (keys, lower case) that force_encoding() swaps for the
# name on the right after lower-casing the encoding it has chosen.
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
def force_encoding(raw, verbose):
    '''
    Choose an encoding name for the byte string `raw`.

    detect() is asked first. If it raises, or reports no encoding,
    calibre's preferred_encoding is used instead. The chosen name is
    lower-cased, mapped through _CHARSET_ALIASES, and 'ascii' is promoted
    to 'utf-8'.

    :param raw: The data handed to detect().
    :param verbose: If true, print a warning whenever the reported
                    detection confidence is below 1.
    :return: The lower-cased encoding name.
    '''
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw)
    except Exception:
        # Detection is best effort, but KeyboardInterrupt and SystemExit
        # must still propagate, hence no bare except.
        chardet = {'encoding': preferred_encoding, 'confidence': 0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and verbose:
        # Parenthesised single argument: prints the same text whether
        # print is a statement or a function.
        print('WARNING: Encoding detection confidence %d%%' % (
            chardet['confidence'] * 100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    # Single lookup; names without an alias are kept unchanged.
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        encoding = 'utf-8'
    return encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
resolve_entities=False):
''' '''
Force conversion of byte string to unicode. Tries to look for XML/HTML Force conversion of byte string to unicode. Tries to look for XML/HTML
encoding declaration first, if not found uses the chardet library and encoding declaration first, if not found uses the chardet library and
@ -45,44 +83,27 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
encoding = None encoding = None
if not raw: if not raw:
return u'', encoding return u'', encoding
if isinstance(raw, unicode): if not isinstance(raw, unicode):
return raw, encoding if raw.startswith('\xff\xfe'):
for pat in ENCODING_PATS: raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
match = pat.search(raw) elif raw.startswith('\xfe\xff'):
if match: raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
encoding = match.group(1) if not isinstance(raw, unicode):
break
if strip_encoding_pats:
for pat in ENCODING_PATS: for pat in ENCODING_PATS:
raw = pat.sub('', raw) match = pat.search(raw)
if encoding is None: if match:
encoding = match.group(1)
break
if encoding is None:
encoding = force_encoding(raw, verbose)
try: try:
chardet = detect(raw) raw = raw.decode(encoding, 'replace')
except: except LookupError:
chardet = {'encoding':'utf-8', 'confidence':0} raw = raw.decode('utf-8', 'replace')
encoding = chardet['encoding']
if chardet['confidence'] < 1 and verbose:
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
if not encoding:
from calibre import preferred_encoding
encoding = preferred_encoding
if encoding:
encoding = encoding.lower()
if CHARSET_ALIASES.has_key(encoding):
encoding = CHARSET_ALIASES[encoding]
if encoding == 'ascii':
encoding = 'utf-8'
try: if strip_encoding_pats:
raw = raw.decode(encoding, 'replace') raw = strip_encoding_declarations(raw)
except LookupError:
raw = raw.decode('utf-8', 'replace')
if resolve_entities: if resolve_entities:
from calibre import entity_to_unicode raw = substitute_entites(raw)
from functools import partial
f = partial(entity_to_unicode, exceptions=['amp', 'apos', 'quot', 'lt', 'gt'])
raw = ENTITY_PATTERN.sub(f, raw)
return raw, encoding return raw, encoding

View File

@ -73,7 +73,9 @@ class LrsParser(object):
return CharButton(self.parsed_objects[tag.get('refobj')], None) return CharButton(self.parsed_objects[tag.get('refobj')], None)
if tag.name == 'plot': if tag.name == 'plot':
return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj'])) return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj']))
return map[tag.name](**self.attrs_to_dict(tag)) settings = self.attrs_to_dict(tag)
settings.pop('spanstyle', '')
return map[tag.name](**settings)
def process_text_element(self, tag, elem): def process_text_element(self, tag, elem):
for item in tag.contents: for item in tag.contents:
@ -121,7 +123,8 @@ class LrsParser(object):
for tag in self.soup.findAll('page'): for tag in self.soup.findAll('page'):
page = self.parsed_objects[tag.get('objid')] page = self.parsed_objects[tag.get('objid')]
self.book.append(page) self.book.append(page)
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', 'ruledline']): for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
'ruledline', 'simpletextblock']):
if block_tag.name == 'ruledline': if block_tag.name == 'ruledline':
page.append(RuledLine(**self.attrs_to_dict(block_tag))) page.append(RuledLine(**self.attrs_to_dict(block_tag)))
else: else:
@ -134,7 +137,7 @@ class LrsParser(object):
self.book.append(jb) self.book.append(jb)
self.parsed_objects[tag.get('objid')] = jb self.parsed_objects[tag.get('objid')] = jb
for tag in self.soup.findAll('textblock'): for tag in self.soup.findAll(['textblock', 'simpletextblock']):
self.process_text_block(tag) self.process_text_block(tag)
toc = self.soup.find('toc') toc = self.soup.find('toc')
if toc: if toc:
@ -145,8 +148,10 @@ class LrsParser(object):
def third_pass(self): def third_pass(self):
map = { map = {
'page' : (Page, ['pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid']), 'page' : (Page, ['pagestyle', 'evenfooterid',
'oddfooterid', 'evenheaderid', 'oddheaderid']),
'textblock' : (TextBlock, ['textstyle', 'blockstyle']), 'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']),
'imageblock' : (ImageBlock, ['blockstyle', 'refstream']), 'imageblock' : (ImageBlock, ['blockstyle', 'refstream']),
'image' : (Image, ['refstream']), 'image' : (Image, ['refstream']),
'canvas' : (Canvas, ['canvaswidth', 'canvasheight']), 'canvas' : (Canvas, ['canvaswidth', 'canvasheight']),
@ -160,8 +165,12 @@ class LrsParser(object):
if tag.name in map.keys(): if tag.name in map.keys():
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel']) settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
for a in ('pagestyle', 'blockstyle', 'textstyle'): for a in ('pagestyle', 'blockstyle', 'textstyle'):
if tag.has_key(a): label = tag.get(a, False)
settings[attrmap[a]] = self.parsed_objects[tag.get(a)] if label:
_obj = self.parsed_objects[label] if \
self.parsed_objects.has_key(label) else \
self._style_labels[label]
settings[attrmap[a]] = _obj
for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'): for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'):
if tag.has_key(a): if tag.has_key(a):
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
@ -182,6 +191,7 @@ class LrsParser(object):
'imagestream': (ImageStream, ['imagestreamlabel']), 'imagestream': (ImageStream, ['imagestreamlabel']),
'registfont' : (Font, []) 'registfont' : (Font, [])
} }
self._style_labels = {}
for id, tag in self.objects.items(): for id, tag in self.objects.items():
if tag.name in map.keys(): if tag.name in map.keys():
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid']) settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid'])
@ -189,7 +199,11 @@ class LrsParser(object):
for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'): for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'):
if tag.has_key(a): if tag.has_key(a):
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
settings.pop('autoindex', '')
self.parsed_objects[id] = map[tag.name][0](**settings) self.parsed_objects[id] = map[tag.name][0](**settings)
x = tag.get('stylelabel', False)
if x:
self._style_labels[x] = self.parsed_objects[id]
if tag.name == 'registfont': if tag.name == 'registfont':
self.book.append(self.parsed_objects[id]) self.book.append(self.parsed_objects[id])
@ -220,6 +234,8 @@ class LrsParser(object):
def me(base, tagname): def me(base, tagname):
tag = base.find(tagname.lower()) tag = base.find(tagname.lower())
if tag is None:
return ('', '', '')
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '') tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
return tag return tag

View File

@ -255,7 +255,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
self.gui_headerformat.setDisabled(True) self.gui_headerformat.setDisabled(True)
self.gui_header_separation.setDisabled(True) self.gui_header_separation.setDisabled(True)
self.gui_use_metadata_cover.setCheckState(Qt.Checked) self.gui_use_metadata_cover.setCheckState(Qt.Checked)
self.preprocess.addItem('No preprocessing') self.preprocess.addItem(_('No preprocessing'))
for opt in self.PREPROCESS_OPTIONS: for opt in self.PREPROCESS_OPTIONS:
self.preprocess.addItem(opt.get_opt_string()[2:]) self.preprocess.addItem(opt.get_opt_string()[2:])
ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:') ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:')
@ -338,7 +338,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
cmd.append(opt) cmd.append(opt)
text = qstring_to_unicode(self.preprocess.currentText()) text = qstring_to_unicode(self.preprocess.currentText())
if text != 'No preprocessing': if text != _('No preprocessing'):
cmd.append(u'--'+text) cmd.append(u'--'+text)
cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())]) cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())])

View File

@ -19,5 +19,4 @@ class Config(_Config):
self.opt_dont_split_on_page_breaks.setVisible(False) self.opt_dont_split_on_page_breaks.setVisible(False)
self.opt_preserve_tag_structure.setVisible(False) self.opt_preserve_tag_structure.setVisible(False)
self.opt_linearize_tables.setVisible(False) self.opt_linearize_tables.setVisible(False)
self.opt_no_justification.setVisible(False)
self.page_map_box.setVisible(False) self.page_map_box.setVisible(False)

Binary file not shown.

After

Width:  |  Height:  |  Size: 455 B

View File

@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.army.mil/soldiers/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Soldiers(BasicNewsRecipe):
    '''
    News recipe for Soldiers magazine (www.army.mil/soldiers/), fed from
    the site's front page RSS feed.
    '''

    title = 'Soldiers'
    __author__ = 'Darko Miletic'
    description = 'The Official U.S. Army Magazine'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True
    simultaneous_downloads = 1
    delay = 4
    max_connections = 1
    encoding = 'utf-8'
    publisher = 'U.S. Army'
    category = 'news, politics, war, weapons'
    language = _('English')
    INDEX = 'http://www.army.mil/soldiers/'

    # Metadata options built from the attributes above.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    keep_only_tags = [dict(name='div', attrs={'id':'rightCol'})]

    remove_tags = [
        dict(name='div', attrs={'id':['addThis','comment','articleFooter']}),
        dict(name=['object','link']),
    ]

    feeds = [
        (u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml'),
    ]

    def preprocess_html(self, soup):
        '''
        Delete the ``style`` attribute from every tag in `soup` that has
        one and return the same soup object.
        '''
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

    def get_cover_url(self):
        '''
        Return the ``src`` of the image on the INDEX page whose alt text is
        'Current Magazine Cover', or None when no such image is found.
        '''
        index_soup = self.index_to_soup(self.INDEX)
        cover_img = index_soup.find(
            'img', attrs={'alt':'Current Magazine Cover'})
        if not cover_img:
            return None
        return cover_img['src']