mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #1938 (Error in lrs2lrf)
This commit is contained in:
parent
b197a6c86f
commit
786c0c9cae
@ -30,12 +30,50 @@ def detect(aBuf):
|
|||||||
|
|
||||||
# Added by Kovid
|
# Added by Kovid
|
||||||
ENCODING_PATS = [
|
ENCODING_PATS = [
|
||||||
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>',
|
||||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)
|
re.IGNORECASE),
|
||||||
|
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>',
|
||||||
|
re.IGNORECASE)
|
||||||
]
|
]
|
||||||
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||||
|
|
||||||
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False):
|
def strip_encoding_declarations(raw):
    '''
    Return `raw` with every match of the patterns in ENCODING_PATS removed.

    The patterns are applied one after another, each on the output of the
    previous one, so all matching encoding declarations are deleted.
    '''
    cleaned = raw
    for declaration in ENCODING_PATS:
        # Replace each match with the empty string.
        cleaned = declaration.sub('', cleaned)
    return cleaned
|
||||||
|
|
||||||
|
def substitute_entites(raw):
    '''
    Run every match of ENTITY_PATTERN (``&name;``) in `raw` through
    calibre's entity_to_unicode and return the resulting text.

    The names listed in `untouched` are passed as the `exceptions` argument;
    presumably entity_to_unicode leaves those entities as they are so the
    markup stays well formed -- confirm against calibre.entity_to_unicode.
    '''
    # Imported lazily, as the rest of this module does for calibre imports.
    from calibre import entity_to_unicode

    # One shared list for all matches, exactly as a partial() would hold it.
    untouched = ['amp', 'apos', 'quot', 'lt', 'gt']

    def replace(match):
        return entity_to_unicode(match, exceptions=untouched)

    return ENTITY_PATTERN.sub(replace, raw)
|
||||||
|
|
||||||
|
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
||||||
|
"x-sjis" : "shift-jis" }
|
||||||
|
|
||||||
|
|
||||||
|
def force_encoding(raw, verbose):
    '''
    Guess the encoding of the byte string `raw` using detect().

    :param raw: The data whose encoding is to be guessed.
    :param verbose: If True, print a warning when the reported detection
                    confidence is below 1.
    :return: A lower-cased encoding name. Falls back to calibre's
             preferred_encoding when detection fails or reports no
             encoding, maps names through _CHARSET_ALIASES and reports
             'ascii' as 'utf-8'.
    '''
    from calibre.constants import preferred_encoding
    try:
        chardet = detect(raw)
    except Exception:
        # Detection is best effort, so any ordinary failure falls back to the
        # preferred encoding. KeyboardInterrupt/SystemExit are deliberately
        # not swallowed (a bare except would trap them as well).
        chardet = {'encoding': preferred_encoding, 'confidence': 0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and verbose:
        # Single parenthesised argument: valid as both a Python 2 print
        # statement and a Python 3 print() call, with identical output.
        print('WARNING: Encoding detection confidence %d%%' % (
            chardet['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    # Translate names Python's codec registry does not know; unknown names
    # pass through unchanged.
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # ASCII is a strict subset of UTF-8, so decoding as UTF-8 is at
        # least as permissive.
        encoding = 'utf-8'
    return encoding
|
||||||
|
|
||||||
|
|
||||||
|
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
|
||||||
|
resolve_entities=False):
|
||||||
'''
|
'''
|
||||||
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
Force conversion of byte string to unicode. Tries to look for XML/HTML
|
||||||
encoding declaration first, if not found uses the chardet library and
|
encoding declaration first, if not found uses the chardet library and
|
||||||
@ -45,44 +83,27 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti
|
|||||||
encoding = None
|
encoding = None
|
||||||
if not raw:
|
if not raw:
|
||||||
return u'', encoding
|
return u'', encoding
|
||||||
if isinstance(raw, unicode):
|
if not isinstance(raw, unicode):
|
||||||
return raw, encoding
|
if raw.startswith('\xff\xfe'):
|
||||||
for pat in ENCODING_PATS:
|
raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
|
||||||
match = pat.search(raw)
|
elif raw.startswith('\xfe\xff'):
|
||||||
if match:
|
raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
|
||||||
encoding = match.group(1)
|
if not isinstance(raw, unicode):
|
||||||
break
|
|
||||||
if strip_encoding_pats:
|
|
||||||
for pat in ENCODING_PATS:
|
for pat in ENCODING_PATS:
|
||||||
raw = pat.sub('', raw)
|
match = pat.search(raw)
|
||||||
if encoding is None:
|
if match:
|
||||||
|
encoding = match.group(1)
|
||||||
|
break
|
||||||
|
if encoding is None:
|
||||||
|
encoding = force_encoding(raw, verbose)
|
||||||
try:
|
try:
|
||||||
chardet = detect(raw)
|
raw = raw.decode(encoding, 'replace')
|
||||||
except:
|
except LookupError:
|
||||||
chardet = {'encoding':'utf-8', 'confidence':0}
|
raw = raw.decode('utf-8', 'replace')
|
||||||
encoding = chardet['encoding']
|
|
||||||
if chardet['confidence'] < 1 and verbose:
|
|
||||||
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
|
|
||||||
CHARSET_ALIASES = { "macintosh" : "mac-roman",
|
|
||||||
"x-sjis" : "shift-jis" }
|
|
||||||
if not encoding:
|
|
||||||
from calibre import preferred_encoding
|
|
||||||
encoding = preferred_encoding
|
|
||||||
if encoding:
|
|
||||||
encoding = encoding.lower()
|
|
||||||
if CHARSET_ALIASES.has_key(encoding):
|
|
||||||
encoding = CHARSET_ALIASES[encoding]
|
|
||||||
if encoding == 'ascii':
|
|
||||||
encoding = 'utf-8'
|
|
||||||
|
|
||||||
try:
|
if strip_encoding_pats:
|
||||||
raw = raw.decode(encoding, 'replace')
|
raw = strip_encoding_declarations(raw)
|
||||||
except LookupError:
|
|
||||||
raw = raw.decode('utf-8', 'replace')
|
|
||||||
if resolve_entities:
|
if resolve_entities:
|
||||||
from calibre import entity_to_unicode
|
raw = substitute_entites(raw)
|
||||||
from functools import partial
|
|
||||||
f = partial(entity_to_unicode, exceptions=['amp', 'apos', 'quot', 'lt', 'gt'])
|
|
||||||
raw = ENTITY_PATTERN.sub(f, raw)
|
|
||||||
|
|
||||||
return raw, encoding
|
return raw, encoding
|
||||||
|
@ -73,7 +73,9 @@ class LrsParser(object):
|
|||||||
return CharButton(self.parsed_objects[tag.get('refobj')], None)
|
return CharButton(self.parsed_objects[tag.get('refobj')], None)
|
||||||
if tag.name == 'plot':
|
if tag.name == 'plot':
|
||||||
return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj']))
|
return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj']))
|
||||||
return map[tag.name](**self.attrs_to_dict(tag))
|
settings = self.attrs_to_dict(tag)
|
||||||
|
settings.pop('spanstyle', '')
|
||||||
|
return map[tag.name](**settings)
|
||||||
|
|
||||||
def process_text_element(self, tag, elem):
|
def process_text_element(self, tag, elem):
|
||||||
for item in tag.contents:
|
for item in tag.contents:
|
||||||
@ -121,7 +123,8 @@ class LrsParser(object):
|
|||||||
for tag in self.soup.findAll('page'):
|
for tag in self.soup.findAll('page'):
|
||||||
page = self.parsed_objects[tag.get('objid')]
|
page = self.parsed_objects[tag.get('objid')]
|
||||||
self.book.append(page)
|
self.book.append(page)
|
||||||
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', 'ruledline']):
|
for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
|
||||||
|
'ruledline', 'simpletextblock']):
|
||||||
if block_tag.name == 'ruledline':
|
if block_tag.name == 'ruledline':
|
||||||
page.append(RuledLine(**self.attrs_to_dict(block_tag)))
|
page.append(RuledLine(**self.attrs_to_dict(block_tag)))
|
||||||
else:
|
else:
|
||||||
@ -134,7 +137,7 @@ class LrsParser(object):
|
|||||||
self.book.append(jb)
|
self.book.append(jb)
|
||||||
self.parsed_objects[tag.get('objid')] = jb
|
self.parsed_objects[tag.get('objid')] = jb
|
||||||
|
|
||||||
for tag in self.soup.findAll('textblock'):
|
for tag in self.soup.findAll(['textblock', 'simpletextblock']):
|
||||||
self.process_text_block(tag)
|
self.process_text_block(tag)
|
||||||
toc = self.soup.find('toc')
|
toc = self.soup.find('toc')
|
||||||
if toc:
|
if toc:
|
||||||
@ -145,8 +148,10 @@ class LrsParser(object):
|
|||||||
|
|
||||||
def third_pass(self):
|
def third_pass(self):
|
||||||
map = {
|
map = {
|
||||||
'page' : (Page, ['pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid']),
|
'page' : (Page, ['pagestyle', 'evenfooterid',
|
||||||
|
'oddfooterid', 'evenheaderid', 'oddheaderid']),
|
||||||
'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
'textblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
||||||
|
'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']),
|
||||||
'imageblock' : (ImageBlock, ['blockstyle', 'refstream']),
|
'imageblock' : (ImageBlock, ['blockstyle', 'refstream']),
|
||||||
'image' : (Image, ['refstream']),
|
'image' : (Image, ['refstream']),
|
||||||
'canvas' : (Canvas, ['canvaswidth', 'canvasheight']),
|
'canvas' : (Canvas, ['canvaswidth', 'canvasheight']),
|
||||||
@ -160,8 +165,12 @@ class LrsParser(object):
|
|||||||
if tag.name in map.keys():
|
if tag.name in map.keys():
|
||||||
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
|
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
|
||||||
for a in ('pagestyle', 'blockstyle', 'textstyle'):
|
for a in ('pagestyle', 'blockstyle', 'textstyle'):
|
||||||
if tag.has_key(a):
|
label = tag.get(a, False)
|
||||||
settings[attrmap[a]] = self.parsed_objects[tag.get(a)]
|
if label:
|
||||||
|
_obj = self.parsed_objects[label] if \
|
||||||
|
self.parsed_objects.has_key(label) else \
|
||||||
|
self._style_labels[label]
|
||||||
|
settings[attrmap[a]] = _obj
|
||||||
for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'):
|
for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'):
|
||||||
if tag.has_key(a):
|
if tag.has_key(a):
|
||||||
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
|
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
|
||||||
@ -182,6 +191,7 @@ class LrsParser(object):
|
|||||||
'imagestream': (ImageStream, ['imagestreamlabel']),
|
'imagestream': (ImageStream, ['imagestreamlabel']),
|
||||||
'registfont' : (Font, [])
|
'registfont' : (Font, [])
|
||||||
}
|
}
|
||||||
|
self._style_labels = {}
|
||||||
for id, tag in self.objects.items():
|
for id, tag in self.objects.items():
|
||||||
if tag.name in map.keys():
|
if tag.name in map.keys():
|
||||||
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid'])
|
settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid'])
|
||||||
@ -189,7 +199,11 @@ class LrsParser(object):
|
|||||||
for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'):
|
for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'):
|
||||||
if tag.has_key(a):
|
if tag.has_key(a):
|
||||||
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
|
settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
|
||||||
|
settings.pop('autoindex', '')
|
||||||
self.parsed_objects[id] = map[tag.name][0](**settings)
|
self.parsed_objects[id] = map[tag.name][0](**settings)
|
||||||
|
x = tag.get('stylelabel', False)
|
||||||
|
if x:
|
||||||
|
self._style_labels[x] = self.parsed_objects[id]
|
||||||
if tag.name == 'registfont':
|
if tag.name == 'registfont':
|
||||||
self.book.append(self.parsed_objects[id])
|
self.book.append(self.parsed_objects[id])
|
||||||
|
|
||||||
@ -220,6 +234,8 @@ class LrsParser(object):
|
|||||||
|
|
||||||
def me(base, tagname):
|
def me(base, tagname):
|
||||||
tag = base.find(tagname.lower())
|
tag = base.find(tagname.lower())
|
||||||
|
if tag is None:
|
||||||
|
return ('', '', '')
|
||||||
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
|
tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
|
@ -255,7 +255,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
|
|||||||
self.gui_headerformat.setDisabled(True)
|
self.gui_headerformat.setDisabled(True)
|
||||||
self.gui_header_separation.setDisabled(True)
|
self.gui_header_separation.setDisabled(True)
|
||||||
self.gui_use_metadata_cover.setCheckState(Qt.Checked)
|
self.gui_use_metadata_cover.setCheckState(Qt.Checked)
|
||||||
self.preprocess.addItem('No preprocessing')
|
self.preprocess.addItem(_('No preprocessing'))
|
||||||
for opt in self.PREPROCESS_OPTIONS:
|
for opt in self.PREPROCESS_OPTIONS:
|
||||||
self.preprocess.addItem(opt.get_opt_string()[2:])
|
self.preprocess.addItem(opt.get_opt_string()[2:])
|
||||||
ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:')
|
ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:')
|
||||||
@ -338,7 +338,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog):
|
|||||||
cmd.append(opt)
|
cmd.append(opt)
|
||||||
|
|
||||||
text = qstring_to_unicode(self.preprocess.currentText())
|
text = qstring_to_unicode(self.preprocess.currentText())
|
||||||
if text != 'No preprocessing':
|
if text != _('No preprocessing'):
|
||||||
cmd.append(u'--'+text)
|
cmd.append(u'--'+text)
|
||||||
cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())])
|
cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())])
|
||||||
|
|
||||||
|
@ -19,5 +19,4 @@ class Config(_Config):
|
|||||||
self.opt_dont_split_on_page_breaks.setVisible(False)
|
self.opt_dont_split_on_page_breaks.setVisible(False)
|
||||||
self.opt_preserve_tag_structure.setVisible(False)
|
self.opt_preserve_tag_structure.setVisible(False)
|
||||||
self.opt_linearize_tables.setVisible(False)
|
self.opt_linearize_tables.setVisible(False)
|
||||||
self.opt_no_justification.setVisible(False)
|
|
||||||
self.page_map_box.setVisible(False)
|
self.page_map_box.setVisible(False)
|
BIN
src/calibre/gui2/images/news/soldiers.png
Normal file
BIN
src/calibre/gui2/images/news/soldiers.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 455 B |
57
src/calibre/web/feeds/recipes/recipe_soldiers.py
Normal file
57
src/calibre/web/feeds/recipes/recipe_soldiers.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
||||||
|
'''
|
||||||
|
www.army.mil/soldiers/
|
||||||
|
'''
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Soldiers(BasicNewsRecipe):
    # Recipe for Soldiers magazine (www.army.mil/soldiers/), driven by the
    # front-page RSS feed listed in `feeds`.

    title = 'Soldiers'
    __author__ = 'Darko Miletic'
    description = 'The Official U.S. Army Magazine'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True
    # Fetch serially with a pause between requests.
    simultaneous_downloads = 1
    delay = 4
    max_connections = 1
    encoding = 'utf-8'
    publisher = 'U.S. Army'
    category = 'news, politics, war, weapons'
    language = _('English')
    INDEX = 'http://www.army.mil/soldiers/'

    # Metadata handed to the LRF and EPUB converters.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = 'publisher="%s"\ncomments="%s"\ntags="%s"' % (
        publisher, description, category)

    # Keep only the right-hand column of the page.
    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'rightCol'}}]

    remove_tags = [
        {'name': 'div',
         'attrs': {'id': ['addThis', 'comment', 'articleFooter']}},
        {'name': ['object', 'link']},
    ]

    feeds = [
        (u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml'),
    ]

    def preprocess_html(self, soup):
        # Drop every inline style attribute from the parsed page.
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

    def get_cover_url(self):
        # Look up the cover image on the index page by its alt text; return
        # None when no such image is found.
        index_soup = self.index_to_soup(self.INDEX)
        cover_item = index_soup.find(
            'img', attrs={'alt': 'Current Magazine Cover'})
        if cover_item:
            return cover_item['src']
        return None
|
Loading…
x
Reference in New Issue
Block a user