diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py index 8ad41c524f..af6d724883 100644 --- a/src/calibre/ebooks/chardet/__init__.py +++ b/src/calibre/ebooks/chardet/__init__.py @@ -30,12 +30,50 @@ def detect(aBuf): # Added by Kovid ENCODING_PATS = [ - re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), - re.compile(r'', re.IGNORECASE) + re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', + re.IGNORECASE), + re.compile(r'', + re.IGNORECASE) ] ENTITY_PATTERN = re.compile(r'&(\S+?);') -def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entities=False): +def strip_encoding_declarations(raw): + for pat in ENCODING_PATS: + raw = pat.sub('', raw) + return raw + +def substitute_entites(raw): + from calibre import entity_to_unicode + from functools import partial + f = partial(entity_to_unicode, exceptions= + ['amp', 'apos', 'quot', 'lt', 'gt']) + return ENTITY_PATTERN.sub(f, raw) + +_CHARSET_ALIASES = { "macintosh" : "mac-roman", + "x-sjis" : "shift-jis" } + + +def force_encoding(raw, verbose): + from calibre.constants import preferred_encoding + try: + chardet = detect(raw) + except: + chardet = {'encoding':preferred_encoding, 'confidence':0} + encoding = chardet['encoding'] + if chardet['confidence'] < 1 and verbose: + print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) + if not encoding: + encoding = preferred_encoding + encoding = encoding.lower() + if _CHARSET_ALIASES.has_key(encoding): + encoding = _CHARSET_ALIASES[encoding] + if encoding == 'ascii': + encoding = 'utf-8' + return encoding + + +def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, + resolve_entities=False): ''' Force conversion of byte string to unicode. Tries to look for XML/HTML encoding declaration first, if not found uses the chardet library and @@ -45,44 +83,27 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, resolve_entiti encoding = None if not raw: return u'', encoding - if isinstance(raw, unicode): - return raw, encoding - for pat in ENCODING_PATS: - match = pat.search(raw) - if match: - encoding = match.group(1) - break - if strip_encoding_pats: + if not isinstance(raw, unicode): + if raw.startswith('\xff\xfe'): + raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le' + elif raw.startswith('\xfe\xff'): + raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be' + if not isinstance(raw, unicode): for pat in ENCODING_PATS: - raw = pat.sub('', raw) - if encoding is None: + match = pat.search(raw) + if match: + encoding = match.group(1) + break + if encoding is None: + encoding = force_encoding(raw, verbose) try: - chardet = detect(raw) - except: - chardet = {'encoding':'utf-8', 'confidence':0} - encoding = chardet['encoding'] - if chardet['confidence'] < 1 and verbose: - print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100) - CHARSET_ALIASES = { "macintosh" : "mac-roman", - "x-sjis" : "shift-jis" } - if not encoding: - from calibre import preferred_encoding - encoding = preferred_encoding - if encoding: - encoding = encoding.lower() - if CHARSET_ALIASES.has_key(encoding): - encoding = CHARSET_ALIASES[encoding] - if encoding == 'ascii': - encoding = 'utf-8' + raw = raw.decode(encoding, 'replace') + except LookupError: + raw = raw.decode('utf-8', 'replace') - try: - raw = raw.decode(encoding, 'replace') - except LookupError: - raw = raw.decode('utf-8', 'replace') + if strip_encoding_pats: + raw = strip_encoding_declarations(raw) if resolve_entities: - from calibre import entity_to_unicode - from functools import partial - f = partial(entity_to_unicode, exceptions=['amp', 'apos', 'quot', 'lt', 'gt']) - raw = ENTITY_PATTERN.sub(f, raw) - + raw = substitute_entites(raw) + return raw, encoding diff --git a/src/calibre/ebooks/lrf/lrs/convert_from.py b/src/calibre/ebooks/lrf/lrs/convert_from.py index 89a0eb5d44..495d9adb50 100644 --- a/src/calibre/ebooks/lrf/lrs/convert_from.py +++ b/src/calibre/ebooks/lrf/lrs/convert_from.py @@ -73,7 +73,9 @@ class LrsParser(object): return CharButton(self.parsed_objects[tag.get('refobj')], None) if tag.name == 'plot': return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj'])) - return map[tag.name](**self.attrs_to_dict(tag)) + settings = self.attrs_to_dict(tag) + settings.pop('spanstyle', '') + return map[tag.name](**settings) def process_text_element(self, tag, elem): for item in tag.contents: @@ -121,7 +123,8 @@ class LrsParser(object): for tag in self.soup.findAll('page'): page = self.parsed_objects[tag.get('objid')] self.book.append(page) - for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', 'ruledline']): + for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', + 'ruledline', 'simpletextblock']): if block_tag.name == 'ruledline': page.append(RuledLine(**self.attrs_to_dict(block_tag))) else: @@ -134,7 +137,7 @@ class LrsParser(object): self.book.append(jb) self.parsed_objects[tag.get('objid')] = jb - for tag in self.soup.findAll('textblock'): + for tag in self.soup.findAll(['textblock', 'simpletextblock']): self.process_text_block(tag) toc = self.soup.find('toc') if toc: @@ -145,8 +148,10 @@ class LrsParser(object): def third_pass(self): map = { - 'page' : (Page, ['pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid']), + 'page' : (Page, ['pagestyle', 'evenfooterid', + 'oddfooterid', 'evenheaderid', 'oddheaderid']), 'textblock' : (TextBlock, ['textstyle', 'blockstyle']), + 'simpletextblock' : (TextBlock, ['textstyle', 'blockstyle']), 'imageblock' : (ImageBlock, ['blockstyle', 'refstream']), 'image' : (Image, ['refstream']), 'canvas' : (Canvas, ['canvaswidth', 'canvasheight']), @@ -160,8 +165,12 @@ class LrsParser(object): if tag.name in map.keys(): settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel']) for a in ('pagestyle', 'blockstyle', 'textstyle'): - if tag.has_key(a): - settings[attrmap[a]] = self.parsed_objects[tag.get(a)] + label = tag.get(a, False) + if label: + _obj = self.parsed_objects[label] if \ + self.parsed_objects.has_key(label) else \ + self._style_labels[label] + settings[attrmap[a]] = _obj for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'): if tag.has_key(a): settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] @@ -182,6 +191,7 @@ class LrsParser(object): 'imagestream': (ImageStream, ['imagestreamlabel']), 'registfont' : (Font, []) } + self._style_labels = {} for id, tag in self.objects.items(): if tag.name in map.keys(): settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid']) @@ -189,7 +199,11 @@ class LrsParser(object): for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'): if tag.has_key(a): settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] + settings.pop('autoindex', '') self.parsed_objects[id] = map[tag.name][0](**settings) + x = tag.get('stylelabel', False) + if x: + self._style_labels[x] = self.parsed_objects[id] if tag.name == 'registfont': self.book.append(self.parsed_objects[id]) @@ -220,6 +234,8 @@ class LrsParser(object): def me(base, tagname): tag = base.find(tagname.lower()) + if tag is None: + return ('', '', '') tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '') return tag diff --git a/src/calibre/gui2/dialogs/lrf_single.py b/src/calibre/gui2/dialogs/lrf_single.py index 9083d3e4df..fdcf908d1d 100644 --- a/src/calibre/gui2/dialogs/lrf_single.py +++ b/src/calibre/gui2/dialogs/lrf_single.py @@ -255,7 +255,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog): self.gui_headerformat.setDisabled(True) self.gui_header_separation.setDisabled(True) self.gui_use_metadata_cover.setCheckState(Qt.Checked) - self.preprocess.addItem('No preprocessing') + self.preprocess.addItem(_('No preprocessing')) for opt in self.PREPROCESS_OPTIONS: self.preprocess.addItem(opt.get_opt_string()[2:]) ph = _('Preprocess the file before converting to LRF. This is useful if you know that the file is from a specific source. Known sources:') @@ -338,7 +338,7 @@ class LRFSingleDialog(QDialog, Ui_LRFSingleDialog): cmd.append(opt) text = qstring_to_unicode(self.preprocess.currentText()) - if text != 'No preprocessing': + if text != _('No preprocessing'): cmd.append(u'--'+text) cmd.extend([u'--profile', qstring_to_unicode(self.gui_profile.currentText())]) diff --git a/src/calibre/gui2/dialogs/mobi.py b/src/calibre/gui2/dialogs/mobi.py index 7d0324e0f4..b9cff08200 100644 --- a/src/calibre/gui2/dialogs/mobi.py +++ b/src/calibre/gui2/dialogs/mobi.py @@ -19,5 +19,4 @@ class Config(_Config): self.opt_dont_split_on_page_breaks.setVisible(False) self.opt_preserve_tag_structure.setVisible(False) self.opt_linearize_tables.setVisible(False) - self.opt_no_justification.setVisible(False) self.page_map_box.setVisible(False) \ No newline at end of file diff --git a/src/calibre/gui2/images/news/soldiers.png b/src/calibre/gui2/images/news/soldiers.png new file mode 100644 index 0000000000..df04f108e6 Binary files /dev/null and b/src/calibre/gui2/images/news/soldiers.png differ diff --git a/src/calibre/web/feeds/recipes/recipe_soldiers.py b/src/calibre/web/feeds/recipes/recipe_soldiers.py new file mode 100644 index 0000000000..dfaa070928 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_soldiers.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +www.army.mil/soldiers/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Soldiers(BasicNewsRecipe): + title = 'Soldiers' + __author__ = 'Darko Miletic' + description = 'The Official U.S. Army Magazine' + oldest_article = 30 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + remove_javascript = True + simultaneous_downloads = 1 + delay = 4 + max_connections = 1 + encoding = 'utf-8' + publisher = 'U.S. Army' + category = 'news, politics, war, weapons' + language = _('English') + INDEX = 'http://www.army.mil/soldiers/' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + keep_only_tags = [dict(name='div', attrs={'id':'rightCol'})] + + remove_tags = [ + dict(name='div', attrs={'id':['addThis','comment','articleFooter']}) + ,dict(name=['object','link']) + ] + + feeds = [(u'Frontpage', u'http://www.army.mil/rss/feeds/soldiersfrontpage.xml' )] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + def get_cover_url(self): + cover_url = None + soup = self.index_to_soup(self.INDEX) + cover_item = soup.find('img',attrs={'alt':'Current Magazine Cover'}) + if cover_item: + cover_url = cover_item['src'] + return cover_url