From fa0799597d686a4e870bf0a6d76a9addb8114756 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 18:35:14 -0700
Subject: [PATCH 1/7] Fix #7746 (Converting prc->epub: names of streets and lakes)

---
 src/calibre/ebooks/mobi/reader.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index f80d15359c..48ece79f45 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -504,6 +504,9 @@ class MobiReader(object):
             'x-large': '5',
             'xx-large': '6',
             }
+        def barename(x):
+            return x.rpartition(':')[-1]
+
         mobi_version = self.book_header.mobi_version
         for x in root.xpath('//ncx'):
             x.getparent().remove(x)
@@ -512,8 +515,9 @@
             for x in tag.attrib:
                 if ':' in x:
                     del tag.attrib[x]
-            if tag.tag in ('country-region', 'place', 'placetype', 'placename',
-                    'state', 'city', 'street', 'address', 'content', 'form'):
+            if tag.tag and barename(tag.tag.lower()) in \
+                    ('country-region', 'place', 'placetype', 'placename',
+                    'state', 'city', 'street', 'address', 'content', 'form'):
                 tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
                 for key in tag.attrib.keys():
                     tag.attrib.pop(key)

From 983da070950ca74aab2d9f05cda3b4143cd66322 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 18:37:34 -0700
Subject: [PATCH 2/7] Fix #7753 (setPlaceholderText not found)

---
 src/calibre/gui2/search_box.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/search_box.py b/src/calibre/gui2/search_box.py
index 4d598a3bbb..94c9bbe33d 100644
--- a/src/calibre/gui2/search_box.py
+++ b/src/calibre/gui2/search_box.py
@@ -236,7 +236,11 @@ class SavedSearchBox(QComboBox):
 
     def initialize(self, _search_box, colorize=False, help_text=_('Search')):
         self.search_box = _search_box
-        self.line_edit.setPlaceholderText(help_text)
+        try:
+            self.line_edit.setPlaceholderText(help_text)
+        except:
+            # Using Qt < 4.7
+            pass
        self.colorize = colorize
        self.clear()
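A quick aside on the fix in PATCH 1/7: the barename() helper strips any namespace prefix from a tag name, so Microsoft smart-tag elements such as 'st1:place' are caught by the same membership test as a plain 'place'. A minimal standalone sketch of that idea (the sample tag names are made up for illustration):

    # Reduce a possibly namespaced tag name to its local part
    def barename(x):
        return x.rpartition(':')[-1]

    for tag in ('st1:place', 'o:p', 'city'):
        print(barename(tag.lower()))
    # prints: place, p, city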
From ba831d21e38f0d0bad7ff0f791126d95ec2c417a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:03:01 -0700
Subject: [PATCH 3/7] Search box: Remove select all on focus event as focus
 events are fired when completion fails/succeeds. Instead select all only
 when search box is focused via the keyboard shortcut

---
 src/calibre/gui2/search_box.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/search_box.py b/src/calibre/gui2/search_box.py
index 94c9bbe33d..dabd88ff9f 100644
--- a/src/calibre/gui2/search_box.py
+++ b/src/calibre/gui2/search_box.py
@@ -28,10 +28,6 @@ class SearchLineEdit(QLineEdit):
         QLineEdit.mouseReleaseEvent(self, event)
         QLineEdit.selectAll(self)
 
-    def focusInEvent(self, event):
-        QLineEdit.focusInEvent(self, event)
-        QLineEdit.selectAll(self)
-
     def dropEvent(self, ev):
         self.parent().normalize_state()
         return QLineEdit.dropEvent(self, ev)
@@ -334,14 +330,17 @@ class SearchBoxMixin(object):
         shortcuts = QKeySequence.keyBindings(QKeySequence.Find)
         shortcuts = list(shortcuts) + [QKeySequence('/'), QKeySequence('Alt+S')]
         self.action_focus_search.setShortcuts(shortcuts)
-        self.action_focus_search.triggered.connect(lambda x:
-            self.search.setFocus(Qt.OtherFocusReason))
+        self.action_focus_search.triggered.connect(self.focus_search_box)
         self.addAction(self.action_focus_search)
         self.search.setStatusTip(re.sub(r'<\w+>', ' ', unicode(self.search.toolTip())))
         self.advanced_search_button.setStatusTip(self.advanced_search_button.toolTip())
         self.clear_button.setStatusTip(self.clear_button.toolTip())
 
+    def focus_search_box(self, *args):
+        self.search.setFocus(Qt.OtherFocusReason)
+        self.search.lineEdit().selectAll()
+
     def search_box_cleared(self):
         self.tags_view.clear()
         self.saved_search.clear()

From 4ab34dff95f00ce8d76af0873975e8648a4fc3bf Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:12:24 -0700
Subject: [PATCH 4/7] Fix #7749 (Book details panel does not always display
 correct selected book)

---
 src/calibre/gui2/book_details.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py
index 4ffc8da650..b101d4c44f 100644
--- a/src/calibre/gui2/book_details.py
+++ b/src/calibre/gui2/book_details.py
@@ -208,8 +208,9 @@ class BookInfo(QWebView):
         rows = u'\n'.join([u'%s:%s'%(k,t) for k, t in rows])
         comments = data.get(_('Comments'), '')
-        if comments and comments != u'None':
-            self.renderer.queue.put((rows, comments))
+        if not comments or comments == u'None':
+            comments = ''
+        self.renderer.queue.put((rows, comments))
 
         self._show_data(rows, '')
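The change in PATCH 4/7 is easier to see out of context: previously the renderer queue was only fed when comments were present, so selecting a book without comments could leave the panel showing the previous book. Now bogus comments are normalised to an empty string and a render job is always queued. A standalone sketch under that reading, with a plain Queue standing in for the renderer's job queue:

    from Queue import Queue  # Python 2, as in calibre at the time

    queue = Queue()

    def show_data(rows, comments):
        # Normalise missing or literal u'None' comments, but always render
        if not comments or comments == u'None':
            comments = ''
        queue.put((rows, comments))

    show_data(u'rows-markup', u'None')
    print(queue.get())
    # prints: (u'rows-markup', '')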
From c63425f0b030811981b89ce1185b84588ec5176e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:43:16 -0700
Subject: [PATCH 5/7] Fix #7686 (Updated recipes for Newsweek Polska and
 Polityka, new recipes for Esensja, Histmag and Wprost)

---
 resources/recipes/esenja.recipe          | 87 ++++++++++++++++++++++
 resources/recipes/histmag.recipe         | 59 +++++++++++++++
 resources/recipes/newsweek_polska.recipe | 53 ++++++++++----
 resources/recipes/polityka.recipe        |  7 +-
 resources/recipes/wprost.recipe          | 91 ++++++++++++++++++++++++
 5 files changed, 278 insertions(+), 19 deletions(-)
 create mode 100644 resources/recipes/esenja.recipe
 create mode 100644 resources/recipes/histmag.recipe
 create mode 100644 resources/recipes/wprost.recipe

diff --git a/resources/recipes/esenja.recipe b/resources/recipes/esenja.recipe
new file mode 100644
index 0000000000..b8b94ad66e
--- /dev/null
+++ b/resources/recipes/esenja.recipe
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Esensja(BasicNewsRecipe):
+
+    title = u'Esensja'
+    __author__ = 'matek09'
+    description = 'Monthly magazine'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    HREF = '0'
+
+    #keep_only_tags =[]
+    #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
+    remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
+    remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
+    remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
+
+    extra_css = '''
+        .t-title {font-size: x-large; font-weight: bold; text-align: left}
+        .t-author {font-size: x-small; text-align: left}
+        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+        .text {font-size: small; text-align: left}
+        .annot-ref {font-style: italic; text-align: left}
+    '''
+
+    preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
+        lambda match: '')]
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
+        a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
+        year = a['href'].split('/')[0]
+        month = a['href'].split('/')[1]
+        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
+        soup = self.index_to_soup(self.HREF + '01.html')
+        self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
+        feeds = []
+        intro = soup.find('div', attrs={'class' : 'n-title'})
+        introduction = {'title' : self.tag_to_string(intro.a),
+            'url' : self.HREF + intro.a['href'],
+            'date' : '',
+            'description' : ''}
+        chapter = 'Wprowadzenie'
+        subchapter = ''
+        articles = []
+        articles.append(introduction)
+        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
+            if tag.name in 'td':
+                if len(articles) > 0:
+                    section = chapter
+                    if len(subchapter) > 0:
+                        section += ' - ' + subchapter
+                    feeds.append((section, articles))
+                    articles = []
+                if tag['class'] == 'chapter':
+                    chapter = self.tag_to_string(tag).capitalize()
+                    subchapter = ''
+                else:
+                    subchapter = self.tag_to_string(tag)
+                    subchapter = self.tag_to_string(tag)
+                continue
+            articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
+
+            a = self.index_to_soup(self.HREF + tag.a['href'])
+            i = 1
+            while True:
+                div = a.find('div', attrs={'class' : 't-title2 nextpage'})
+                if div is not None:
+                    a = self.index_to_soup(self.HREF + div.a['href'])
+                    articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
+                    i = i + 1
+                else:
+                    break
+
+        return feeds

diff --git a/resources/recipes/histmag.recipe b/resources/recipes/histmag.recipe
new file mode 100644
index 0000000000..38956e7995
--- /dev/null
+++ b/resources/recipes/histmag.recipe
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Histmag(BasicNewsRecipe):
+
+    title = u'Histmag'
+    __author__ = 'matek09'
+    description = u"Artykuly historyczne i publicystyczne"
+    encoding = 'utf-8'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+    #max_articles_per_feed = 1
+    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
+    remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+    #keep_only_tags =[]
+    #keep_only_tags.append(dict(name = 'h2'))
+    #keep_only_tags.append(dict(name = 'p'))
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
+    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+    remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
+
+    preprocess_regexps = [(re.compile(r''), lambda match: ''),
+        (re.compile(r''), lambda match: '')]
+
+    extra_css = '''
+        .left {font-size: x-small}
+        .right {font-size: x-small}
+    '''
+
+    def find_articles(self, soup):
+        articles = []
+        for div in soup.findAll('div', attrs={'class' : 'text'}):
+            articles.append({
+                'title' : self.tag_to_string(div.h3.a),
+                'url' : 'http://www.histmag.org/' + div.h3.a['href'],
+                'date' : self.tag_to_string(div.next('p')).split('|')[0],
+                'description' : self.tag_to_string(div.next('p', podpis=False)),
+                })
+        return articles
+
+    def parse_index(self):
+        soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
+        feeds = []
+        feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
+        soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
+        feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
+        soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
+        feeds.append((u"Wydarzenia", self.find_articles(soup)))
+
+        return feeds
+
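Before the Newsweek Polska diff below, note the pagination idiom the Esensja recipe above relies on: keep following the 'nextpage' link until it disappears, emitting one continuation entry per page. A network-free sketch of that loop (the page graph is invented; next_link() stands in for finding the 't-title2 nextpage' div):

    pages = {'01.html': '02.html', '02.html': '03.html', '03.html': None}

    def next_link(page):
        return pages[page]

    entries, page, i = [], '01.html', 1
    while True:
        nxt = next_link(page)
        if nxt is None:
            break
        entries.append({'title': 'Article c. d. %d' % i, 'url': nxt})
        page, i = nxt, i + 1

    print(entries)
    # two continuation entries, for 02.html and 03.html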
diff --git a/resources/recipes/newsweek_polska.recipe b/resources/recipes/newsweek_polska.recipe
index 31dd8ccddd..4227a88026 100644
--- a/resources/recipes/newsweek_polska.recipe
+++ b/resources/recipes/newsweek_polska.recipe
@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Newsweek(BasicNewsRecipe):
-    EDITION = 0
+    FIND_LAST_FULL_ISSUE = True
+    EDITION = '0'
+    EXCLUDE_LOCKED = True
+    LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'
 
     title = u'Newsweek Polska'
-    __author__ = 'Mateusz Kielar'
+    __author__ = 'matek09'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     no_stylesheets = True
-    language = 'en'
+    language = 'pl'
     remove_javascript = True
 
     keep_only_tags =[]
@@ -33,34 +36,54 @@ class Newsweek(BasicNewsRecipe):
     def print_version(self, url):
         return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
 
+    def is_locked(self, a):
+        if a.findNext('img')['src'] == 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif':
+            return True
+        else:
+            return False
+
+    def is_full(self, issue_soup):
+        if len(issue_soup.findAll('img', attrs={'src' : 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'})) > 1:
+            return False
+        else:
+            return True
+
     def find_last_full_issue(self):
-        page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
-        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
-        page = self.index_to_soup(issue)
-        issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
-        page = self.index_to_soup(issue)
-        self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+        frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
+        while True:
+            frame_soup = self.index_to_soup(frame_url)
+            self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
+            if self.is_full(issue_soup):
+                break
+            frame_url = 'http://www.newsweek.pl/Frames/' + frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+
+
     def parse_index(self):
-        self.find_last_full_issue()
-        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+        if self.FIND_LAST_FULL_ISSUE:
+            self.find_last_full_issue()
+        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
         img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
         self.cover_url = img['src']
         feeds = []
         parent = soup.find(id='content-left-big')
         for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
-            section = self.tag_to_string(txt).capitalize()
             articles = list(self.find_articles(txt))
-            feeds.append((section, articles))
+            if len(articles) > 0:
+                section = self.tag_to_string(txt).capitalize()
+                feeds.append((section, articles))
         return feeds
 
     def find_articles(self, txt):
         for a in txt.findAllNext( attrs={'class':['strong','hr']}):
             if a.name in "div":
                 break
+            if (not self.FIND_LAST_FULL_ISSUE) & self.EXCLUDE_LOCKED & self.is_locked(a):
+                continue
             yield {
                 'title' : self.tag_to_string(a),
-                'url' : 'http://www.newsweek.pl'+a['href'],
+                'url' : 'http://www.newsweek.pl' + a['href'],
                 'date' : '',
                 'description' : ''
                 }

diff --git a/resources/recipes/polityka.recipe b/resources/recipes/polityka.recipe
index ab31e148aa..16ccae6085 100644
--- a/resources/recipes/polityka.recipe
+++ b/resources/recipes/polityka.recipe
@@ -1,18 +1,18 @@
 #!/usr/bin/env python
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Polityka(BasicNewsRecipe):
 
     title = u'Polityka'
-    __author__ = 'Mateusz Kielar'
+    __author__ = 'matek09'
     description = 'Weekly magazine. Last archive issue'
     encoding = 'utf-8'
     no_stylesheets = True
-    language = 'en'
+    language = 'pl'
     remove_javascript = True
 
     remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
@@ -48,7 +48,6 @@ class Polityka(BasicNewsRecipe):
         for div in box.findAll('div', attrs={'class': 'list_tresc'}):
             article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
             section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
-            print section
             if not articles.has_key(section):
                 articles[section] = []
             articles[section].append( {
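The Wprost recipe below applies the same locked-content logic as the Newsweek Polska changes above: find_last_full_issue() walks back through issues until it reaches one with at most a single locked-article icon. The essence of that loop, on fabricated data (the real recipe counts ico_locked.gif images in each issue page):

    issues = [{'id': '1521', 'locked_icons': 14},
              {'id': '1520', 'locked_icons': 9},
              {'id': '1519', 'locked_icons': 1}]

    def is_full(issue):
        return issue['locked_icons'] <= 1

    edition = None
    for issue in issues:  # stands in for following the frame links backwards
        if is_full(issue):
            edition = issue['id']
            break

    print(edition)
    # prints: 1519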
diff --git a/resources/recipes/wprost.recipe b/resources/recipes/wprost.recipe
new file mode 100644
index 0000000000..b317571981
--- /dev/null
+++ b/resources/recipes/wprost.recipe
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Wprost(BasicNewsRecipe):
+    EDITION = 0
+    FIND_LAST_FULL_ISSUE = True
+    EXCLUDE_LOCKED = True
+    ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+
+    title = u'Wprost'
+    __author__ = 'matek09'
+    description = 'Weekly magazine'
+    encoding = 'ISO-8859-2'
+    no_stylesheets = True
+    language = 'pl'
+    remove_javascript = True
+
+    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+    remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+
+    '''keep_only_tags =[]
+    keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+
+    preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+        (re.compile(r'display: block;'), lambda match: '')]
+
+
+    remove_tags =[]
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
+    remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
+
+
+    extra_css = '''
+        .div-header {font-size: x-small; font-weight: bold}
+    '''
+#h2 {font-size: x-large; font-weight: bold}
+    def is_blocked(self, a):
+        if a.findNextSibling('img') is None:
+            return False
+        else:
+            return True
+
+
+
+    def find_last_issue(self):
+        soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
+        a = 0
+        if self.FIND_LAST_FULL_ISSUE:
+            ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
+            a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+        else:
+            a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+        self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+        self.cover_url = a.img['src']
+
+
+
+    def parse_index(self):
+        self.find_last_issue()
+        soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
+        feeds = []
+        for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+            articles = list(self.find_articles(main_block))
+            if len(articles) > 0:
+                section = self.tag_to_string(main_block)
+                feeds.append((section, articles))
+        return feeds
+
+    def find_articles(self, main_block):
+        for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
+            if a.name in "td":
+                break
+            if self.EXCLUDE_LOCKED & self.is_blocked(a):
+                continue
+            yield {
+                'title' : self.tag_to_string(a),
+                'url' : 'http://www.wprost.pl' + a['href'],
+                'date' : '',
+                'description' : ''
+                }
+
+
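Both new recipes also share an idiom in their find_articles() generators: iterate the elements that follow a section header via findAllNext() and stop at the first sentinel element that belongs to the next section. Sketched with plain tuples instead of soup tags (the element stream is invented for illustration):

    stream = [('a', 'Article 1'), ('a', 'Article 2'),
              ('td', 'next section'), ('a', 'Article 3')]

    def find_articles(elements):
        for name, text in elements:
            if name == 'td':  # sentinel: we have left the current section
                break
            yield text

    print(list(find_articles(stream)))
    # prints: ['Article 1', 'Article 2']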
From aef657b0993aaad9647c13a6970963938e8c8268 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:53:21 -0700
Subject: [PATCH 6/7] Fix #7723 (View Specific Format Does Not Allow More Than
 One Selection)

---
 src/calibre/gui2/actions/view.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/calibre/gui2/actions/view.py b/src/calibre/gui2/actions/view.py
index 5f4f7ce428..0a26653771 100644
--- a/src/calibre/gui2/actions/view.py
+++ b/src/calibre/gui2/actions/view.py
@@ -12,7 +12,7 @@ from PyQt4.Qt import Qt, QMenu
 
 from calibre.constants import isosx
 from calibre.gui2 import error_dialog, Dispatcher, question_dialog, config, \
-    open_local_file
+    open_local_file, info_dialog
 from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
 from calibre.utils.config import prefs
 from calibre.ptempfile import PersistentTemporaryFile
@@ -89,18 +89,34 @@ class ViewAction(InterfaceAction):
             self._launch_viewer(name, viewer, internal)
 
     def view_specific_format(self, triggered):
-        rows = self.gui.library_view.selectionModel().selectedRows()
+        rows = list(self.gui.library_view.selectionModel().selectedRows())
         if not rows or len(rows) == 0:
             d = error_dialog(self.gui, _('Cannot view'), _('No book selected'))
             d.exec_()
             return
 
-        row = rows[0].row()
-        formats = self.gui.library_view.model().db.formats(row).upper().split(',')
-        d = ChooseFormatDialog(self.gui, _('Choose the format to view'), formats)
+        db = self.gui.library_view.model().db
+        rows = [r.row() for r in rows]
+        formats = [db.formats(row) for row in rows]
+        formats = [list(f.upper().split(',')) if f else None for f in formats]
+        all_fmts = set([])
+        for x in formats:
+            for f in x: all_fmts.add(f)
+        d = ChooseFormatDialog(self.gui, _('Choose the format to view'),
+                list(sorted(all_fmts)))
         if d.exec_() == d.Accepted:
-            format = d.format()
-            self.view_format(row, format)
+            fmt = d.format()
+            orig_num = len(rows)
+            rows = [rows[i] for i in range(len(rows)) if formats[i] and fmt in
+                    formats[i]]
+            if self._view_check(len(rows)):
+                for row in rows:
+                    self.view_format(row, fmt)
+                if len(rows) < orig_num:
+                    info_dialog(self.gui, _('Format unavailable'),
+                            _('Not all the selected books were available in'
+                                ' the %s format. You should convert'
+                                ' them first.')%fmt, show=True)
 
     def _view_check(self, num, max_=3):
         if num <= max_:
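The reworked view_specific_format() above first builds the union of formats across all selected rows, then filters the rows down to those that actually have the chosen format. The same logic on plain data (the per-row format lists are invented; the 'if not x' guard is an extra defensive touch, since db.formats() can return None for a book with no formats):

    formats = [['EPUB', 'MOBI'], None, ['EPUB', 'PDF']]

    all_fmts = set()
    for x in formats:
        if not x:  # a row with no formats at all
            continue
        for f in x:
            all_fmts.add(f)

    fmt = 'EPUB'
    rows = [i for i in range(len(formats)) if formats[i] and fmt in formats[i]]
    print(sorted(all_fmts))
    # prints: ['EPUB', 'MOBI', 'PDF']
    print(rows)
    # prints: [0, 2]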
From 50b082fa8f9ed5ef14530dc57b0a3a412c5944b4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:55:24 -0700
Subject: [PATCH 7/7] Fix #7704 (Updated recipe for NIN)

---
 resources/recipes/nin.recipe | 79 +++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 10 deletions(-)

diff --git a/resources/recipes/nin.recipe b/resources/recipes/nin.recipe
index 70fd998a09..27942f7d43 100644
--- a/resources/recipes/nin.recipe
+++ b/resources/recipes/nin.recipe
@@ -8,12 +8,15 @@ www.nin.co.rs
 import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
+from contextlib import nested, closing
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre import entity_to_unicode
 
 class Nin(BasicNewsRecipe):
     title = 'NIN online'
     __author__ = 'Darko Miletic'
     description = 'Nedeljne Informativne Novine'
-    publisher = 'NIN d.o.o.'
+    publisher = 'NIN d.o.o. - Ringier d.o.o.'
     category = 'news, politics, Serbia'
     no_stylesheets = True
     delay = 1
@@ -26,18 +29,29 @@ class Nin(BasicNewsRecipe):
     use_embedded_content = False
     language = 'sr'
     publication_type = 'magazine'
-    extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana, Lucida, sans1, sans-serif} .article_description{font-family: Verdana, Lucida, sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold; color: #900} .izjava{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold;} img{margin-top:0.5em; margin-bottom: 0.7em} b{margin-top: 1em} '
+    extra_css = """
+        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+        body{font-family: Verdana, Lucida, sans1, sans-serif}
+        .article_description{font-family: Verdana, Lucida, sans1, sans-serif}
+        .artTitle{font-size: x-large; font-weight: bold; color: #900}
+        .izjava{font-size: x-large; font-weight: bold}
+        .columnhead{font-size: small; font-weight: bold;}
+        img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
+        b{margin-top: 1em}
+    """
 
     conversion_options = {
-                          'comment'   : description
-                        , 'tags'      : category
-                        , 'publisher' : publisher
-                        , 'language'  : language
-                        , 'linearize_tables' : True
+                           'comment'   : description
+                         , 'tags'      : category
+                         , 'publisher' : publisher
+                         , 'language'  : language
                          }
 
-    preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
-    remove_attributes = ['height','width']
+    preprocess_regexps = [
+        (re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '')
+        ,(re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '')
+        ,(re.compile(u'\u0110'), lambda match: u'\u00D0')
+    ]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -50,7 +64,10 @@ class Nin(BasicNewsRecipe):
         return br
 
     keep_only_tags =[dict(name='td', attrs={'width':'520'})]
+    remove_tags_before =dict(name='span', attrs={'class':'izjava'})
     remove_tags_after =dict(name='html')
+    remove_tags = [dict(name=['object','link','iframe','meta','base'])]
+    remove_attributes=['border','background','height','width','align','valign']
 
     def get_cover_url(self):
         cover_url = None
@@ -63,7 +80,7 @@ class Nin(BasicNewsRecipe):
     def parse_index(self):
         articles = []
         count = 0
-        soup = self.index_to_soup(self.PREFIX)
+        soup = self.index_to_soup(self.INDEX)
         for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
             count = count +1
             if self.test and count > 2:
@@ -90,3 +107,45 @@
             articles.append((section,inarts))
         return articles
 
+    def index_to_soup(self, url_or_raw, raw=False):
+        if re.match(r'\w+://', url_or_raw):
+            open_func = getattr(self.browser, 'open_novisit', self.browser.open)
+            with closing(open_func(url_or_raw)) as f:
+                _raw = f.read()
+            if not _raw:
+                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+        else:
+            _raw = url_or_raw
+        if raw:
+            return _raw
+        if not isinstance(_raw, unicode) and self.encoding:
+            if callable(self.encoding):
+                _raw = self.encoding(_raw)
+            else:
+                _raw = _raw.decode(self.encoding, 'replace')
+        massage = list(BeautifulSoup.MARKUP_MASSAGE)
+        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
+        massage.append((re.compile(r'&(\S+?);'), lambda match:
+            entity_to_unicode(match, encoding=enc)))
+        massage.append((re.compile(r'[\x00-\x08]+'), lambda match:
+            ''))
+        return BeautifulSoup(_raw, markupMassage=massage)
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('div'):
+            if len(item.contents) == 0:
+                item.extract()
+        for item in soup.findAll(['td','tr']):
+            item.name='div'
+        for item in soup.findAll('img'):
+            if not item.has_key('alt'):
+                item['alt'] = 'image'
+        for tbl in soup.findAll('table'):
+            img = tbl.find('img')
+            if img:
+                img.extract()
+                tbl.replaceWith(img)
+        return soup
+
\ No newline at end of file
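The index_to_soup() override added above exists mainly to apply two markup-massage steps before parsing: decode HTML entities and strip stray control characters. A simplified, self-contained version of that step (unescape_entity is a stand-in for calibre's entity_to_unicode and handles only named entities; Python 2, like the recipe itself):

    import re
    from htmlentitydefs import name2codepoint  # Python 2 stdlib module

    def unescape_entity(match):
        # Stand-in for calibre's entity_to_unicode: named entities only
        name = match.group(1)
        if name in name2codepoint:
            return unichr(name2codepoint[name])
        return match.group(0)  # leave anything unrecognised alone

    raw = u'a &lt; b &amp; c\x07'
    raw = re.sub(r'&(\S+?);', unescape_entity, raw)  # decode entities
    raw = re.sub(r'[\x00-\x08]+', u'', raw)          # drop control characters
    print(raw)
    # prints: a < b & c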