From a4649e5e31cd5da2421b6d6184da53adbd7c8d23 Mon Sep 17 00:00:00 2001
From: GRiker
Date: Tue, 30 Nov 2010 15:42:40 -0700
Subject: [PATCH 01/28] GwR fixes for read tag code
---
src/calibre/gui2/catalog/catalog_epub_mobi.py | 1 -
src/calibre/library/catalog.py | 10 +---------
2 files changed, 1 insertion(+), 10 deletions(-)
diff --git a/src/calibre/gui2/catalog/catalog_epub_mobi.py b/src/calibre/gui2/catalog/catalog_epub_mobi.py
index 7ac6010896..4a330900b1 100644
--- a/src/calibre/gui2/catalog/catalog_epub_mobi.py
+++ b/src/calibre/gui2/catalog/catalog_epub_mobi.py
@@ -82,7 +82,6 @@ class PluginWidget(QWidget,Ui_Form):
elif opt[0] in ['read_source_field_cb']:
# Look for last-stored combo box value
index = self.read_source_field_cb.findText(opt_value)
- print "last index: %d" % index
if index == -1:
index = self.read_source_field_cb.findText('Tag')
self.read_source_field_cb.setCurrentIndex(index)
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index b7dbd3c35f..631e635937 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -2676,14 +2676,7 @@ class EPUB_MOBI(CatalogPlugin):
pBookTag = Tag(soup, "p")
ptc = 0
- # THIS SHOULDN'T BE NECESSARY
- # book with read/reading/unread symbol
-# for tag in book['tags']:
-# if tag == self.opts.read_tag:
-# book['read'] = True
-# break
-# else:
-# book['read'] = False
+ book['read'] = self.discoverReadStatus(book)
# book with read|reading|unread symbol or wishlist item
if self.opts.wishlist_tag in book.get('tags', []):
@@ -4552,7 +4545,6 @@ class EPUB_MOBI(CatalogPlugin):
markerTags = []
markerTags.extend(self.opts.exclude_tags.split(','))
markerTags.extend(self.opts.note_tag.split(','))
- markerTags.extend(self.opts.read_tag.split(','))
return markerTags
def letter_or_symbol(self,char):
From acac5a479b925bc8b981b42e395b8fa9279517b0 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Tue, 30 Nov 2010 18:27:09 -0500
Subject: [PATCH 02/28] Implement bug #7738: FB2 Output option to create
section per HTML file. Implement FB2 options to wrap h1-3 tags with title
elements.
---
src/calibre/ebooks/fb2/fb2ml.py | 14 +++++++++++-
src/calibre/ebooks/fb2/output.py | 14 ++++++++++++
src/calibre/gui2/convert/fb2_output.py | 4 +++-
src/calibre/gui2/convert/fb2_output.ui | 30 +++++++++++++++++++++++++-
4 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py
index 3987ffa1b8..2a9a92612e 100644
--- a/src/calibre/ebooks/fb2/fb2ml.py
+++ b/src/calibre/ebooks/fb2/fb2ml.py
@@ -91,6 +91,10 @@ class FB2MLizer(object):
return u'\n%s' % etree.tostring(etree.fromstring(output), encoding=unicode, pretty_print=True)
def clean_text(self, text):
+ text = re.sub(r'(?miu)', '', text)
+ text = re.sub(r'(?miu)\s+', '', text)
+ text = re.sub(r'(?miu)\n\n', text)
+
text = re.sub(r'(?miu)\s*
', '', text)
text = re.sub(r'(?miu)\s+
', '', text)
text = re.sub(r'(?miu)', '
\n\n', text)
@@ -166,11 +170,15 @@ class FB2MLizer(object):
def get_text(self):
text = []
- for item in self.oeb_book.spine:
+ for i, item in enumerate(self.oeb_book.spine):
+ if self.opts.sectionize_chapters_using_file_structure and i is not 0:
+ text.append('')
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append(self.add_page_anchor(item))
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
+ if self.opts.sectionize_chapters_using_file_structure and i is not len(self.oeb_book.spine) - 1:
+ text.append('')
return ''.join(text)
def fb2_body_footer(self):
@@ -258,6 +266,10 @@ class FB2MLizer(object):
if id_name:
fb2_text.append(self.get_anchor(page, id_name))
+ if tag == 'h1' and self.opts.h1_to_title or tag == 'h2' and self.opts.h2_to_title or tag == 'h3' and self.opts.h3_to_title:
+ fb2_text.append('
')
+ tags.append('title')
+
fb2_tag = TAG_MAP.get(tag, None)
if fb2_tag == 'p':
if 'p' in tag_stack+tags:
diff --git a/src/calibre/ebooks/fb2/output.py b/src/calibre/ebooks/fb2/output.py
index d6c7a25a90..bacaf0da91 100644
--- a/src/calibre/ebooks/fb2/output.py
+++ b/src/calibre/ebooks/fb2/output.py
@@ -25,6 +25,20 @@ class FB2Output(OutputFormatPlugin):
'WARNING: ' \
'This option is experimental. It can cause conversion ' \
'to fail. It can also produce unexpected output.')),
+ OptionRecommendation(name='sectionize_chapters_using_file_structure',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Try to turn chapters into individual sections using the ' \
+ 'internal structure of the ebook. This works well for EPUB ' \
+ 'books that have been internally split by chapter.')),
+ OptionRecommendation(name='h1_to_title',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Wrap all h1 tags with fb2 title elements.')),
+ OptionRecommendation(name='h2_to_title',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Wrap all h2 tags with fb2 title elements.')),
+ OptionRecommendation(name='h3_to_title',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Wrap all h3 tags with fb2 title elements.')),
])
def convert(self, oeb_book, output_path, input_plugin, opts, log):
diff --git a/src/calibre/gui2/convert/fb2_output.py b/src/calibre/gui2/convert/fb2_output.py
index a3cbe0e647..5d927146a5 100644
--- a/src/calibre/gui2/convert/fb2_output.py
+++ b/src/calibre/gui2/convert/fb2_output.py
@@ -17,6 +17,8 @@ class PluginWidget(Widget, Ui_Form):
ICON = I('mimetypes/fb2.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
- Widget.__init__(self, parent, ['inline_toc', 'sectionize_chapters'])
+ Widget.__init__(self, parent, ['inline_toc', 'sectionize_chapters',
+ 'sectionize_chapters_using_file_structure', 'h1_to_title',
+ 'h2_to_title', 'h3_to_title'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
diff --git a/src/calibre/gui2/convert/fb2_output.ui b/src/calibre/gui2/convert/fb2_output.ui
index a43a8b72ea..a90ecd615e 100644
--- a/src/calibre/gui2/convert/fb2_output.ui
+++ b/src/calibre/gui2/convert/fb2_output.ui
@@ -14,7 +14,7 @@
Form
- -
+
-
Qt::Vertical
@@ -41,6 +41,34 @@
+ -
+
+
+ Sectionize Chapters using file structure
+
+
+
+ -
+
+
+ Wrap h1 tags with <title> elements
+
+
+
+ -
+
+
+ Wrap h2 tags with <title> elements
+
+
+
+ -
+
+
+ Wrap h3 tags with <title> elements
+
+
+
From 37cde21c6d96b9b385f85d543227b0b3806db879 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Tue, 30 Nov 2010 18:48:21 -0500
Subject: [PATCH 03/28] Fix bug #7745: PML output should ignore external links
as they are not supported by the spec. Fix part of bug #7742: PML output extra %
sign.
---
src/calibre/ebooks/pml/pmlml.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index 560a132ce1..f97f74f4a0 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -216,7 +216,7 @@ class PMLMLizer(object):
w = '\\w'
width = elem.get('width')
if width:
- w += '="%s%%"' % width
+ w += '="%s"' % width
else:
w += '="50%"'
text.append(w)
@@ -252,8 +252,8 @@ class PMLMLizer(object):
if href not in self.link_hrefs.keys():
self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
href = '#%s' % self.link_hrefs[href]
- text.append('\\q="%s"' % href)
- tags.append('q')
+ text.append('\\q="%s"' % href)
+ tags.append('q')
# Anchor ids
id_name = elem.get('id')
From c38eb08018ca9eb404247de0ccc84bf73196ed20 Mon Sep 17 00:00:00 2001
From: John Schember
Date: Tue, 30 Nov 2010 19:04:26 -0500
Subject: [PATCH 04/28] PML Output: ensure \w always ends with a %.
---
src/calibre/ebooks/pml/pmlml.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py
index f97f74f4a0..ceb7f36124 100644
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -216,6 +216,8 @@ class PMLMLizer(object):
w = '\\w'
width = elem.get('width')
if width:
+ if not width.endswith('%'):
+ width += '%'
w += '="%s"' % width
else:
w += '="50%"'
From fa0799597d686a4e870bf0a6d76a9addb8114756 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 18:35:14 -0700
Subject: [PATCH 05/28] Fix #7746 (Converting prc->epub: names of streets and
lakes)
---
src/calibre/ebooks/mobi/reader.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index f80d15359c..48ece79f45 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -504,6 +504,9 @@ class MobiReader(object):
'x-large': '5',
'xx-large': '6',
}
+ def barename(x):
+ return x.rpartition(':')[-1]
+
mobi_version = self.book_header.mobi_version
for x in root.xpath('//ncx'):
x.getparent().remove(x)
@@ -512,8 +515,9 @@ class MobiReader(object):
for x in tag.attrib:
if ':' in x:
del tag.attrib[x]
- if tag.tag in ('country-region', 'place', 'placetype', 'placename',
- 'state', 'city', 'street', 'address', 'content', 'form'):
+ if tag.tag and barename(tag.tag.lower()) in \
+ ('country-region', 'place', 'placetype', 'placename',
+ 'state', 'city', 'street', 'address', 'content', 'form'):
tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
for key in tag.attrib.keys():
tag.attrib.pop(key)
From 983da070950ca74aab2d9f05cda3b4143cd66322 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 18:37:34 -0700
Subject: [PATCH 06/28] Fix #7753 (setPlaceholderText not found)
---
src/calibre/gui2/search_box.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/calibre/gui2/search_box.py b/src/calibre/gui2/search_box.py
index 4d598a3bbb..94c9bbe33d 100644
--- a/src/calibre/gui2/search_box.py
+++ b/src/calibre/gui2/search_box.py
@@ -236,7 +236,11 @@ class SavedSearchBox(QComboBox):
def initialize(self, _search_box, colorize=False, help_text=_('Search')):
self.search_box = _search_box
- self.line_edit.setPlaceholderText(help_text)
+ try:
+ self.line_edit.setPlaceholderText(help_text)
+ except:
+ # Using Qt < 4.7
+ pass
self.colorize = colorize
self.clear()
From ba831d21e38f0d0bad7ff0f791126d95ec2c417a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:03:01 -0700
Subject: [PATCH 07/28] Search box: Remove select all on focus event as focus
events are fired when completion fails/succeeds. Instead select all only when
search box is focussed via the keyboard shortcut
---
src/calibre/gui2/search_box.py | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/src/calibre/gui2/search_box.py b/src/calibre/gui2/search_box.py
index 94c9bbe33d..dabd88ff9f 100644
--- a/src/calibre/gui2/search_box.py
+++ b/src/calibre/gui2/search_box.py
@@ -28,10 +28,6 @@ class SearchLineEdit(QLineEdit):
QLineEdit.mouseReleaseEvent(self, event)
QLineEdit.selectAll(self)
- def focusInEvent(self, event):
- QLineEdit.focusInEvent(self, event)
- QLineEdit.selectAll(self)
-
def dropEvent(self, ev):
self.parent().normalize_state()
return QLineEdit.dropEvent(self, ev)
@@ -334,14 +330,17 @@ class SearchBoxMixin(object):
shortcuts = QKeySequence.keyBindings(QKeySequence.Find)
shortcuts = list(shortcuts) + [QKeySequence('/'), QKeySequence('Alt+S')]
self.action_focus_search.setShortcuts(shortcuts)
- self.action_focus_search.triggered.connect(lambda x:
- self.search.setFocus(Qt.OtherFocusReason))
+ self.action_focus_search.triggered.connect(self.focus_search_box)
self.addAction(self.action_focus_search)
self.search.setStatusTip(re.sub(r'<\w+>', ' ',
unicode(self.search.toolTip())))
self.advanced_search_button.setStatusTip(self.advanced_search_button.toolTip())
self.clear_button.setStatusTip(self.clear_button.toolTip())
+ def focus_search_box(self, *args):
+ self.search.setFocus(Qt.OtherFocusReason)
+ self.search.lineEdit().selectAll()
+
def search_box_cleared(self):
self.tags_view.clear()
self.saved_search.clear()
From 4ab34dff95f00ce8d76af0873975e8648a4fc3bf Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:12:24 -0700
Subject: [PATCH 08/28] Fix #7749 (Book details panel does not always display
correct selected book)
---
src/calibre/gui2/book_details.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py
index 4ffc8da650..b101d4c44f 100644
--- a/src/calibre/gui2/book_details.py
+++ b/src/calibre/gui2/book_details.py
@@ -208,8 +208,9 @@ class BookInfo(QWebView):
rows = u'\n'.join([u'%s: | %s |
'%(k,t) for
k, t in rows])
comments = data.get(_('Comments'), '')
- if comments and comments != u'None':
- self.renderer.queue.put((rows, comments))
+ if not comments or comments == u'None':
+ comments = ''
+ self.renderer.queue.put((rows, comments))
self._show_data(rows, '')
From c63425f0b030811981b89ce1185b84588ec5176e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:43:16 -0700
Subject: [PATCH 09/28] Fix #7686 (Updated recipes for Newsweek Polska and
Polityka, new recipes for Esensja, Histmag and Wprost)
---
resources/recipes/esenja.recipe | 87 ++++++++++++++++++++++
resources/recipes/histmag.recipe | 59 +++++++++++++++
resources/recipes/newsweek_polska.recipe | 53 ++++++++++----
resources/recipes/polityka.recipe | 7 +-
resources/recipes/wprost.recipe | 91 ++++++++++++++++++++++++
5 files changed, 278 insertions(+), 19 deletions(-)
create mode 100644 resources/recipes/esenja.recipe
create mode 100644 resources/recipes/histmag.recipe
create mode 100644 resources/recipes/wprost.recipe
diff --git a/resources/recipes/esenja.recipe b/resources/recipes/esenja.recipe
new file mode 100644
index 0000000000..b8b94ad66e
--- /dev/null
+++ b/resources/recipes/esenja.recipe
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Esensja(BasicNewsRecipe):
+
+ title = u'Esensja'
+ __author__ = 'matek09'
+ description = 'Monthly magazine'
+ encoding = 'utf-8'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+ HREF = '0'
+
+ #keep_only_tags =[]
+ #keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'article'})
+ remove_tags_before = dict(dict(name = 'div', attrs = {'class' : 't-title'}))
+ remove_tags_after = dict(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_top.gif'}))
+ remove_tags.append(dict(name = 'img', attrs = {'src' : '../../../2000/01/img/tab_bot.gif'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 't-title2 nextpage'}))
+
+ extra_css = '''
+ .t-title {font-size: x-large; font-weight: bold; text-align: left}
+ .t-author {font-size: x-small; text-align: left}
+ .t-title2 {font-size: x-small; font-style: italic; text-align: left}
+ .text {font-size: small; text-align: left}
+ .annot-ref {font-style: italic; text-align: left}
+ '''
+
+ preprocess_regexps = [(re.compile(r'alt="[^"]*"'),
+ lambda match: '')]
+
+ def parse_index(self):
+ soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
+ a = soup.find('a', attrs={'href' : re.compile('.*/index.html')})
+ year = a['href'].split('/')[0]
+ month = a['href'].split('/')[1]
+ self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
+ soup = self.index_to_soup(self.HREF + '01.html')
+ self.cover_url = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/img/ilustr/cover_b.jpg'
+ feeds = []
+ intro = soup.find('div', attrs={'class' : 'n-title'})
+ introduction = {'title' : self.tag_to_string(intro.a),
+ 'url' : self.HREF + intro.a['href'],
+ 'date' : '',
+ 'description' : ''}
+ chapter = 'Wprowadzenie'
+ subchapter = ''
+ articles = []
+ articles.append(introduction)
+ for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
+ if tag.name in 'td':
+ if len(articles) > 0:
+ section = chapter
+ if len(subchapter) > 0:
+ section += ' - ' + subchapter
+ feeds.append((section, articles))
+ articles = []
+ if tag['class'] == 'chapter':
+ chapter = self.tag_to_string(tag).capitalize()
+ subchapter = ''
+ else:
+ subchapter = self.tag_to_string(tag)
+ subchapter = self.tag_to_string(tag)
+ continue
+ articles.append({'title' : self.tag_to_string(tag.a), 'url' : self.HREF + tag.a['href'], 'date' : '', 'description' : ''})
+
+ a = self.index_to_soup(self.HREF + tag.a['href'])
+ i = 1
+ while True:
+ div = a.find('div', attrs={'class' : 't-title2 nextpage'})
+ if div is not None:
+ a = self.index_to_soup(self.HREF + div.a['href'])
+ articles.append({'title' : self.tag_to_string(tag.a) + ' c. d. ' + str(i), 'url' : self.HREF + div.a['href'], 'date' : '', 'description' : ''})
+ i = i + 1
+ else:
+ break
+
+ return feeds
diff --git a/resources/recipes/histmag.recipe b/resources/recipes/histmag.recipe
new file mode 100644
index 0000000000..38956e7995
--- /dev/null
+++ b/resources/recipes/histmag.recipe
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Histmag(BasicNewsRecipe):
+
+ title = u'Histmag'
+ __author__ = 'matek09'
+ description = u"Artykuly historyczne i publicystyczne"
+ encoding = 'utf-8'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+ #max_articles_per_feed = 1
+ remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
+ remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+ #keep_only_tags =[]
+ #keep_only_tags.append(dict(name = 'h2'))
+ #keep_only_tags.append(dict(name = 'p'))
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
+ remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
+ remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
+
+ preprocess_regexps = [(re.compile(r''), lambda match: '
'),
+ (re.compile(r''), lambda match: '
')]
+ extra_css = '''
+ .left {font-size: x-small}
+ .right {font-size: x-small}
+ '''
+
+ def find_articles(self, soup):
+ articles = []
+ for div in soup.findAll('div', attrs={'class' : 'text'}):
+ articles.append({
+ 'title' : self.tag_to_string(div.h3.a),
+ 'url' : 'http://www.histmag.org/' + div.h3.a['href'],
+ 'date' : self.tag_to_string(div.next('p')).split('|')[0],
+ 'description' : self.tag_to_string(div.next('p', podpis=False)),
+ })
+ return articles
+
+ def parse_index(self):
+ soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
+ feeds = []
+ feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
+ soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
+ feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
+ soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
+ feeds.append((u"Wydarzenia", self.find_articles(soup)))
+
+ return feeds
+
+
diff --git a/resources/recipes/newsweek_polska.recipe b/resources/recipes/newsweek_polska.recipe
index 31dd8ccddd..4227a88026 100644
--- a/resources/recipes/newsweek_polska.recipe
+++ b/resources/recipes/newsweek_polska.recipe
@@ -1,19 +1,22 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
class Newsweek(BasicNewsRecipe):
- EDITION = 0
+ FIND_LAST_FULL_ISSUE = True
+ EDITION = '0'
+ EXCLUDE_LOCKED = True
+ LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'
title = u'Newsweek Polska'
- __author__ = 'Mateusz Kielar'
+ __author__ = 'matek09'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
- language = 'en'
+ language = 'pl'
remove_javascript = True
keep_only_tags =[]
@@ -33,34 +36,54 @@ class Newsweek(BasicNewsRecipe):
def print_version(self, url):
return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'
+ def is_locked(self, a):
+ if a.findNext('img')['src'] == 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif':
+ return True
+ else:
+ return False
+
+ def is_full(self, issue_soup):
+ if len(issue_soup.findAll('img', attrs={'src' : 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'})) > 1:
+ return False
+ else:
+ return True
+
def find_last_full_issue(self):
- page = self.index_to_soup('http://www.newsweek.pl/Frames/IssueCover.aspx')
- issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
- page = self.index_to_soup(issue)
- issue = 'http://www.newsweek.pl/Frames/' + page.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
- page = self.index_to_soup(issue)
- self.EDITION = page.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+ frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
+ while True:
+ frame_soup = self.index_to_soup(frame_url)
+ self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
+ issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
+ if self.is_full(issue_soup):
+ break
+ frame_url = 'http://www.newsweek.pl/Frames/' + frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs).a['href']
+
+
def parse_index(self):
- self.find_last_full_issue()
- soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
+ if self.FIND_LAST_FULL_ISSUE:
+ self.find_last_full_issue()
+ soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
self.cover_url = img['src']
feeds = []
parent = soup.find(id='content-left-big')
for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
- section = self.tag_to_string(txt).capitalize()
articles = list(self.find_articles(txt))
- feeds.append((section, articles))
+ if len(articles) > 0:
+ section = self.tag_to_string(txt).capitalize()
+ feeds.append((section, articles))
return feeds
def find_articles(self, txt):
for a in txt.findAllNext( attrs={'class':['strong','hr']}):
if a.name in "div":
break
+ if (not self.FIND_LAST_FULL_ISSUE) & self.EXCLUDE_LOCKED & self.is_locked(a):
+ continue
yield {
'title' : self.tag_to_string(a),
- 'url' : 'http://www.newsweek.pl'+a['href'],
+ 'url' : 'http://www.newsweek.pl' + a['href'],
'date' : '',
'description' : ''
}
diff --git a/resources/recipes/polityka.recipe b/resources/recipes/polityka.recipe
index ab31e148aa..16ccae6085 100644
--- a/resources/recipes/polityka.recipe
+++ b/resources/recipes/polityka.recipe
@@ -1,18 +1,18 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'
+__copyright__ = '2010, matek09, matek09@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
class Polityka(BasicNewsRecipe):
title = u'Polityka'
- __author__ = 'Mateusz Kielar'
+ __author__ = 'matek09'
description = 'Weekly magazine. Last archive issue'
encoding = 'utf-8'
no_stylesheets = True
- language = 'en'
+ language = 'pl'
remove_javascript = True
remove_tags_before = dict(dict(name = 'h2', attrs = {'class' : 'box_nag'}))
@@ -48,7 +48,6 @@ class Polityka(BasicNewsRecipe):
for div in box.findAll('div', attrs={'class': 'list_tresc'}):
article_page = self.index_to_soup('http://archiwum.polityka.pl' + div.a['href'],)
section = self.tag_to_string(article_page.find('h2', attrs = {'class' : 'box_nag'})).split('/')[0].lstrip().rstrip()
- print section
if not articles.has_key(section):
articles[section] = []
articles[section].append( {
diff --git a/resources/recipes/wprost.recipe b/resources/recipes/wprost.recipe
new file mode 100644
index 0000000000..b317571981
--- /dev/null
+++ b/resources/recipes/wprost.recipe
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, matek09, matek09@gmail.com'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Wprost(BasicNewsRecipe):
+ EDITION = 0
+ FIND_LAST_FULL_ISSUE = True
+ EXCLUDE_LOCKED = True
+ ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif'
+
+ title = u'Wprost'
+ __author__ = 'matek09'
+ description = 'Weekly magazine'
+ encoding = 'ISO-8859-2'
+ no_stylesheets = True
+ language = 'pl'
+ remove_javascript = True
+
+ remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+ remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'}))
+
+ '''keep_only_tags =[]
+ keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))'''
+
+ preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''),
+ (re.compile(r'display: block;'), lambda match: '')]
+
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'}))
+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'}))
+
+
+ extra_css = '''
+ .div-header {font-size: x-small; font-weight: bold}
+ '''
+#h2 {font-size: x-large; font-weight: bold}
+ def is_blocked(self, a):
+ if a.findNextSibling('img') is None:
+ return False
+ else:
+ return True
+
+
+
+ def find_last_issue(self):
+ soup = self.index_to_soup('http://www.wprost.pl/archiwum/')
+ a = 0
+ if self.FIND_LAST_FULL_ISSUE:
+ ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED})
+ a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+ else:
+ a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')})
+ self.EDITION = a['href'].replace('/tygodnik/?I=', '')
+ self.cover_url = a.img['src']
+
+
+
+ def parse_index(self):
+ self.find_last_issue()
+ soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION)
+ feeds = []
+ for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}):
+ articles = list(self.find_articles(main_block))
+ if len(articles) > 0:
+ section = self.tag_to_string(main_block)
+ feeds.append((section, articles))
+ return feeds
+
+ def find_articles(self, main_block):
+ for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}):
+ if a.name in "td":
+ break
+ if self.EXCLUDE_LOCKED & self.is_blocked(a):
+ continue
+ yield {
+ 'title' : self.tag_to_string(a),
+ 'url' : 'http://www.wprost.pl' + a['href'],
+ 'date' : '',
+ 'description' : ''
+ }
+
+
From aef657b0993aaad9647c13a6970963938e8c8268 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:53:21 -0700
Subject: [PATCH 10/28] Fix #7723 (View Specific Format Does Not Allow More
Than One Selection)
---
src/calibre/gui2/actions/view.py | 30 +++++++++++++++++++++++-------
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/src/calibre/gui2/actions/view.py b/src/calibre/gui2/actions/view.py
index 5f4f7ce428..0a26653771 100644
--- a/src/calibre/gui2/actions/view.py
+++ b/src/calibre/gui2/actions/view.py
@@ -12,7 +12,7 @@ from PyQt4.Qt import Qt, QMenu
from calibre.constants import isosx
from calibre.gui2 import error_dialog, Dispatcher, question_dialog, config, \
- open_local_file
+ open_local_file, info_dialog
from calibre.gui2.dialogs.choose_format import ChooseFormatDialog
from calibre.utils.config import prefs
from calibre.ptempfile import PersistentTemporaryFile
@@ -89,18 +89,34 @@ class ViewAction(InterfaceAction):
self._launch_viewer(name, viewer, internal)
def view_specific_format(self, triggered):
- rows = self.gui.library_view.selectionModel().selectedRows()
+ rows = list(self.gui.library_view.selectionModel().selectedRows())
if not rows or len(rows) == 0:
d = error_dialog(self.gui, _('Cannot view'), _('No book selected'))
d.exec_()
return
- row = rows[0].row()
- formats = self.gui.library_view.model().db.formats(row).upper().split(',')
- d = ChooseFormatDialog(self.gui, _('Choose the format to view'), formats)
+ db = self.gui.library_view.model().db
+ rows = [r.row() for r in rows]
+ formats = [db.formats(row) for row in rows]
+ formats = [list(f.upper().split(',')) if f else None for f in formats]
+ all_fmts = set([])
+ for x in formats:
+ for f in x: all_fmts.add(f)
+ d = ChooseFormatDialog(self.gui, _('Choose the format to view'),
+ list(sorted(all_fmts)))
if d.exec_() == d.Accepted:
- format = d.format()
- self.view_format(row, format)
+ fmt = d.format()
+ orig_num = len(rows)
+ rows = [rows[i] for i in range(len(rows)) if formats[i] and fmt in
+ formats[i]]
+ if self._view_check(len(rows)):
+ for row in rows:
+ self.view_format(row, fmt)
+ if len(rows) < orig_num:
+ info_dialog(self.gui, _('Format unavailable'),
+ _('Not all the selected books were available in'
+ ' the %s format. You should convert'
+ ' them first.')%fmt, show=True)
def _view_check(self, num, max_=3):
if num <= max_:
From 50b082fa8f9ed5ef14530dc57b0a3a412c5944b4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 30 Nov 2010 19:55:24 -0700
Subject: [PATCH 11/28] Fix #7704 (Updated recipe for NIN)
---
resources/recipes/nin.recipe | 79 +++++++++++++++++++++++++++++++-----
1 file changed, 69 insertions(+), 10 deletions(-)
diff --git a/resources/recipes/nin.recipe b/resources/recipes/nin.recipe
index 70fd998a09..27942f7d43 100644
--- a/resources/recipes/nin.recipe
+++ b/resources/recipes/nin.recipe
@@ -8,12 +8,15 @@ www.nin.co.rs
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
+from contextlib import nested, closing
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre import entity_to_unicode
class Nin(BasicNewsRecipe):
title = 'NIN online'
__author__ = 'Darko Miletic'
description = 'Nedeljne Informativne Novine'
- publisher = 'NIN d.o.o.'
+ publisher = 'NIN d.o.o. - Ringier d.o.o.'
category = 'news, politics, Serbia'
no_stylesheets = True
delay = 1
@@ -26,18 +29,29 @@ class Nin(BasicNewsRecipe):
use_embedded_content = False
language = 'sr'
publication_type = 'magazine'
- extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Verdana, Lucida, sans1, sans-serif} .article_description{font-family: Verdana, Lucida, sans1, sans-serif} .artTitle{font-size: x-large; font-weight: bold; color: #900} .izjava{font-size: x-large; font-weight: bold} .columnhead{font-size: small; font-weight: bold;} img{margin-top:0.5em; margin-bottom: 0.7em} b{margin-top: 1em} '
+ extra_css = """
+ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+ body{font-family: Verdana, Lucida, sans1, sans-serif}
+ .article_description{font-family: Verdana, Lucida, sans1, sans-serif}
+ .artTitle{font-size: x-large; font-weight: bold; color: #900}
+ .izjava{font-size: x-large; font-weight: bold}
+ .columnhead{font-size: small; font-weight: bold;}
+ img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
+ b{margin-top: 1em}
+ """
conversion_options = {
- 'comment' : description
- , 'tags' : category
- , 'publisher' : publisher
- , 'language' : language
- , 'linearize_tables' : True
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
}
- preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
- remove_attributes = ['height','width']
+ preprocess_regexps = [
+ (re.compile(r'