From 7ab3ce7fe46ba7c271a7f4894253b1b5d3902c15 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Dec 2010 19:06:32 -0700 Subject: [PATCH 01/11] ... --- resources/recipes/nejm.recipe | 91 +++++++++++++++-------------------- src/calibre/web/feeds/news.py | 1 + 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/resources/recipes/nejm.recipe b/resources/recipes/nejm.recipe index c860413926..35098c92cb 100644 --- a/resources/recipes/nejm.recipe +++ b/resources/recipes/nejm.recipe @@ -38,61 +38,50 @@ class NYTimes(BasicNewsRecipe): #TO GET ARTICLE TOC def nejm_get_index(self): - return self.index_to_soup('http://content.nejm.org/current.dtl') + return self.index_to_soup('http://content.nejm.org/current.dtl') # To parse artice toc def parse_index(self): - parse_soup = self.nejm_get_index() + parse_soup = self.nejm_get_index() - div = parse_soup.find(id='centerTOC') + feeds = [] - current_section = None - current_articles = [] - feeds = [] - for x in div.findAll(True): - if x.name == 'img' and '/toc/' in x.get('src', '') and 'uarrow.gif' not in x.get('src', ''): - # Section heading found - if current_articles and current_section and 'Week in the' not in current_section: - feeds.append((current_section, current_articles)) - current_section = x.get('alt') - current_articles = [] - self.log('\tFound section:', current_section) - if current_section is not None and x.name == 'strong': - title = self.tag_to_string(x) - a = x.parent.find('a', href=lambda x: x and '/full/' in x) - if a is None: - continue - url = a.get('href', False) - if not url or not title: - continue - if url.startswith('/'): - url = 'http://content.nejm.org'+url - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - if url.startswith('/'): - url = 'http://online.wsj.com'+url - current_articles.append({'title': title, 'url':url, - 'description':'', 'date':''}) - - if current_articles and current_section: - feeds.append((current_section, current_articles)) - - return feeds - - 
def preprocess_html(self, soup): - for a in soup.findAll(text=lambda x: x and '[in this window]' in x): - a = a.findParent('a') - url = a.get('href', None) - if not url: + div = parse_soup.find(attrs={'class':'tocContent'}) + for group in div.findAll(attrs={'class':'articleGrouping'}): + feed_title = group.find(attrs={'class':'articleType'}) + if feed_title is None: continue - if url.startswith('/'): - url = 'http://content.nejm.org'+url - isoup = self.index_to_soup(url) - img = isoup.find('img', src=lambda x: x and - x.startswith('/content/')) - if img is not None: - img.extract() - table = a.findParent('table') - table.replaceWith(img) - return soup + feed_title = self.tag_to_string(feed_title) + articles = [] + self.log('Found section:', feed_title) + for art in group.findAll(attrs={'class':lambda x: x and 'articleEntry' + in x}): + link = art.find(attrs={'class':lambda x:x and 'articleLink' in + x}) + if link is None: + continue + a = link.find('a', href=True) + if a is None: + continue + url = a.get('href') + if url.startswith('/'): + url = 'http://www.nejm.org'+url + title = self.tag_to_string(a) + self.log.info('\tFound article:', title, 'at', url) + article = {'title':title, 'url':url, 'date':''} + au = art.find(attrs={'class':'articleAuthors'}) + if au is not None: + article['author'] = self.tag_to_string(au) + desc = art.find(attrs={'class':'hover_text'}) + if desc is not None: + desc = self.tag_to_string(desc) + if 'author' in article: + desc = ' by ' + article['author'] + ' ' +desc + article['description'] = desc + articles.append(article) + if articles: + feeds.append((feed_title, articles)) + + return feeds + diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index ad2991d620..7bd5301dfb 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -548,6 +548,7 @@ class BasicNewsRecipe(Recipe): } For an example, see the recipe for downloading `The Atlantic`. 
+ In addition, you can add 'author' for the author of the article. ''' raise NotImplementedError From b2ace1e470c67f40648dad5d8f98ed1bb249f27d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Dec 2010 19:15:32 -0700 Subject: [PATCH 02/11] ... --- resources/recipes/nejm.recipe | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/resources/recipes/nejm.recipe b/resources/recipes/nejm.recipe index 35098c92cb..a6580a5232 100644 --- a/resources/recipes/nejm.recipe +++ b/resources/recipes/nejm.recipe @@ -11,16 +11,7 @@ class NYTimes(BasicNewsRecipe): language = 'en' no_stylesheets = True - remove_tags_before = dict(name='div', attrs={'align':'center'}) - remove_tags_after = dict(name='ol', attrs={'compact':'COMPACT'}) - remove_tags = [ - dict(name='iframe'), - #dict(name='div', attrs={'class':'related-articles'}), - dict(name='div', attrs={'id':['sidebar']}), - #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}), - dict(name='table', attrs={'align':'RIGHT'}), - ] - + keep_only_tags = dict(id='content') #TO LOGIN From 3cc953d621c6bd4a00040601a5050d8dbb067fc7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Dec 2010 20:21:12 -0700 Subject: [PATCH 03/11] Fix #7934 (Database corruption error) --- src/calibre/library/database2.py | 2 +- src/calibre/library/schema_upgrades.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 1557f5065e..35ef8c1f58 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1967,7 +1967,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): @classmethod def cleanup_tags(cls, tags): - tags = [x.strip() for x in tags if x.strip()] + tags = [x.strip().replace(',', ';') for x in tags if x.strip()] tags = [x.decode(preferred_encoding, 'replace') \ if isbytestring(x) else x for x in tags] tags = [u' '.join(x.split()) for 
x in tags] diff --git a/src/calibre/library/schema_upgrades.py b/src/calibre/library/schema_upgrades.py index 597cee8cfd..1483743e4a 100644 --- a/src/calibre/library/schema_upgrades.py +++ b/src/calibre/library/schema_upgrades.py @@ -427,7 +427,9 @@ class SchemaUpgrade(object): def upgrade_version_15(self): 'Remove commas from tags' - self.conn.execute("UPDATE tags SET name=REPLACE(name, ',', ';')") + self.conn.execute("UPDATE OR IGNORE tags SET name=REPLACE(name, ',', ';')") + self.conn.execute("UPDATE OR IGNORE tags SET name=REPLACE(name, ',', ';;')") + self.conn.execute("UPDATE OR IGNORE tags SET name=REPLACE(name, ',', '')") def upgrade_version_16(self): self.conn.executescript(''' From b9f2346cba22abbf5881735fb75f6a440a73fb38 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Dec 2010 20:59:48 -0700 Subject: [PATCH 04/11] Fix #7938 (Stanza shows some authors as "catalog is empty") --- resources/recipes/johm.recipe | 78 +++++++++++++----------------- src/calibre/library/server/opds.py | 14 +++--- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/resources/recipes/johm.recipe b/resources/recipes/johm.recipe index ee162b27c2..6930f05a50 100644 --- a/resources/recipes/johm.recipe +++ b/resources/recipes/johm.recipe @@ -12,16 +12,6 @@ class JournalofHospitalMedicine(BasicNewsRecipe): language = 'en' no_stylesheets = True - #remove_tags_before = dict(name='div', attrs={'align':'center'}) - #remove_tags_after = dict(name='ol', attrs={'compact':'COMPACT'}) - remove_tags = [ - dict(name='iframe'), - dict(name='div', attrs={'class':'subContent'}), - dict(name='div', attrs={'id':['contentFrame']}), - #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}), - #dict(name='table', attrs={'align':'RIGHT'}), - ] - # TO LOGIN @@ -39,47 +29,47 @@ class JournalofHospitalMedicine(BasicNewsRecipe): #TO GET ARTICLE TOC def johm_get_index(self): - return 
self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home') + return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home') # To parse artice toc def parse_index(self): - parse_soup = self.johm_get_index() + parse_soup = self.johm_get_index() - div = parse_soup.find(id='contentCell') + div = parse_soup.find(id='contentCell') - current_section = None - current_articles = [] - feeds = [] - for x in div.findAll(True): - if x.name == 'h4': - # Section heading found - if current_articles and current_section: - feeds.append((current_section, current_articles)) - current_section = self.tag_to_string(x) - current_articles = [] - self.log('\tFound section:', current_section) - if current_section is not None and x.name == 'strong': - title = self.tag_to_string(x) - p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x) - if p is None: - continue - url = p.get('href', False) - if not url or not title: - continue - if url.startswith('/'): - url = 'http://www3.interscience.wiley.com'+url - url = url.replace('/HTMLSTART', '/main.html,ftx_abs') - self.log('\t\tFound article:', title) - self.log('\t\t\t', url) - #if url.startswith('/'): - #url = 'http://online.wsj.com'+url - current_articles.append({'title': title, 'url':url, - 'description':'', 'date':''}) + current_section = None + current_articles = [] + feeds = [] + for x in div.findAll(True): + if x.name == 'h4': + # Section heading found + if current_articles and current_section: + feeds.append((current_section, current_articles)) + current_section = self.tag_to_string(x) + current_articles = [] + self.log('\tFound section:', current_section) + if current_section is not None and x.name == 'strong': + title = self.tag_to_string(x) + p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x) + if p is None: + continue + url = p.get('href', False) + if not url or not title: + continue + if url.startswith('/'): + url = 'http://www3.interscience.wiley.com'+url 
+ url = url.replace('/HTMLSTART', '/main.html,ftx_abs') + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + #if url.startswith('/'): + #url = 'http://online.wsj.com'+url + current_articles.append({'title': title, 'url':url, + 'description':'', 'date':''}) - if current_articles and current_section: - feeds.append((current_section, current_articles)) + if current_articles and current_section: + feeds.append((current_section, current_articles)) - return feeds + return feeds def preprocess_html(self, soup): for img in soup.findAll('img', src=True): diff --git a/src/calibre/library/server/opds.py b/src/calibre/library/server/opds.py index af635ebf48..cd0b80d4f0 100644 --- a/src/calibre/library/server/opds.py +++ b/src/calibre/library/server/opds.py @@ -19,7 +19,7 @@ from calibre.ebooks.metadata import fmt_sidx from calibre.library.comments import comments_to_html from calibre.library.server import custom_fields_to_display from calibre.library.server.utils import format_tag_string, Offsets -from calibre import guess_type +from calibre import guess_type, prepare_string_for_xml as xml from calibre.utils.icu import sort_key from calibre.utils.ordered_dict import OrderedDict @@ -150,13 +150,13 @@ def ACQUISITION_ENTRY(item, version, db, updated, CFM, CKEYS, prefix): extra.append(_('RATING: %s
')%rating) tags = item[FM['tags']] if tags: - extra.append(_('TAGS: %s
')%format_tag_string(tags, ',', + extra.append(_('TAGS: %s
')%xml(format_tag_string(tags, ',', ignore_max=True, - no_tag_count=True)) + no_tag_count=True))) series = item[FM['series']] if series: extra.append(_('SERIES: %s [%s]
')%\ - (series, + (xml(series), fmt_sidx(float(item[FM['series_index']])))) for key in CKEYS: mi = db.get_metadata(item[CFM['id']['rec_index']], index_is_id=True) @@ -164,11 +164,11 @@ def ACQUISITION_ENTRY(item, version, db, updated, CFM, CKEYS, prefix): if val: datatype = CFM[key]['datatype'] if datatype == 'text' and CFM[key]['is_multiple']: - extra.append('%s: %s
'%(name, format_tag_string(val, ',', + extra.append('%s: %s
'%(xml(name), xml(format_tag_string(val, ',', ignore_max=True, - no_tag_count=True))) + no_tag_count=True)))) else: - extra.append('%s: %s
'%(name, val)) + extra.append('%s: %s
'%(xml(name), xml(unicode(val)))) comments = item[FM['comments']] if comments: comments = comments_to_html(comments) From 545cdfd00d14f9bdab4967318ea5edcf8d6bb24c Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 18 Dec 2010 09:43:11 +0000 Subject: [PATCH 05/11] Enhancement #7931: add tweak to change display in tags pane from author to author_sort. --- resources/default_tweaks.py | 14 ++++++++++++++ src/calibre/gui2/tag_view.py | 29 ++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 750af9efa7..6d4f7a405f 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -41,6 +41,20 @@ series_index_auto_increment = 'next' # selecting 'manage authors', and pressing 'Recalculate all author sort values'. author_sort_copy_method = 'invert' +# Set which author field to display in the tags pane (the list of authors, +# series, publishers etc on the left hand side). The choices are author and +# author_sort. This tweak affects only the tags pane, and only what is displayed +# under the authors category. Please note that if you set this to author_sort, +# it is very possible to see duplicate names in the list because although it is +# guaranteed that author names are unique, there is no such guarantee for +# author_sort values. Showing duplicates won't break anything, but it could +# lead to some confusion. When using 'author_sort', the tooltip will show the +# author's name. +# Examples: +# tags_pane_use_field_for_author_name = 'author' +# tags_pane_use_field_for_author_name = 'author_sort' +tags_pane_use_field_for_author_name = 'author' + # Set whether boolean custom columns are two- or three-valued. 
# Two-values for true booleans diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index f75061da12..a0e26cf77c 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -18,6 +18,7 @@ from PyQt4.Qt import Qt, QTreeView, QApplication, pyqtSignal, \ from calibre.ebooks.metadata import title_sort from calibre.gui2 import config, NONE from calibre.library.field_metadata import TagsIcons, category_icon_map +from calibre.utils.config import tweaks from calibre.utils.icu import sort_key from calibre.utils.search_query_parser import saved_searches from calibre.gui2 import error_dialog @@ -409,17 +410,31 @@ class TagTreeItem(object): # {{{ return NONE def tag_data(self, role): + tag = self.tag + if tag.category == 'authors' and \ + tweaks['tags_pane_use_field_for_author_name'] == 'author_sort': + name = tag.sort + tt_author = True + else: + name = tag.name + tt_author = False if role == Qt.DisplayRole: - if self.tag.count == 0: - return QVariant('%s'%(self.tag.name)) + if tag.count == 0: + return QVariant('%s'%(name)) else: - return QVariant('[%d] %s'%(self.tag.count, self.tag.name)) + return QVariant('[%d] %s'%(tag.count, name)) if role == Qt.EditRole: - return QVariant(self.tag.name) + return QVariant(tag.name) if role == Qt.DecorationRole: - return self.icon_state_map[self.tag.state] - if role == Qt.ToolTipRole and self.tag.tooltip is not None: - return QVariant(self.tag.tooltip) + return self.icon_state_map[tag.state] + if role == Qt.ToolTipRole: + if tt_author: + if tag.tooltip is not None: + return QVariant('(%s) %s'%(tag.name, tag.tooltip)) + else: + return QVariant(tag.name) + if tag.tooltip is not None: + return QVariant(tag.tooltip) return NONE def toggle(self): From d45398fe478b88930f68a4a2c1052fb5bb9405f2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Dec 2010 09:40:45 -0700 Subject: [PATCH 06/11] ... 
--- src/calibre/manual/conversion.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index fea20a3163..3a7ae16598 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -541,7 +541,9 @@ Use the options to remove headers and footers to mitigate this issue. If the hea removed from the text it can throw off the paragraph unwrapping. Some limitations of PDF input is complex, multi-column, and image based documents are not supported. -Extraction of vector images and tables from within the document is also not supported. +Extraction of vector images and tables from within the document is also not supported. Some PDFs use special glyphs to +represent double ll or double ff or fi, etc. Conversion of these may or may not work depending on just how they are +represented internally in the PDF. Comic Book Collections ~~~~~~~~~~~~~~~~~~~~~~~~~ From 3d03f5c4a4a7886834e06281b21e6ed2f780d73b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Dec 2010 10:03:56 -0700 Subject: [PATCH 07/11] Content server: Fix regression that broke browsing by rating --- src/calibre/library/server/browse.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 37f024c08d..485601a311 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -373,13 +373,18 @@ class BrowseServer(object): script='toplevel();', main=main) def browse_sort_categories(self, items, sort): - if sort not in ('rating', 'name', 'popularity'): - sort = 'name' - items.sort(key=lambda x: sort_key(getattr(x, 'sort', x.name))) + def keyg(x): + x = getattr(x, 'sort', x.name) + if isinstance(x, unicode): + return sort_key(x) + return x + if sort == 'popularity': items.sort(key=operator.attrgetter('count'), reverse=True) elif sort == 'rating': 
items.sort(key=operator.attrgetter('avg_rating'), reverse=True) + else: + items.sort(key=keyg) return sort def browse_category(self, category, sort): From 13e83afab8ddc523087d3c4f92f83e162a289230 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 18 Dec 2010 17:22:26 +0000 Subject: [PATCH 08/11] Fix problem where ratings have non-string sort values --- src/calibre/library/database2.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 35ef8c1f58..2848e8ebb3 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1128,6 +1128,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): for l in list: (id, val, sort_val) = (l[0], l[1], l[2]) tids[category][val] = (id, sort_val) + elif cat['datatype'] == 'rating': + for l in list: + (id, val) = (l[0], l[1]) + tids[category][val] = (id, '{0:05.2f}'.format(val)) else: for l in list: (id, val) = (l[0], l[1]) @@ -1256,12 +1260,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): # sort the list if sort == 'name': - def get_sort_key(x): - sk = x.s - if isinstance(sk, unicode): - sk = sort_key(sk) - return sk - kf = get_sort_key + kf = lambda x :sort_key(x.s) reverse=False elif sort == 'popularity': kf = lambda x: x.c From 20d7b486e4f6e2761c129a8e3b5dbb6d3ed8fee3 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 18 Dec 2010 17:30:12 +0000 Subject: [PATCH 09/11] Revert changes from browse.py --- src/calibre/library/server/browse.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py index 485601a311..37f024c08d 100644 --- a/src/calibre/library/server/browse.py +++ b/src/calibre/library/server/browse.py @@ -373,18 +373,13 @@ class BrowseServer(object): script='toplevel();', main=main) def browse_sort_categories(self, items, sort): - def 
keyg(x): - x = getattr(x, 'sort', x.name) - if isinstance(x, unicode): - return sort_key(x) - return x - + if sort not in ('rating', 'name', 'popularity'): + sort = 'name' + items.sort(key=lambda x: sort_key(getattr(x, 'sort', x.name))) if sort == 'popularity': items.sort(key=operator.attrgetter('count'), reverse=True) elif sort == 'rating': items.sort(key=operator.attrgetter('avg_rating'), reverse=True) - else: - items.sort(key=keyg) return sort def browse_category(self, category, sort): From 1ee512bf23272388c97e208719abc10efc52352c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Dec 2010 10:47:16 -0700 Subject: [PATCH 10/11] Fix #7935 (Changed device ID's for SWEEX MM300) --- src/calibre/devices/misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/devices/misc.py b/src/calibre/devices/misc.py index e27aee4393..68f4dba9a9 100644 --- a/src/calibre/devices/misc.py +++ b/src/calibre/devices/misc.py @@ -62,9 +62,9 @@ class SWEEX(USBMS): # Ordered list of supported formats FORMATS = ['epub', 'prc', 'fb2', 'html', 'rtf', 'chm', 'pdf', 'txt'] - VENDOR_ID = [0x0525] - PRODUCT_ID = [0xa4a5] - BCD = [0x0319] + VENDOR_ID = [0x0525, 0x177f] + PRODUCT_ID = [0xa4a5, 0x300] + BCD = [0x0319, 0x110] VENDOR_NAME = 'SWEEX' WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'EBOOKREADER' From 66f9313ac94886c45e7dd1cb94a3c38e306d5ba1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Dec 2010 11:19:12 -0700 Subject: [PATCH 11/11] ... --- Changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.yaml b/Changelog.yaml index 7f7afc117a..bcf58ae03d 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -11,7 +11,7 @@ - title: "Page turn animations in the e-book viewer" type: major description: > - "Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. 
Setting it to o disables the animation completely." + "Now when you use the Page Down/Page Up keys or the next/previous page buttons in the viewer, page turning will be animated. The duration of the animation can be controlled in the viewer preferences. Setting it to 0 disables the animation completely." - title: "Conversion pipeline: Add an option to set the minimum line height of all elemnts as a percentage of the computed font size. By default, calibre now sets the line height to 120% of the computed font size."