IGN:Fixes for various regressions. feeds2disk now handles bad feeds gracefully. Tags view is now more tightly coupled to the database and the search box.
This commit is contained in: commit 3502579474 (parent 3c404a7a66)
@@ -13,8 +13,8 @@ from urlparse import urlparse
 from urllib import unquote
 
 from lxml import html, etree
-from lxml.html import soupparser, HTMLParser
-from lxml.etree import XPath, XMLParser
+from lxml.html import soupparser
+from lxml.etree import XPath
 get_text = XPath("//text()")
 
 from calibre import LoggingInterface, unicode_path

@@ -298,8 +298,6 @@ class PreProcessor(object):
 
 class Parser(PreProcessor, LoggingInterface):
 
-    PARSER = HTMLParser(recover=True)
-
     def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
         LoggingInterface.__init__(self, logging.getLogger(name))
         self.setup_cli_handler(opts.verbose)

@@ -350,9 +348,7 @@ class Parser(PreProcessor, LoggingInterface):
         for pat in ENCODING_PATS:
             src = pat.sub('', src)
         try:
-            self.root = etree.HTML(src, self.PARSER)
-            if self.root is None:
-                raise ValueError('%s is empty'%self.htmlfile.path)
+            self.root = html.fromstring(src)
         except:
             if self.opts.verbose:
                 self.log_exception('lxml based parsing failed')
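The hunk above drops the class-level recovering HTMLParser and parses with lxml.html.fromstring instead, which already uses a forgiving HTML parser and never returns None for non-empty input. A minimal standalone sketch of that call, plain lxml outside calibre's Parser class:

from lxml import html

# html.fromstring() tolerates slightly malformed markup and returns an element tree.
src = '<html><body><p>Hello <b>world</p></body></html>'
root = html.fromstring(src)
print(html.tostring(root))        # serialized, repaired markup
print(root.xpath('//text()'))     # same XPath style as the get_text expression above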
@@ -7,14 +7,11 @@ from lxml import etree
 from calibre.ebooks.lrf import option_parser as lrf_option_parser
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
-from calibre import isosx, setup_cli_handlers, __appname__
+from calibre import setup_cli_handlers, __appname__
 from calibre.libwand import convert, WandException
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.lrf.rtf.xsl import xhtml
-
-UNRTF = 'unrtf'
-if isosx and hasattr(sys, 'frameworks_dir'):
-    UNRTF = os.path.join(getattr(sys, 'frameworks_dir'), UNRTF)
+from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
 
 def option_parser():
     parser = lrf_option_parser(

@@ -139,7 +136,10 @@ def generate_xml(rtfpath):
 
 def generate_html(rtfpath, logger):
     logger.info('Converting RTF to XML...')
-    xml = generate_xml(rtfpath)
+    try:
+        xml = generate_xml(rtfpath)
+    except RtfInvalidCodeException:
+        raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
     tdir = os.path.dirname(xml)
     cwd = os.getcwdu()
     os.chdir(tdir)
@@ -5,7 +5,7 @@ from xml.parsers.expat import ExpatError
 from functools import partial
 from PyQt4.QtCore import Qt, SIGNAL, QObject, QCoreApplication, QUrl
 from PyQt4.QtGui import QPixmap, QColor, QPainter, QMenu, QIcon, QMessageBox, \
-                        QToolButton, QDialog, QDesktopServices
+                        QToolButton, QDialog, QDesktopServices, QFileDialog
 from PyQt4.QtSvg import QSvgRenderer
 
 from calibre import __version__, __appname__, islinux, sanitize_file_name, \

@@ -216,6 +216,16 @@ class Main(MainWindow, Ui_MainWindow):
 
         self.show()
         self.stack.setCurrentIndex(0)
+        try:
+            db = LibraryDatabase2(self.library_path)
+        except OSError, err:
+            error_dialog(self, _('Bad database location'), unicode(err)).exec_()
+            dir = unicode(QFileDialog.getExistingDirectory(self,
+                        _('Choose a location for your ebook library.'), os.path.expanduser('~')))
+            if not dir:
+                QCoreApplication.exit(1)
+            else:
+                self.library_path = dir
         db = LibraryDatabase2(self.library_path)
         self.library_view.set_database(db)
         if self.olddb is not None:

@@ -252,6 +262,8 @@ in which you want to store your books files. Any existing books will be automati
         self.connect(self.tags_view, SIGNAL('tags_marked(PyQt_PyObject, PyQt_PyObject)'),
                      self.search.search_from_tokens)
         self.connect(self.status_bar.tag_view_button, SIGNAL('toggled(bool)'), self.toggle_tags_view)
+        self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'),
+                     self.tags_view.model().reinit)
         ########################### Cover Flow ################################
         self.cover_flow = None
         if CoverFlow is not None:

@@ -1272,7 +1284,6 @@ in which you want to store your books files. Any existing books will be automati
         home = os.path.dirname(self.database_path)
         if not os.path.exists(home):
             home = os.getcwd()
-        from PyQt4.QtGui import QFileDialog
         dir = unicode(QFileDialog.getExistingDirectory(self,
                 _('Choose a location for your ebook library.'), home))
         if not dir:
@@ -6,7 +6,6 @@ __docformat__ = 'restructuredtext en'
 '''
 Browsing book collection by tags.
 '''
-
 from PyQt4.Qt import QAbstractItemModel, Qt, QVariant, QTreeView, QModelIndex, \
                      QFont, SIGNAL, QSize, QColor, QIcon
 

@@ -45,6 +44,8 @@ class TagsModel(QAbstractItemModel):
     def __init__(self, db):
         QAbstractItemModel.__init__(self)
         self.db = db
+        self.ignore_next_search = False
+        self._data = {}
         self.refresh()
         self.bold_font = QFont()
         self.bold_font.setBold(True)

@@ -53,18 +54,40 @@ class TagsModel(QAbstractItemModel):
         self.status_map = list(map(QVariant, self.status_map))
         self.cmap = [QIcon(':/images/user_profile.svg'), QIcon(':/images/series.svg'), QIcon(':/images/book.svg'), QIcon(':/images/publisher.png'), QIcon(':/images/tags.svg')]
         self.cmap = list(map(QVariant, self.cmap))
+        self.db.add_listener(self.database_changed)
+
+    def database_changed(self, event, ids):
+        self.refresh()
 
     def refresh(self):
+        old_data = self._data
         self._data = self.db.get_categories()
         for key in self._data:
             self._data[key] = list(map(Tag, self._data[key]))
+        for key in old_data.keys():
+            for tag in old_data[key]:
+                try:
+                    index = self._data[key].index(tag)
+                    if index > -1:
+                        self._data[key][index].state = tag.state
+                except:
+                    continue
         self.reset()
 
+    def reinit(self, *args, **kwargs):
+        if not self.ignore_next_search:
+            for category in self._data.values():
+                for tag in category:
+                    tag.state = 0
+            self.reset()
+        self.ignore_next_search = False
+
     def toggle(self, index):
         if index.parent().isValid():
             category = self.row_map[index.parent().row()]
             tag = self._data[category][index.row()]
             tag.state = (tag.state + 1)%3
+            self.ignore_next_search = True
             self.emit(SIGNAL('dataChanged(QModelIndex,QModelIndex)'), index, index)
             return True
         return False
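The ignore_next_search flag added above breaks a feedback loop: toggling a tag fires a search through the search box, and that search must not wipe the very tag states that caused it, while a search typed by the user should still clear them. A minimal, Qt-free sketch of the same pattern, using hypothetical SearchBox and TagsModelSketch classes that stand in for calibre's widgets:

class SearchBox(object):
    def __init__(self):
        self.listeners = []

    def search(self, tokens):
        # Every search, whatever its origin, is broadcast to listeners.
        for callback in self.listeners:
            callback(tokens)

class TagsModelSketch(object):
    def __init__(self, search_box):
        self.states = {'tags': 0, 'news': 0}
        self.ignore_next_search = False
        self.search_box = search_box
        search_box.listeners.append(self.reinit)

    def toggle(self, key):
        # A click on a tag changes its state and triggers a search ...
        self.states[key] = (self.states[key] + 1) % 3
        self.ignore_next_search = True        # ... which must not reset us
        self.search_box.search([key])

    def reinit(self, tokens):
        # A user-typed search clears the tag states; the one search we
        # caused ourselves is skipped exactly once.
        if not self.ignore_next_search:
            for key in self.states:
                self.states[key] = 0
        self.ignore_next_search = False

box = SearchBox()
model = TagsModelSketch(box)
model.toggle('tags')
print(model.states)            # the toggled state (1) survived our own search
box.search(['typed query'])
print(model.states)            # a user search resets everything to 0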
@@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 The database used to store ebook metadata
 '''
 import os, re, sys, shutil, cStringIO, glob, collections, textwrap, \
-       operator, itertools, functools
+       operator, itertools, functools, traceback
 import sqlite3 as sqlite
 from itertools import repeat
 

@@ -356,6 +356,7 @@ class LibraryDatabase2(LibraryDatabase):
     def __init__(self, library_path, row_factory=False):
         if not os.path.exists(library_path):
             os.makedirs(library_path)
+        self.listeners = set([])
         self.library_path = os.path.abspath(library_path)
         self.row_factory = row_factory
         self.dbpath = os.path.join(library_path, 'metadata.db')

@@ -486,7 +487,7 @@ class LibraryDatabase2(LibraryDatabase):
         if cdata is not None:
             open(os.path.join(tpath, 'cover.jpg'), 'wb').write(cdata)
         for format in formats:
-            # Get data as string (cant use file as source and target files may be the same)
+            # Get data as string (can't use file as source and target files may be the same)
             f = self.format(id, format, index_is_id=True, as_file=False)
             if not f:
                 continue

@@ -503,6 +504,22 @@ class LibraryDatabase2(LibraryDatabase):
         if len(os.listdir(parent)) == 0:
             shutil.rmtree(parent)
 
+    def add_listener(self, listener):
+        '''
+        Add a listener. Will be called on change events with two arguments.
+        Event name and list of affected ids.
+        '''
+        self.listeners.add(listener)
+
+    def notify(self, event, ids=[]):
+        'Notify all listeners'
+        for listener in self.listeners:
+            try:
+                listener(event, ids)
+            except:
+                traceback.print_exc()
+                continue
+
     def cover(self, index, index_is_id=False, as_file=False, as_image=False):
         '''
         Return the cover image as a bytestring (in JPEG format) or None.
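The add_listener/notify pair introduced above is a plain observer hook: write methods call notify() after committing, and the tags view registers a callback so it can refresh itself. A minimal sketch of how a client is expected to use it, with a simplified DatabaseSketch standing in for LibraryDatabase2 (only the two methods shown in the diff are taken from the real class):

import traceback

class DatabaseSketch(object):
    def __init__(self):
        self.listeners = set()

    def add_listener(self, listener):
        self.listeners.add(listener)

    def notify(self, event, ids=[]):
        for listener in self.listeners:
            try:
                listener(event, ids)
            except:
                traceback.print_exc()   # a broken listener must not abort the write

    def delete_book(self, id):
        # ... actual deletion elided ...
        self.notify('delete', [id])

def on_change(event, ids):
    print('refresh tag browser after %s of %r' % (event, ids))

db = DatabaseSketch()
db.add_listener(on_change)
db.delete_book(42)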
@@ -601,6 +618,7 @@ class LibraryDatabase2(LibraryDatabase):
         self.conn.execute('INSERT INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)',
                           (id, format.upper(), size, name))
         self.conn.commit()
+        self.notify('metadata', [id])
 
     def delete_book(self, id):
         '''

@@ -615,6 +633,8 @@ class LibraryDatabase2(LibraryDatabase):
             shutil.rmtree(parent)
         self.conn.execute('DELETE FROM books WHERE id=?', (id,))
         self.conn.commit()
+        self.clean()
+        self.notify('delete', [id])
 
     def remove_format(self, index, format, index_is_id=False):
         id = index if index_is_id else self.id(index)

@@ -630,6 +650,7 @@ class LibraryDatabase2(LibraryDatabase):
             pass
         self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format.upper()))
         self.conn.commit()
+        self.notify('metadata', [id])
 
     def clean(self):
         '''

@@ -668,16 +689,17 @@ class LibraryDatabase2(LibraryDatabase):
         self.data.set(row, col, val)
         if column == 'authors':
             val = val.split('&,')
-            self.set_authors(id, val)
+            self.set_authors(id, val, notify=False)
         elif column == 'title':
-            self.set_title(id, val)
+            self.set_title(id, val, notify=False)
         elif column == 'publisher':
-            self.set_publisher(id, val)
+            self.set_publisher(id, val, notify=False)
         elif column == 'rating':
             self.set_rating(id, val)
         elif column == 'tags':
-            self.set_tags(id, val.split(','), append=False)
+            self.set_tags(id, val.split(','), append=False, notify=False)
         self.set_path(id, True)
+        self.notify('metadata', [id])
 
     def set_metadata(self, id, mi):
         '''

@@ -690,24 +712,25 @@ class LibraryDatabase2(LibraryDatabase):
         authors = []
         for a in mi.authors:
             authors += a.split('&')
-        self.set_authors(id, authors)
+        self.set_authors(id, authors, notify=False)
         if mi.author_sort:
             self.set_author_sort(id, mi.author_sort)
         if mi.publisher:
-            self.set_publisher(id, mi.publisher)
+            self.set_publisher(id, mi.publisher, notify=False)
         if mi.rating:
             self.set_rating(id, mi.rating)
         if mi.series:
-            self.set_series(id, mi.series)
+            self.set_series(id, mi.series, notify=False)
         if mi.cover_data[1] is not None:
             self.set_cover(id, mi.cover_data[1])
         if mi.tags:
-            self.set_tags(id, mi.tags)
+            self.set_tags(id, mi.tags, notify=False)
         if mi.comments:
             self.set_comment(id, mi.comments)
         self.set_path(id, True)
+        self.notify('metadata', [id])
 
-    def set_authors(self, id, authors):
+    def set_authors(self, id, authors, notify=True):
         '''
         `authors`: A list of authors.
         '''

@@ -729,14 +752,16 @@ class LibraryDatabase2(LibraryDatabase):
             except sqlite.IntegrityError: # Sometimes books specify the same author twice in their metadata
                 pass
         self.set_path(id, True)
+        self.notify('metadata', [id])
 
-    def set_title(self, id, title):
+    def set_title(self, id, title, notify=True):
         if not title:
             return
         self.conn.execute('UPDATE books SET title=? WHERE id=?', (title, id))
         self.set_path(id, True)
+        self.notify('metadata', [id])
 
-    def set_publisher(self, id, publisher):
+    def set_publisher(self, id, publisher, notify=True):
         self.conn.execute('DELETE FROM books_publishers_link WHERE book=?',(id,))
         self.conn.execute('DELETE FROM publishers WHERE (SELECT COUNT(id) FROM books_publishers_link WHERE publisher=publishers.id) < 1')
         if publisher:

@@ -747,8 +772,9 @@ class LibraryDatabase2(LibraryDatabase):
             aid = self.conn.execute('INSERT INTO publishers(name) VALUES (?)', (publisher,)).lastrowid
             self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid))
         self.conn.commit()
+        self.notify('metadata', [id])
 
-    def set_tags(self, id, tags, append=False):
+    def set_tags(self, id, tags, append=False, notify=True):
         '''
         @param tags: list of strings
         @param append: If True existing tags are not removed

@@ -771,9 +797,10 @@ class LibraryDatabase2(LibraryDatabase):
             self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)',
                               (id, tid))
         self.conn.commit()
+        self.notify('metadata', [id])
 
 
-    def set_series(self, id, series):
+    def set_series(self, id, series, notify=True):
         self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,))
         self.conn.execute('DELETE FROM series WHERE (SELECT COUNT(id) FROM books_series_link WHERE series=series.id) < 1')
         if series:

@@ -790,8 +817,9 @@ class LibraryDatabase2(LibraryDatabase):
                 self.data.set(row, 9, series)
         except ValueError:
             pass
+        self.notify('metadata', [id])
 
-    def set_series_index(self, id, idx):
+    def set_series_index(self, id, idx, notify=True):
         if idx is None:
             idx = 1
         idx = int(idx)

@@ -803,6 +831,7 @@ class LibraryDatabase2(LibraryDatabase):
                 self.data.set(row, 10, idx)
         except ValueError:
             pass
+        self.notify('metadata', [id])
 
     def add_books(self, paths, formats, metadata, uris=[], add_duplicates=True):
         '''

@@ -811,6 +840,7 @@ class LibraryDatabase2(LibraryDatabase):
         '''
         formats, metadata, uris = iter(formats), iter(metadata), iter(uris)
         duplicates = []
+        ids = []
        for path in paths:
            mi = metadata.next()
            format = formats.next()

@@ -826,6 +856,7 @@ class LibraryDatabase2(LibraryDatabase):
             obj = self.conn.execute('INSERT INTO books(title, uri, series_index, author_sort) VALUES (?, ?, ?, ?)',
                                     (mi.title, uri, series_index, aus))
             id = obj.lastrowid
+            ids.append(id)
             self.set_path(id, True)
             self.conn.commit()
             self.set_metadata(id, mi)

@@ -859,7 +890,7 @@ class LibraryDatabase2(LibraryDatabase):
             stream = open(path, 'rb')
             self.add_format(id, ext, stream, index_is_id=True)
         self.conn.commit()
+        self.notify('add', [id])
 
     def move_library_to(self, newloc):
         if not os.path.exists(newloc):
@@ -10,9 +10,56 @@ from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from calibre.web.fetch.simple import option_parser as _option_parser
 from calibre.web.feeds.news import Profile2Recipe, BasicNewsRecipe
 from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
+from calibre.utils.config import Config, StringConfig
 
 
-def option_parser(usage=_('''\
+def config(defaults=None):
+    desc = _('Options to control the fetching of periodical content from the web.')
+    c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)
+
+    web2disk = c.add_group('web2disk', _('Customize the download engine'))
+    web2disk('timeout', ['-t', '--timeout'], default=10.0,
+             help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
+    web2disk('delay', ['--delay'], default=0,
+             help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
+    web2disk('encoding', ['--encoding'], default=None,
+             help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
+    web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
+             help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
+    web2disk('filter_regexps', ['--filter-regexp'], default=[], action='append',
+             help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
+    web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
+             help=_('Do not download CSS stylesheets.'))
+
+    c.add_option('feeds', ['--feeds'], default=None,
+                 help=_('''Specify a list of feeds to download. For example:
+"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
+If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
+    c.add_option('verbose', ['-v', '--verbose'], default=0, action='count',
+                 help=_('''Be more verbose while processing.'''))
+    c.add_option('title', ['--title'], default=None,
+                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
+    c.add_option('username', ['-u', '--username'], default=None,
+                 help=_('Username for sites that require a login to access content.'))
+    c.add_option('password', ['-p', '--password'], default=None,
+                 help=_('Password for sites that require a login to access content.'))
+    c.add_option('lrf', ['--lrf'], default=False, action='store_true',
+                 help='Optimize fetching for subsequent conversion to LRF.')
+    c.add_option('epub', ['--epub'], default=False, action='store_true',
+                 help='Optimize fetching for subsequent conversion to EPUB.')
+    c.add_option('recursions', ['--recursions'], default=0,
+                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
+    c.add_option('output_dir', ['--output-dir'], default='.',
+                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
+    c.add_option('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true',
+                 help=_("Don't show the progress bar"))
+    c.add_option('debug', ['--debug'], action='store_true', default=False,
+                 help=_('Very verbose output, useful for debugging.'))
+    c.add_option('test', ['--test'], action='store_true', default=False,
+                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
+
+    return c
+
+USAGE=_('''\
 %%prog [options] ARG
 
 %%prog parses an online source of articles, like an RSS or ATOM feed and

@@ -28,7 +75,9 @@ recipe as a string - %%prog will load the recipe directly from the string arg.
 
 Available builtin recipes are:
 %s
-''')%(unicode(list(titles))[1:-1])):
+''')%(unicode(list(titles))[1:-1])
+
+def option_parser(usage=USAGE):
     p = _option_parser(usage=usage)
     p.remove_option('--max-recursions')
     p.remove_option('--base-dir')

@@ -51,7 +100,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
                  help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
     p.add_option('--output-dir', default=os.getcwd(),
                  help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
-    p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
+    p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true',
                  help=_('Dont show the progress bar'))
     p.add_option('--debug', action='store_true', default=False,
                  help=_('Very verbose output, useful for debugging.'))

@@ -67,7 +116,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     if notification is None:
         from calibre.utils.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
-        pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=not opts.progress_bar)
+        pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar)
         notification = pb.update
 
     recipe, is_profile = None, False

@@ -76,14 +125,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     else:
         try:
             if os.access(recipe_arg, os.R_OK):
-                try:
-                    recipe = compile_recipe(open(recipe_arg).read())
-                    is_profile = DefaultProfile in recipe.__bases__ or \
-                                 FullContentProfile in recipe.__bases__
-                except:
-                    import traceback
-                    traceback.print_exc()
-                    return 1
+                recipe = compile_recipe(open(recipe_arg).read())
+                is_profile = DefaultProfile in recipe.__bases__ or \
+                             FullContentProfile in recipe.__bases__
             else:
                 raise Exception('not file')
         except:
@@ -18,7 +18,7 @@ from calibre.ebooks.metadata.opf import OPFCreator
 from calibre.ebooks.lrf import entity_to_unicode
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
-from calibre.web.feeds import feed_from_xml, templates, feeds_from_index
+from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
 from calibre.web.fetch.simple import option_parser as web2disk_option_parser
 from calibre.web.fetch.simple import RecursiveFetcher
 from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

@@ -138,6 +138,9 @@ class BasicNewsRecipe(object, LoggingInterface):
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     html2lrf_options = []
 
+    #: Options to pass to html2epub to customize generation of EPUB ebooks.
+    html2epub_options = ''
+
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
     #:

@@ -594,9 +597,9 @@ class BasicNewsRecipe(object, LoggingInterface):
     def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
         templ = templates.EmbeddedContent()
         raw = templ.generate(article).render('html')
-        with PersistentTemporaryFile('_feeds2disk.html') as f:
-            f.write(raw)
-        url = ('file:'+f.name) if iswindows else ('file://'+f.name)
+        with PersistentTemporaryFile('_feeds2disk.html') as pt:
+            pt.write(raw)
+        url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
         return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
 
 

@@ -643,7 +646,6 @@ class BasicNewsRecipe(object, LoggingInterface):
             url = article.url
             if not url:
                 continue
-
             func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
                         ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
                             else self.fetch_article), url)

@@ -819,13 +821,21 @@ class BasicNewsRecipe(object, LoggingInterface):
             else:
                 title, url = obj
             self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
-            f = self.browser.open(url)
-            parsed_feeds.append(feed_from_xml(f.read(),
-                                  title=title,
-                                  oldest_article=self.oldest_article,
-                                  max_articles_per_feed=self.max_articles_per_feed,
-                                  get_article_url=self.get_article_url))
-            f.close()
+            try:
+                with closing(self.browser.open(url)) as f:
+                    parsed_feeds.append(feed_from_xml(f.read(),
+                                          title=title,
+                                          oldest_article=self.oldest_article,
+                                          max_articles_per_feed=self.max_articles_per_feed,
+                                          get_article_url=self.get_article_url))
+            except Exception, err:
+                feed = Feed()
+                msg = 'Failed feed: %s'%(title if title else url)
+                feed.populate_from_preparsed_feed(msg, [])
+                feed.description = unicode(err)
+                parsed_feeds.append(feed)
+                self.log_exception(msg)
+
         return parsed_feeds
 
     @classmethod
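The last hunk is the "handles bad feeds gracefully" part of the commit: a fetch or parse failure no longer aborts the whole download, it is logged and replaced by a placeholder feed whose description carries the error, while closing() guarantees the opened URL is released either way. A generic sketch of the same degrade-gracefully pattern, using stand-in fetch/parse functions and plain dicts rather than calibre's Feed class:

def fake_fetch(url):
    if url.startswith('bad'):
        raise IOError('connection timed out')
    return '<rss>...</rss>'

def fake_parse(raw):
    return {'title': 'ok', 'articles': ['a1'], 'description': ''}

def fetch_all(urls, fetch, parse):
    # One entry per url; failures become placeholder "error feeds"
    # instead of aborting the whole run.
    results = []
    for url in urls:
        try:
            results.append(parse(fetch(url)))
        except Exception as err:
            results.append({'title': 'Failed feed: %s' % url,
                            'articles': [],
                            'description': str(err)})
    return results

print(fetch_all(['good://one', 'bad://two'], fake_fetch, fake_parse))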