IGN:Fixes for various regressions. feeds2disk now handles bad feeds gracefully. Tags view is now more tightly coupled to the database and the search box.

This commit is contained in:
Kovid Goyal 2008-09-17 12:51:59 -07:00
parent 3c404a7a66
commit 3502579474
7 changed files with 175 additions and 60 deletions

View File

@ -13,8 +13,8 @@ from urlparse import urlparse
from urllib import unquote
from lxml import html, etree
from lxml.html import soupparser, HTMLParser
from lxml.etree import XPath, XMLParser
from lxml.html import soupparser
from lxml.etree import XPath
get_text = XPath("//text()")
from calibre import LoggingInterface, unicode_path
@ -298,8 +298,6 @@ class PreProcessor(object):
class Parser(PreProcessor, LoggingInterface):
PARSER = HTMLParser(recover=True)
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
LoggingInterface.__init__(self, logging.getLogger(name))
self.setup_cli_handler(opts.verbose)
@ -350,9 +348,7 @@ class Parser(PreProcessor, LoggingInterface):
for pat in ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = etree.HTML(src, self.PARSER)
if self.root is None:
raise ValueError('%s is empty'%self.htmlfile.path)
self.root = html.fromstring(src)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')

View File

@ -7,14 +7,11 @@ from lxml import etree
from calibre.ebooks.lrf import option_parser as lrf_option_parser
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
from calibre import isosx, setup_cli_handlers, __appname__
from calibre import setup_cli_handlers, __appname__
from calibre.libwand import convert, WandException
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.lrf.rtf.xsl import xhtml
UNRTF = 'unrtf'
if isosx and hasattr(sys, 'frameworks_dir'):
UNRTF = os.path.join(getattr(sys, 'frameworks_dir'), UNRTF)
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
def option_parser():
parser = lrf_option_parser(
@ -139,7 +136,10 @@ def generate_xml(rtfpath):
def generate_html(rtfpath, logger):
logger.info('Converting RTF to XML...')
xml = generate_xml(rtfpath)
try:
xml = generate_xml(rtfpath)
except RtfInvalidCodeException:
raise Exception(_('This RTF file has a feature calibre does not support. Convert it to HTML and then convert it.'))
tdir = os.path.dirname(xml)
cwd = os.getcwdu()
os.chdir(tdir)

View File

@ -5,7 +5,7 @@ from xml.parsers.expat import ExpatError
from functools import partial
from PyQt4.QtCore import Qt, SIGNAL, QObject, QCoreApplication, QUrl
from PyQt4.QtGui import QPixmap, QColor, QPainter, QMenu, QIcon, QMessageBox, \
QToolButton, QDialog, QDesktopServices
QToolButton, QDialog, QDesktopServices, QFileDialog
from PyQt4.QtSvg import QSvgRenderer
from calibre import __version__, __appname__, islinux, sanitize_file_name, \
@ -216,7 +216,17 @@ class Main(MainWindow, Ui_MainWindow):
self.show()
self.stack.setCurrentIndex(0)
db = LibraryDatabase2(self.library_path)
try:
db = LibraryDatabase2(self.library_path)
except OSError, err:
error_dialog(self, _('Bad database location'), unicode(err)).exec_()
dir = unicode(QFileDialog.getExistingDirectory(self,
_('Choose a location for your ebook library.'), os.path.expanduser('~')))
if not dir:
QCoreApplication.exit(1)
else:
self.library_path = dir
db = LibraryDatabase2(self.library_path)
self.library_view.set_database(db)
if self.olddb is not None:
QMessageBox.information(self, 'Database format changed',
@ -252,6 +262,8 @@ in which you want to store your books files. Any existing books will be automati
self.connect(self.tags_view, SIGNAL('tags_marked(PyQt_PyObject, PyQt_PyObject)'),
self.search.search_from_tokens)
self.connect(self.status_bar.tag_view_button, SIGNAL('toggled(bool)'), self.toggle_tags_view)
self.connect(self.search, SIGNAL('search(PyQt_PyObject, PyQt_PyObject)'),
self.tags_view.model().reinit)
########################### Cover Flow ################################
self.cover_flow = None
if CoverFlow is not None:
@ -1272,7 +1284,6 @@ in which you want to store your books files. Any existing books will be automati
home = os.path.dirname(self.database_path)
if not os.path.exists(home):
home = os.getcwd()
from PyQt4.QtGui import QFileDialog
dir = unicode(QFileDialog.getExistingDirectory(self,
_('Choose a location for your ebook library.'), home))
if not dir:

View File

@ -6,7 +6,6 @@ __docformat__ = 'restructuredtext en'
'''
Browsing book collection by tags.
'''
from PyQt4.Qt import QAbstractItemModel, Qt, QVariant, QTreeView, QModelIndex, \
QFont, SIGNAL, QSize, QColor, QIcon
@ -45,6 +44,8 @@ class TagsModel(QAbstractItemModel):
def __init__(self, db):
QAbstractItemModel.__init__(self)
self.db = db
self.ignore_next_search = False
self._data = {}
self.refresh()
self.bold_font = QFont()
self.bold_font.setBold(True)
@ -53,18 +54,40 @@ class TagsModel(QAbstractItemModel):
self.status_map = list(map(QVariant, self.status_map))
self.cmap = [QIcon(':/images/user_profile.svg'), QIcon(':/images/series.svg'), QIcon(':/images/book.svg'), QIcon(':/images/publisher.png'), QIcon(':/images/tags.svg')]
self.cmap = list(map(QVariant, self.cmap))
self.db.add_listener(self.database_changed)
def database_changed(self, event, ids):
    # Listener registered with LibraryDatabase2.add_listener(): rebuild the
    # whole category tree on any change event rather than patching it
    # incrementally. `event` (event name) and `ids` (affected book ids)
    # are both ignored here.
    self.refresh()
def refresh(self):
    '''
    Reload the category data from the database, carrying the tri-state
    selection (tag.state) over from the previous snapshot for any tag
    that still exists, then reset the model so views repaint.
    '''
    old_data = self._data
    self._data = self.db.get_categories()
    for key in self._data:
        self._data[key] = list(map(Tag, self._data[key]))
    for key, tags in old_data.items():
        for tag in tags:
            try:
                # KeyError: the whole category vanished; ValueError:
                # list.index found no matching tag. Either way the old
                # state is simply dropped. (The previous `index > -1`
                # check was dead code -- index() raises, it never
                # returns -1.)
                index = self._data[key].index(tag)
            except (KeyError, ValueError):
                continue
            self._data[key][index].state = tag.state
    self.reset()
def reinit(self, *args, **kwargs):
    '''
    Slot for the search box's search() signal: clear every tag's
    tri-state marker -- unless this search was triggered by toggle(),
    in which case the freshly-set marker must survive.
    '''
    if self.ignore_next_search:
        # toggle() initiated this search; consume the flag and keep state.
        self.ignore_next_search = False
        return
    for tags in self._data.values():
        for tag in tags:
            tag.state = 0
    self.reset()
def toggle(self, index):
    '''
    Cycle the tri-state marker of the tag at ``index`` through its three
    values. Returns True if a tag was toggled, False when ``index`` is a
    top-level category row (which has no state).
    '''
    parent = index.parent()
    if not parent.isValid():
        # Category headers are not toggleable.
        return False
    category = self.row_map[parent.row()]
    tag = self._data[category][index.row()]
    tag.state = (tag.state + 1) % 3
    # The search this triggers must not wipe the marker we just set.
    self.ignore_next_search = True
    self.emit(SIGNAL('dataChanged(QModelIndex,QModelIndex)'), index, index)
    return True

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
The database used to store ebook metadata
'''
import os, re, sys, shutil, cStringIO, glob, collections, textwrap, \
operator, itertools, functools
operator, itertools, functools, traceback
import sqlite3 as sqlite
from itertools import repeat
@ -356,6 +356,7 @@ class LibraryDatabase2(LibraryDatabase):
def __init__(self, library_path, row_factory=False):
if not os.path.exists(library_path):
os.makedirs(library_path)
self.listeners = set([])
self.library_path = os.path.abspath(library_path)
self.row_factory = row_factory
self.dbpath = os.path.join(library_path, 'metadata.db')
@ -486,7 +487,7 @@ class LibraryDatabase2(LibraryDatabase):
if cdata is not None:
open(os.path.join(tpath, 'cover.jpg'), 'wb').write(cdata)
for format in formats:
# Get data as string (cant use file as source and target files may be the same)
# Get data as string (can't use file as source and target files may be the same)
f = self.format(id, format, index_is_id=True, as_file=False)
if not f:
continue
@ -503,6 +504,22 @@ class LibraryDatabase2(LibraryDatabase):
if len(os.listdir(parent)) == 0:
shutil.rmtree(parent)
def add_listener(self, listener):
    '''
    Register a callback for database change events.

    :param listener: A callable invoked as ``listener(event, ids)`` where
                     ``event`` is the event name (e.g. ``'metadata'``,
                     ``'add'``, ``'delete'``) and ``ids`` is the list of
                     affected book ids. Exceptions raised by the callback
                     are caught and printed by notify(), not propagated.
    '''
    self.listeners.add(listener)
def notify(self, event, ids=None):
    '''
    Notify all registered listeners of a change event.

    :param event: Event name, e.g. ``'metadata'``, ``'add'`` or ``'delete'``.
    :param ids: List of affected book ids; defaults to an empty list.
                A ``None`` sentinel replaces the previous mutable default
                argument (``ids=[]``), which is shared across calls.
    '''
    if ids is None:
        ids = []
    for listener in self.listeners:
        try:
            listener(event, ids)
        except Exception:
            # A misbehaving listener must not stop the others from being
            # notified; log the traceback and carry on.
            traceback.print_exc()
def cover(self, index, index_is_id=False, as_file=False, as_image=False):
'''
Return the cover image as a bytestring (in JPEG format) or None.
@ -601,6 +618,7 @@ class LibraryDatabase2(LibraryDatabase):
self.conn.execute('INSERT INTO data (book,format,uncompressed_size,name) VALUES (?,?,?,?)',
(id, format.upper(), size, name))
self.conn.commit()
self.notify('metadata', [id])
def delete_book(self, id):
'''
@ -615,6 +633,8 @@ class LibraryDatabase2(LibraryDatabase):
shutil.rmtree(parent)
self.conn.execute('DELETE FROM books WHERE id=?', (id,))
self.conn.commit()
self.clean()
self.notify('delete', [id])
def remove_format(self, index, format, index_is_id=False):
id = index if index_is_id else self.id(index)
@ -630,6 +650,7 @@ class LibraryDatabase2(LibraryDatabase):
pass
self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format.upper()))
self.conn.commit()
self.notify('metadata', [id])
def clean(self):
'''
@ -668,16 +689,17 @@ class LibraryDatabase2(LibraryDatabase):
self.data.set(row, col, val)
if column == 'authors':
val = val.split('&,')
self.set_authors(id, val)
self.set_authors(id, val, notify=False)
elif column == 'title':
self.set_title(id, val)
self.set_title(id, val, notify=False)
elif column == 'publisher':
self.set_publisher(id, val)
self.set_publisher(id, val, notify=False)
elif column == 'rating':
self.set_rating(id, val)
elif column == 'tags':
self.set_tags(id, val.split(','), append=False)
self.set_tags(id, val.split(','), append=False, notify=False)
self.set_path(id, True)
self.notify('metadata', [id])
def set_metadata(self, id, mi):
'''
@ -690,24 +712,25 @@ class LibraryDatabase2(LibraryDatabase):
authors = []
for a in mi.authors:
authors += a.split('&')
self.set_authors(id, authors)
self.set_authors(id, authors, notify=False)
if mi.author_sort:
self.set_author_sort(id, mi.author_sort)
if mi.publisher:
self.set_publisher(id, mi.publisher)
self.set_publisher(id, mi.publisher, notify=False)
if mi.rating:
self.set_rating(id, mi.rating)
if mi.series:
self.set_series(id, mi.series)
self.set_series(id, mi.series, notify=False)
if mi.cover_data[1] is not None:
self.set_cover(id, mi.cover_data[1])
if mi.tags:
self.set_tags(id, mi.tags)
self.set_tags(id, mi.tags, notify=False)
if mi.comments:
self.set_comment(id, mi.comments)
self.set_path(id, True)
self.notify('metadata', [id])
def set_authors(self, id, authors):
def set_authors(self, id, authors, notify=True):
'''
`authors`: A list of authors.
'''
@ -729,14 +752,16 @@ class LibraryDatabase2(LibraryDatabase):
except sqlite.IntegrityError: # Sometimes books specify the same author twice in their metadata
pass
self.set_path(id, True)
self.notify('metadata', [id])
def set_title(self, id, title, notify=True):
    '''
    Set the title of the book identified by `id`, move its files to the
    matching path on disk and optionally fire a change event.

    :param notify: When False, suppress the 'metadata' notification.
                   Aggregate setters (set_metadata, set) pass False so
                   listeners receive a single event instead of one per
                   field. Previously the flag was accepted but ignored.
    '''
    if not title:
        return
    self.conn.execute('UPDATE books SET title=? WHERE id=?', (title, id))
    self.set_path(id, True)
    if notify:
        self.notify('metadata', [id])
def set_publisher(self, id, publisher):
def set_publisher(self, id, publisher, notify=True):
self.conn.execute('DELETE FROM books_publishers_link WHERE book=?',(id,))
self.conn.execute('DELETE FROM publishers WHERE (SELECT COUNT(id) FROM books_publishers_link WHERE publisher=publishers.id) < 1')
if publisher:
@ -747,8 +772,9 @@ class LibraryDatabase2(LibraryDatabase):
aid = self.conn.execute('INSERT INTO publishers(name) VALUES (?)', (publisher,)).lastrowid
self.conn.execute('INSERT INTO books_publishers_link(book, publisher) VALUES (?,?)', (id, aid))
self.conn.commit()
self.notify('metadata', [id])
def set_tags(self, id, tags, append=False):
def set_tags(self, id, tags, append=False, notify=True):
'''
@param tags: list of strings
@param append: If True existing tags are not removed
@ -771,9 +797,10 @@ class LibraryDatabase2(LibraryDatabase):
self.conn.execute('INSERT INTO books_tags_link(book, tag) VALUES (?,?)',
(id, tid))
self.conn.commit()
self.notify('metadata', [id])
def set_series(self, id, series):
def set_series(self, id, series, notify=True):
self.conn.execute('DELETE FROM books_series_link WHERE book=?',(id,))
self.conn.execute('DELETE FROM series WHERE (SELECT COUNT(id) FROM books_series_link WHERE series=series.id) < 1')
if series:
@ -790,8 +817,9 @@ class LibraryDatabase2(LibraryDatabase):
self.data.set(row, 9, series)
except ValueError:
pass
self.notify('metadata', [id])
def set_series_index(self, id, idx):
def set_series_index(self, id, idx, notify=True):
if idx is None:
idx = 1
idx = int(idx)
@ -803,6 +831,7 @@ class LibraryDatabase2(LibraryDatabase):
self.data.set(row, 10, idx)
except ValueError:
pass
self.notify('metadata', [id])
def add_books(self, paths, formats, metadata, uris=[], add_duplicates=True):
'''
@ -811,6 +840,7 @@ class LibraryDatabase2(LibraryDatabase):
'''
formats, metadata, uris = iter(formats), iter(metadata), iter(uris)
duplicates = []
ids = []
for path in paths:
mi = metadata.next()
format = formats.next()
@ -826,6 +856,7 @@ class LibraryDatabase2(LibraryDatabase):
obj = self.conn.execute('INSERT INTO books(title, uri, series_index, author_sort) VALUES (?, ?, ?, ?)',
(mi.title, uri, series_index, aus))
id = obj.lastrowid
ids.append(id)
self.set_path(id, True)
self.conn.commit()
self.set_metadata(id, mi)
@ -859,7 +890,7 @@ class LibraryDatabase2(LibraryDatabase):
stream = open(path, 'rb')
self.add_format(id, ext, stream, index_is_id=True)
self.conn.commit()
self.notify('add', [id])
def move_library_to(self, newloc):
if not os.path.exists(newloc):

View File

@ -10,9 +10,56 @@ from calibre.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from calibre.web.fetch.simple import option_parser as _option_parser
from calibre.web.feeds.news import Profile2Recipe, BasicNewsRecipe
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from calibre.utils.config import Config, StringConfig
def option_parser(usage=_('''\
def config(defaults=None):
    '''
    Build the option set for the feeds2disk periodical download engine.

    :param defaults: When None, a file-backed ``Config`` named
                     'feeds2disk' is created; otherwise a ``StringConfig``
                     is built on top of the given defaults string.
    :return: The populated Config/StringConfig instance.
    '''
    desc = _('Options to control the fetching of periodical content from the web.')
    c = Config('feeds2disk', desc) if defaults is None else StringConfig(defaults, desc)

    # Options shared with the low-level web2disk fetcher.
    web2disk = c.add_group('web2disk', _('Customize the download engine'))
    web2disk('timeout', ['-t', '--timeout'], default=10.0,
             help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),)
    web2disk('delay', ['--delay'], default=0,
             help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    web2disk('encoding', ['--encoding'], default=None,
             help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    web2disk('match_regexps', ['--match-regexp'], default=[], action='append',
             help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
    web2disk('filter_regexps', ['--filter-regexp'], default=[], action='append',
             help=_('Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.'))
    web2disk('no_stylesheets', ['--dont-download-stylesheets'], action='store_true', default=False,
             help=_('Do not download CSS stylesheets.'))

    c.add_option('feeds', ['--feeds'], default=None,
                 help=_('''Specify a list of feeds to download. For example:
"['http://feeds.newsweek.com/newsweek/TopNews', 'http://feeds.newsweek.com/headlines/politics']"
If you specify this option, any argument to %prog is ignored and a default recipe is used to download the feeds.'''))
    c.add_option('verbose', ['-v', '--verbose'], default=0, action='count',
                 help=_('''Be more verbose while processing.'''))
    c.add_option('title', ['--title'], default=None,
                 help=_('The title for this recipe. Used as the title for any ebooks created from the downloaded feeds.'))
    c.add_option('username', ['-u', '--username'], default=None,
                 help=_('Username for sites that require a login to access content.'))
    c.add_option('password', ['-p', '--password'], default=None,
                 help=_('Password for sites that require a login to access content.'))
    c.add_option('lrf', ['--lrf'], default=False, action='store_true',
                 help='Optimize fetching for subsequent conversion to LRF.')
    c.add_option('epub', ['--epub'], default=False, action='store_true',
                 help='Optimize fetching for subsequent conversion to EPUB.')
    c.add_option('recursions', ['--recursions'], default=0,
                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    c.add_option('output_dir', ['--output-dir'], default='.',
                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    c.add_option('no_progress_bar', ['--no-progress-bar'], default=False, action='store_true',
                 help=_("Don't show the progress bar"))
    c.add_option('debug', ['--debug'], action='store_true', default=False,
                 help=_('Very verbose output, useful for debugging.'))
    c.add_option('test', ['--test'], action='store_true', default=False,
                 help=_('Useful for recipe development. Forces max_articles_per_feed to 2 and downloads at most 2 feeds.'))
    return c
USAGE=_('''\
%%prog [options] ARG
%%prog parses an online source of articles, like an RSS or ATOM feed and
@ -28,7 +75,9 @@ recipe as a string - %%prog will load the recipe directly from the string arg.
Available builtin recipes are:
%s
''')%(unicode(list(titles))[1:-1])):
''')%(unicode(list(titles))[1:-1])
def option_parser(usage=USAGE):
p = _option_parser(usage=usage)
p.remove_option('--max-recursions')
p.remove_option('--base-dir')
@ -51,7 +100,7 @@ If you specify this option, any argument to %prog is ignored and a default recip
help=_('Number of levels of links to follow on webpages that are linked to from feeds. Defaul %default'))
p.add_option('--output-dir', default=os.getcwd(),
help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
p.add_option('--no-progress-bar', dest='no_progress_bar', default=False, action='store_true',
help=_('Dont show the progress bar'))
p.add_option('--debug', action='store_true', default=False,
help=_('Very verbose output, useful for debugging.'))
@ -67,7 +116,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if notification is None:
from calibre.utils.terminfo import TerminalController, ProgressBar
term = TerminalController(sys.stdout)
pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=not opts.progress_bar)
pb = ProgressBar(term, _('Fetching feeds...'), no_progress_bar=opts.no_progress_bar)
notification = pb.update
recipe, is_profile = None, False
@ -76,14 +125,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
else:
try:
if os.access(recipe_arg, os.R_OK):
try:
recipe = compile_recipe(open(recipe_arg).read())
is_profile = DefaultProfile in recipe.__bases__ or \
FullContentProfile in recipe.__bases__
except:
import traceback
traceback.print_exc()
return 1
recipe = compile_recipe(open(recipe_arg).read())
is_profile = DefaultProfile in recipe.__bases__ or \
FullContentProfile in recipe.__bases__
else:
raise Exception('not file')
except:

View File

@ -18,7 +18,7 @@ from calibre.ebooks.metadata.opf import OPFCreator
from calibre.ebooks.lrf import entity_to_unicode
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
@ -138,6 +138,9 @@ class BasicNewsRecipe(object, LoggingInterface):
#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
html2lrf_options = []
#: Options to pass to html2epub to customize generation of EPUB ebooks.
html2epub_options = ''
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
#:
@ -594,9 +597,9 @@ class BasicNewsRecipe(object, LoggingInterface):
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
    '''
    Write the embedded content of `article` to a temporary HTML file and
    fetch it through the normal article pipeline via _fetch_article.

    The temp-file handle is deliberately named ``pt`` rather than ``f``:
    the stale duplicate block that wrote to a variable named ``f``
    clobbered the feed-index parameter ``f`` that must be forwarded to
    _fetch_article, and has been removed.
    '''
    templ = templates.EmbeddedContent()
    raw = templ.generate(article).render('html')
    with PersistentTemporaryFile('_feeds2disk.html') as pt:
        pt.write(raw)
    # file:// URLs need the extra slashes everywhere except Windows.
    url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
    return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
@ -643,7 +646,6 @@ class BasicNewsRecipe(object, LoggingInterface):
url = article.url
if not url:
continue
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
@ -819,13 +821,21 @@ class BasicNewsRecipe(object, LoggingInterface):
else:
title, url = obj
self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
f = self.browser.open(url)
parsed_feeds.append(feed_from_xml(f.read(),
title=title,
oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
get_article_url=self.get_article_url))
f.close()
try:
with closing(self.browser.open(url)) as f:
parsed_feeds.append(feed_from_xml(f.read(),
title=title,
oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
get_article_url=self.get_article_url))
except Exception, err:
feed = Feed()
msg = 'Failed feed: %s'%(title if title else url)
feed.populate_from_preparsed_feed(msg, [])
feed.description = unicode(err)
parsed_feeds.append(feed)
self.log_exception(msg)
return parsed_feeds
@classmethod