KG updates

This commit is contained in:
GRiker 2011-06-24 06:27:06 -06:00
commit 64ae406db9
10 changed files with 562 additions and 379 deletions

View File

@ -14,7 +14,7 @@ class LeTemps(BasicNewsRecipe):
title = u'Le Temps'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'Sujata Raman'
__author__ = 'Kovid Goyal'
description = 'French news. Needs a subscription from http://www.letemps.ch'
no_stylesheets = True
remove_javascript = True
@ -27,6 +27,7 @@ class LeTemps(BasicNewsRecipe):
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
br.open('http://www.letemps.ch/login')
br.select_form(nr=1)
br['username'] = self.username
br['password'] = self.password
raw = br.submit().read()

View File

@ -875,7 +875,7 @@ class ActionCopyToLibrary(InterfaceActionBase):
class ActionTweakEpub(InterfaceActionBase):
name = 'Tweak ePub'
actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'
description = _('Make small twekas to epub files in your calibre library')
description = _('Make small tweaks to epub files in your calibre library')
class ActionNextMatch(InterfaceActionBase):
name = 'Next Match'

View File

@ -1,96 +1,235 @@
#!/usr/bin/env python
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
'2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files'''
import os
import datetime
from functools import partial
from base64 import b64decode
from lxml import etree
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.date import parse_date
from calibre import guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.chardet import xml_to_unicode
from calibre import guess_all_extensions
XLINK_NS = 'http://www.w3.org/1999/xlink'
def XLINK(name):
return '{%s}%s' % (XLINK_NS, name)
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'xlink' : 'http://www.w3.org/1999/xlink' }
XPath = partial(etree.XPath, namespaces=NAMESPACES)
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
""" Return metadata as a L{MetaInfo} object """
XPath = lambda x : etree.XPath(x,
namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
'xlink':XLINK_NS})
tostring = lambda x : etree.tostring(x, method='text',
encoding=unicode).strip()
""" Return fb2 metadata as a L{MetaInformation} object """
root = _get_fbroot(stream)
book_title = _parse_book_title(root)
authors = _parse_authors(root)
# fallback for book_title
if book_title:
book_title = unicode(book_title)
else:
book_title = force_unicode(os.path.splitext(
os.path.basename(getattr(stream, 'name',
_('Unknown'))))[0])
mi = MetaInformation(book_title, authors)
try:
_parse_cover(root, mi)
except:
pass
try:
_parse_comments(root, mi)
except:
pass
try:
_parse_tags(root, mi)
except:
pass
try:
_parse_series(root, mi)
except:
pass
try:
_parse_isbn(root, mi)
except:
pass
try:
_parse_publisher(root, mi)
except:
pass
try:
_parse_pubdate(root, mi)
except:
pass
try:
_parse_timestamp(root, mi)
except:
pass
try:
_parse_language(root, mi)
except:
pass
#_parse_uuid(root, mi)
#if DEBUG:
# prints(mi)
return mi
def _parse_authors(root):
authors = []
# pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>, <document-info>
for author_sec in ['title-info', 'src-title-info', 'document-info']:
for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
author = _parse_author(au)
if author:
authors.append(author)
if author:
break
# if no author so far
if not authors:
authors.append(_('Unknown'))
return authors
def _parse_author(elm_author):
""" Returns a list of display author and sortable author"""
xp_templ = 'normalize-space(fb2:%s/text())'
author = XPath(xp_templ % 'first-name')(elm_author)
lname = XPath(xp_templ % 'last-name')(elm_author)
mname = XPath(xp_templ % 'middle-name')(elm_author)
if mname:
author = (author + ' ' + mname).strip()
if lname:
author = (author + ' ' + lname).strip()
# fallback to nickname
if not author:
nname = XPath(xp_templ % 'nickname')(elm_author)
if nname:
author = nname
return author
def _parse_book_title(root):
# <title-info> has a priority. (actually <title-info> is mandatory)
# other are backup solution (sequence is important. other then in fb2-doc)
xp_ti = '//fb2:title-info/fb2:book-title/text()'
xp_pi = '//fb2:publish-info/fb2:book-title/text()'
xp_si = '//fb2:src-title-info/fb2:book-title/text()'
book_title = XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
return book_title
def _parse_cover(root, mi):
# pickup from <title-info>, if not exists it fallbacks to <src-title-info>
imgid = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
if imgid:
try:
_parse_cover_data(root, imgid, mi)
except:
pass
def _parse_cover_data(root, imgid, mi):
elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
mime_extensions = guess_all_extensions(mimetype)
if mime_extensions:
pic_data = elm_binary[0].text
if pic_data:
mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
else:
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
def _parse_tags(root, mi):
# pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>
for genre_sec in ['title-info', 'src-title-info']:
# -- i18n Translations-- ?
tags = XPath('//fb2:%s/fb2:genre/text()' % genre_sec)(root)
if tags:
mi.tags = list(map(unicode, tags))
break
def _parse_series(root, mi):
#calibri supports only 1 series: use the 1-st one
# pick up sequence but only from 1 secrion in prefered order
# except <src-title-info>
xp_ti = '//fb2:title-info/fb2:sequence[1]'
xp_pi = '//fb2:publish-info/fb2:sequence[1]'
elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
if elms_sequence:
mi.series = elms_sequence[0].get('name', None)
if mi.series:
mi.series_index = elms_sequence[0].get('number', None)
def _parse_isbn(root, mi):
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case
if ',' in isbn:
isbn = isbn[:isbn.index(',')]
if check_isbn(isbn):
mi.isbn = isbn
def _parse_comments(root, mi):
# pick up annotation but only from 1 secrion <title-info>; fallback: <src-title-info>
for annotation_sec in ['title-info', 'src-title-info']:
elms_annotation = XPath('//fb2:%s/fb2:annotation' % annotation_sec)(root)
if elms_annotation:
mi.comments = tostring(elms_annotation[0])
# TODO: tags i18n, xslt?
break
def _parse_publisher(root, mi):
publisher = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
if publisher:
mi.publisher = publisher
def _parse_pubdate(root, mi):
year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
if float.is_integer(year):
# only year is available, so use 1-st of Jan
mi.pubdate = datetime.date(int(year), 1, 1)
def _parse_timestamp(root, mi):
#<date value="1996-12-03">03.12.1996</date>
xp ='//fb2:document-info/fb2:date/@value|'\
'//fb2:document-info/fb2:date/text()'
docdate = XPath('string(%s)' % xp)(root)
if docdate:
mi.timestamp = parse_date(docdate)
def _parse_language(root, mi):
language = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
if language:
mi.language = language
mi.languages = [ language ]
def _parse_uuid(root, mi):
uuid = XPath('normalize-space(//document-info/fb2:id/text())')(root)
if uuid:
mi.uuid = uuid
def _get_fbroot(stream):
parser = etree.XMLParser(recover=True, no_network=True)
raw = stream.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True)[0]
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = etree.fromstring(raw, parser=parser)
authors, author_sort = [], None
for au in XPath('//fb2:author')(root):
fname = lname = author = None
fe = XPath('descendant::fb2:first-name')(au)
if fe:
fname = tostring(fe[0])
author = fname
le = XPath('descendant::fb2:last-name')(au)
if le:
lname = tostring(le[0])
if author:
author += ' '+lname
else:
author = lname
if author:
authors.append(author)
if len(authors) == 1 and author is not None:
if lname:
author_sort = lname
if fname:
if author_sort: author_sort += ', '+fname
else: author_sort = fname
title = os.path.splitext(os.path.basename(getattr(stream, 'name',
_('Unknown'))))[0]
for x in XPath('//fb2:book-title')(root):
title = tostring(x)
break
comments = ''
for x in XPath('//fb2:annotation')(root):
comments += tostring(x)
if not comments:
comments = None
tags = list(map(tostring, XPath('//fb2:genre')(root)))
return root
cp = XPath('//fb2:coverpage')(root)
cdata = None
if cp:
cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
if cimage:
id = cimage[0].get(XLINK('href')).replace('#', '')
binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
if binary:
mt = binary[0].get('content-type', 'image/jpeg')
exts = guess_all_extensions(mt)
if not exts:
exts = ['.jpg']
cdata = (exts[0][1:], b64decode(tostring(binary[0])))
series = None
series_index = 1.0
for x in XPath('//fb2:sequence')(root):
series = x.get('name', None)
if series is not None:
series_index = x.get('number', 1.0)
break
mi = MetaInformation(title, authors)
mi.comments = comments
mi.author_sort = author_sort
if tags:
mi.tags = tags
mi.series = series
mi.series_index = series_index
if cdata:
mi.cover_data = cdata
return mi

View File

@ -591,8 +591,10 @@ class BooksView(QTableView): # {{{
fmt = prefs['output_format']
def url_for_id(i):
ans = db.format(i, fmt, index_is_id=True, as_path=True,
preserve_filename=True)
try:
ans = db.format_path(i, fmt, index_is_id=True)
except:
ans = None
if ans is None:
fmts = db.formats(i, index_is_id=True)
if fmts:
@ -600,13 +602,15 @@ class BooksView(QTableView): # {{{
else:
fmts = []
for f in fmts:
ans = db.format(i, f, index_is_id=True, as_path=True,
preserve_filename=True)
try:
ans = db.format_path(i, f, index_is_id=True)
except:
ans = None
if ans is None:
ans = db.abspath(i, index_is_id=True)
return QUrl.fromLocalFile(ans)
md.setUrls([url_for_id(i) for i in selected[:25]])
md.setUrls([url_for_id(i) for i in selected])
drag = QDrag(self)
col = self.selectionModel().currentIndex().column()
md.column_name = self.column_map[col]

View File

@ -149,7 +149,8 @@ class TagsView(QTreeView): # {{{
hidden_categories=self.hidden_categories,
search_restriction=None,
drag_drop_finished=self.drag_drop_finished,
collapse_model=self.collapse_model)
collapse_model=self.collapse_model,
state_map={})
self.pane_is_visible = True # because TagsModel.init did a recount
self.sort_by = sort_by
self.tag_match = tag_match
@ -173,6 +174,7 @@ class TagsView(QTreeView): # {{{
self.made_connections = True
self.refresh_signal_processed = True
db.add_listener(self.database_changed)
self.expanded.connect(self.item_expanded)
def database_changed(self, event, ids):
if self.refresh_signal_processed:
@ -541,6 +543,10 @@ class TagsView(QTreeView): # {{{
return self.isExpanded(idx)
def recount(self, *args):
'''
Rebuild the category tree, expand any categories that were expanded,
reset the search states, and reselect the current node.
'''
if self.disable_recounting or not self.pane_is_visible:
return
self.refresh_signal_processed = True
@ -548,18 +554,23 @@ class TagsView(QTreeView): # {{{
if not ci.isValid():
ci = self.indexAt(QPoint(10, 10))
path = self.model().path_for_index(ci) if self.is_visible(ci) else None
try:
if not self.model().refresh(): # categories changed!
self.set_new_model()
path = None
except: #Database connection could be closed if an integrity check is happening
pass
expanded_categories, state_map = self.model().get_state()
self.set_new_model(state_map=state_map)
for category in expanded_categories:
self.expand(self.model().index_for_category(category))
self._model.show_item_at_path(path)
# If the number of user categories changed, if custom columns have come or
# gone, or if columns have been hidden or restored, we must rebuild the
# model. Reason: it is much easier than reconstructing the browser tree.
def set_new_model(self, filter_categories_by=None):
def item_expanded(self, idx):
'''
Called by the expanded signal
'''
self.setCurrentIndex(idx)
def set_new_model(self, filter_categories_by=None, state_map={}):
'''
There are cases where we need to rebuild the category tree without
attempting to reposition the current node.
'''
try:
old = getattr(self, '_model', None)
if old is not None:
@ -569,7 +580,8 @@ class TagsView(QTreeView): # {{{
search_restriction=self.search_restriction,
drag_drop_finished=self.drag_drop_finished,
filter_categories_by=filter_categories_by,
collapse_model=self.collapse_model)
collapse_model=self.collapse_model,
state_map=state_map)
self.setModel(self._model)
except:
# The DB must be gone. Set the model to None and hope that someone
@ -752,7 +764,8 @@ class TagsModel(QAbstractItemModel): # {{{
def __init__(self, db, parent, hidden_categories=None,
search_restriction=None, drag_drop_finished=None,
filter_categories_by=None, collapse_model='disable'):
filter_categories_by=None, collapse_model='disable',
state_map={}):
QAbstractItemModel.__init__(self, parent)
# must do this here because 'QPixmap: Must construct a QApplication
@ -776,10 +789,10 @@ class TagsModel(QAbstractItemModel): # {{{
self.filter_categories_by = filter_categories_by
self.collapse_model = collapse_model
# get_node_tree cannot return None here, because row_map is empty. Note
# that get_node_tree can indirectly change the user_categories dict.
# Note that _get_category_nodes can indirectly change the
# user_categories dict.
data = self.get_node_tree(config['sort_tags_by'])
data = self._get_category_nodes(config['sort_tags_by'])
gst = db.prefs.get('grouped_search_terms', {})
self.root_item = TagTreeItem(icon_map=self.icon_state_map)
self.category_nodes = []
@ -844,7 +857,7 @@ class TagsModel(QAbstractItemModel): # {{{
category_node_map[key] = node
last_category_node = node
self.category_nodes.append(node)
self.refresh(data=data)
self._create_node_tree(data, state_map)
def break_cycles(self):
self.root_item.break_cycles()
@ -1121,8 +1134,10 @@ class TagsModel(QAbstractItemModel): # {{{
def set_search_restriction(self, s):
self.search_restriction = s
def get_node_tree(self, sort):
old_row_map_len = len(self.row_map)
def _get_category_nodes(self, sort):
'''
Called by __init__. Do not directly call this method.
'''
self.row_map = []
self.categories = {}
@ -1176,19 +1191,27 @@ class TagsModel(QAbstractItemModel): # {{{
if category in data: # The search category can come and go
self.row_map.append(category)
self.categories[category] = tb_categories[category]['name']
if old_row_map_len != 0 and old_row_map_len != len(self.row_map):
# A category has been added or removed. We must force a rebuild of
# the model
return None
return data
def refresh(self, data=None):
'''
Here to trap usages of refresh in the old architecture. Can eventually
be removed.
'''
print 'TagsModel: refresh called!'
traceback.print_stack()
return False
def _create_node_tree(self, data, state_map):
'''
Called by __init__. Do not directly call this method.
'''
sort_by = config['sort_tags_by']
if data is None:
data = self.get_node_tree(sort_by) # get category data
if data is None:
return False
print '_create_node_tree: no data!'
traceback.print_stack()
return
collapse = gprefs['tags_browser_collapse_at']
collapse_model = self.collapse_model
@ -1354,29 +1377,23 @@ class TagsModel(QAbstractItemModel): # {{{
# }}}
for category in self.category_nodes:
if len(category.children) > 0:
child_map = category.children
states = [c.tag.state for c in category.child_tags()]
names = [(c.tag.name, c.tag.category) for c in category.child_tags()]
state_map = dict(izip(names, states))
# temporary sub-categories (the partitioning ones) must follow
# the permanent sub-categories. This will happen naturally if
# the temp ones are added by process_node
ctags = [c for c in child_map if
c.type == TagTreeItem.CATEGORY and not c.temporary]
start = len(ctags)
self.beginRemoveRows(self.createIndex(category.row(), 0, category),
start, len(child_map)-1)
category.children = ctags
for i in range(start, len(child_map)):
child_map[i].break_cycles()
child_map = None
self.endRemoveRows()
else:
state_map = {}
process_one_node(category, state_map.get(category.py_name, {}))
process_one_node(category, state_map)
return True
def get_state(self):
state_map = {}
expanded_categories = []
for row, category in enumerate(self.category_nodes):
if self.tags_view.isExpanded(self.index(row, 0, QModelIndex())):
expanded_categories.append(category.py_name)
states = [c.tag.state for c in category.child_tags()]
names = [(c.tag.name, c.tag.category) for c in category.child_tags()]
state_map[category.py_name] = dict(izip(names, states))
return expanded_categories, state_map
def index_for_category(self, name):
for row, category in enumerate(self.category_nodes):
if category.py_name == name:
return self.index(row, 0, QModelIndex())
def columnCount(self, parent):
return 1
@ -1476,7 +1493,7 @@ class TagsModel(QAbstractItemModel): # {{{
self.tags_view.tag_item_renamed.emit()
item.tag.name = val
self.rename_item_in_all_user_categories(name, key, val)
self.refresh() # Should work, because no categories can have disappeared
self.refresh_required.emit()
self.show_item_at_path(path)
return True
@ -1789,19 +1806,22 @@ class TagsModel(QAbstractItemModel): # {{{
return v
return None
def show_item_at_path(self, path, box=False):
def show_item_at_path(self, path, box=False,
position=QTreeView.PositionAtCenter):
'''
Scroll the browser and open categories to show the item referenced by
path. If possible, the item is placed in the center. If box=True, a
box is drawn around the item.
'''
if path:
self.show_item_at_index(self.index_for_path(path), box)
self.show_item_at_index(self.index_for_path(path), box=box,
position=position)
def show_item_at_index(self, idx, box=False):
def show_item_at_index(self, idx, box=False,
position=QTreeView.PositionAtCenter):
if idx.isValid():
self.tags_view.setCurrentIndex(idx)
self.tags_view.scrollTo(idx, QTreeView.PositionAtCenter)
self.tags_view.scrollTo(idx, position)
if box:
tag_item = idx.internalPointer()
tag_item.boxed = True

View File

@ -1144,6 +1144,20 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
break
return sha.hexdigest()
def format_path(self, index, fmt, index_is_id=False):
'''
This method is intended to be used only in those rare situations, like
Drag'n Drop, when you absolutely need the path to the original file.
Otherwise, use format(..., as_path=True).
Note that a networked backend will always return None.
'''
path = self.format_abspath(index, fmt, index_is_id=index_is_id)
if path is None:
id_ = index if index_is_id else self.id(index)
raise NoSuchFormat('Record %d has no format: %s'%(id_, fmt))
return path
def format_abspath(self, index, format, index_is_id=False):
'''
Return absolute path to the ebook file of format `format`

View File

@ -633,6 +633,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec
:guilabel:`Formatting Style: None`
Applies no special formatting to the text, the document is converted to html with no other changes.
.. _pdfconversion:
Convert PDF documents
~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -35,29 +35,11 @@ What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order of decreasing preference: LIT, MOBI, EPUB, FB2, HTML, PRC, RTF, PDB, TXT, PDF
Why does the PDF conversion lose some images/tables?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The PDF conversion tries to extract the text and images from the PDF file and convert them to and HTML based ebook. Some PDF files have images in a format that cannot be extracted (vector images). All tables
are also represented as vector diagrams, thus they cannot be extracted.
I converted a PDF file, but the result has various problems?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
How do I convert a collection of HTML files in a specific order?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order to convert a collection of HTML files in a specific oder, you have to create a table of contents file. That is, another HTML file that contains links to all the other files in the desired order. Such a file looks like::
PDF is a terrible format to convert from. For a list of the various issues you will encounter when converting PDF, see: :ref:`pdfconversion`.
<html>
<body>
<h1>Table of Contents</h1>
<p style="text-indent:0pt">
<a href="file1.html">First File</a><br/>
<a href="file2.html">Second File</a><br/>
.
.
.
</p>
</body>
</html>
Then just add this HTML file to the GUI and use the convert button to create your ebook.
.. _char-encoding-faq:
@ -85,6 +67,26 @@ If you have a hand edited TOC in the input document, you can use the TOC detecti
Finally, I encourage you to ditch the content TOC and only have a metadata TOC in your ebooks. Metadata TOCs will give the people reading your ebooks a much superior navigation experience (except on the Kindle, where they are essentially the same as a content TOC).
How do I convert a collection of HTML files in a specific order?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order to convert a collection of HTML files in a specific oder, you have to create a table of contents file. That is, another HTML file that contains links to all the other files in the desired order. Such a file looks like::
<html>
<body>
<h1>Table of Contents</h1>
<p style="text-indent:0pt">
<a href="file1.html">First File</a><br/>
<a href="file2.html">Second File</a><br/>
.
.
.
</p>
</body>
</html>
Then just add this HTML file to the GUI and use the convert button to create your ebook.
How do I use some of the advanced features of the conversion tools?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can get help on any individual feature of the converters by mousing over it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal. A good place to start is to look at the following demo files that demonstrate some of the advanced features:

File diff suppressed because it is too large Load Diff

View File

@ -208,6 +208,8 @@ def gc_histogram():
def diff_hists(h1, h2):
"""Prints differences between two results of gc_histogram()."""
for k in h1:
if k not in h2:
h2[k] = 0
if h1[k] != h2[k]:
print "%s: %d -> %d (%s%d)" % (
k, h1[k], h2[k], h2[k] > h1[k] and "+" or "", h2[k] - h1[k])