KG updates

This commit is contained in:
GRiker 2011-06-24 06:27:06 -06:00
commit 64ae406db9
10 changed files with 562 additions and 379 deletions

View File

@ -14,7 +14,7 @@ class LeTemps(BasicNewsRecipe):
title = u'Le Temps' title = u'Le Temps'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
__author__ = 'Sujata Raman' __author__ = 'Kovid Goyal'
description = 'French news. Needs a subscription from http://www.letemps.ch' description = 'French news. Needs a subscription from http://www.letemps.ch'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
@ -27,6 +27,7 @@ class LeTemps(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open('http://www.letemps.ch/login') br.open('http://www.letemps.ch/login')
br.select_form(nr=1)
br['username'] = self.username br['username'] = self.username
br['password'] = self.password br['password'] = self.password
raw = br.submit().read() raw = br.submit().read()

View File

@ -875,7 +875,7 @@ class ActionCopyToLibrary(InterfaceActionBase):
class ActionTweakEpub(InterfaceActionBase): class ActionTweakEpub(InterfaceActionBase):
name = 'Tweak ePub' name = 'Tweak ePub'
actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction' actual_plugin = 'calibre.gui2.actions.tweak_epub:TweakEpubAction'
description = _('Make small twekas to epub files in your calibre library') description = _('Make small tweaks to epub files in your calibre library')
class ActionNextMatch(InterfaceActionBase): class ActionNextMatch(InterfaceActionBase):
name = 'Next Match' name = 'Next Match'

View File

@ -1,96 +1,235 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>' __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
'2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files''' '''Read meta information from fb2 files'''
import os import os
import datetime
from functools import partial
from base64 import b64decode from base64 import b64decode
from lxml import etree from lxml import etree
from calibre.ebooks.metadata import MetaInformation from calibre.utils.date import parse_date
from calibre import guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre import guess_all_extensions
XLINK_NS = 'http://www.w3.org/1999/xlink'
def XLINK(name):
return '{%s}%s' % (XLINK_NS, name)
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'xlink' : 'http://www.w3.org/1999/xlink' }
XPath = partial(etree.XPath, namespaces=NAMESPACES)
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream): def get_metadata(stream):
""" Return metadata as a L{MetaInfo} object """ """ Return fb2 metadata as a L{MetaInformation} object """
XPath = lambda x : etree.XPath(x,
namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0', root = _get_fbroot(stream)
'xlink':XLINK_NS})
tostring = lambda x : etree.tostring(x, method='text', book_title = _parse_book_title(root)
encoding=unicode).strip() authors = _parse_authors(root)
parser = etree.XMLParser(recover=True, no_network=True)
raw = stream.read() # fallback for book_title
raw = xml_to_unicode(raw, strip_encoding_pats=True, if book_title:
assume_utf8=True)[0] book_title = unicode(book_title)
root = etree.fromstring(raw, parser=parser)
authors, author_sort = [], None
for au in XPath('//fb2:author')(root):
fname = lname = author = None
fe = XPath('descendant::fb2:first-name')(au)
if fe:
fname = tostring(fe[0])
author = fname
le = XPath('descendant::fb2:last-name')(au)
if le:
lname = tostring(le[0])
if author:
author += ' '+lname
else: else:
author = lname book_title = force_unicode(os.path.splitext(
os.path.basename(getattr(stream, 'name',
_('Unknown'))))[0])
mi = MetaInformation(book_title, authors)
try:
_parse_cover(root, mi)
except:
pass
try:
_parse_comments(root, mi)
except:
pass
try:
_parse_tags(root, mi)
except:
pass
try:
_parse_series(root, mi)
except:
pass
try:
_parse_isbn(root, mi)
except:
pass
try:
_parse_publisher(root, mi)
except:
pass
try:
_parse_pubdate(root, mi)
except:
pass
try:
_parse_timestamp(root, mi)
except:
pass
try:
_parse_language(root, mi)
except:
pass
#_parse_uuid(root, mi)
#if DEBUG:
# prints(mi)
return mi
def _parse_authors(root):
authors = []
# pick up authors but only from 1 section <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>, <document-info>
for author_sec in ['title-info', 'src-title-info', 'document-info']:
for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
author = _parse_author(au)
if author: if author:
authors.append(author) authors.append(author)
if len(authors) == 1 and author is not None: if author:
break
# if no author so far
if not authors:
authors.append(_('Unknown'))
return authors
def _parse_author(elm_author):
""" Returns a list of display author and sortable author"""
xp_templ = 'normalize-space(fb2:%s/text())'
author = XPath(xp_templ % 'first-name')(elm_author)
lname = XPath(xp_templ % 'last-name')(elm_author)
mname = XPath(xp_templ % 'middle-name')(elm_author)
if mname:
author = (author + ' ' + mname).strip()
if lname: if lname:
author_sort = lname author = (author + ' ' + lname).strip()
if fname:
if author_sort: author_sort += ', '+fname
else: author_sort = fname
title = os.path.splitext(os.path.basename(getattr(stream, 'name',
_('Unknown'))))[0]
for x in XPath('//fb2:book-title')(root):
title = tostring(x)
break
comments = ''
for x in XPath('//fb2:annotation')(root):
comments += tostring(x)
if not comments:
comments = None
tags = list(map(tostring, XPath('//fb2:genre')(root)))
cp = XPath('//fb2:coverpage')(root) # fallback to nickname
cdata = None if not author:
if cp: nname = XPath(xp_templ % 'nickname')(elm_author)
cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0]) if nname:
if cimage: author = nname
id = cimage[0].get(XLINK('href')).replace('#', '')
binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
if binary:
mt = binary[0].get('content-type', 'image/jpeg')
exts = guess_all_extensions(mt)
if not exts:
exts = ['.jpg']
cdata = (exts[0][1:], b64decode(tostring(binary[0])))
series = None return author
series_index = 1.0
for x in XPath('//fb2:sequence')(root):
series = x.get('name', None) def _parse_book_title(root):
if series is not None: # <title-info> has a priority. (actually <title-info> is mandatory)
series_index = x.get('number', 1.0) # other are backup solution (sequence is important. other then in fb2-doc)
break xp_ti = '//fb2:title-info/fb2:book-title/text()'
mi = MetaInformation(title, authors) xp_pi = '//fb2:publish-info/fb2:book-title/text()'
mi.comments = comments xp_si = '//fb2:src-title-info/fb2:book-title/text()'
mi.author_sort = author_sort book_title = XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
return book_title
def _parse_cover(root, mi):
    """Find the cover image reference in *root* and load it into *mi*.

    The XPath takes the string value of the ``xlink:href`` attribute on
    ``coverpage/image`` and keeps the part after the ``#``.  That value is
    handed to ``_parse_cover_data`` as the id of the binary to decode.
    Any error raised while loading the cover is swallowed.
    """
    # pickup from <title-info>, if not exists it fallbacks to <src-title-info>
    cover_ref = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
    if not cover_ref:
        return
    try:
        _parse_cover_data(root, cover_ref, mi)
    except:
        pass
def _parse_cover_data(root, imgid, mi):
    """Decode the ``<binary>`` element with id *imgid* into ``mi.cover_data``.

    ``mi.cover_data`` is set to ``(extension, decoded_bytes)`` where the
    extension is the first one guessed for the binary's ``content-type``
    (defaulting to ``image/jpeg``) with its leading dot removed.  Nothing is
    set when no matching binary exists or when the binary has no text.
    """
    matches = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if not matches:
        return
    binary = matches[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    mime_extensions = guess_all_extensions(mimetype)
    if not mime_extensions:
        # NOTE(review): the warning is paired with the mime-extension check
        # because its text is about the mime-type -- confirm against upstream.
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
        return
    pic_data = binary.text
    if pic_data:
        mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
def _parse_tags(root, mi):
# pick up genre but only from 1 section <title-info>; otherwise it is not consistent!
# Those are fallbacks: <src-title-info>
for genre_sec in ['title-info', 'src-title-info']:
# -- i18n Translations-- ?
tags = XPath('//fb2:%s/fb2:genre/text()' % genre_sec)(root)
if tags: if tags:
mi.tags = tags mi.tags = list(map(unicode, tags))
mi.series = series break
mi.series_index = series_index
if cdata: def _parse_series(root, mi):
mi.cover_data = cdata #calibri supports only 1 series: use the 1-st one
return mi # pick up sequence but only from 1 secrion in prefered order
# except <src-title-info>
xp_ti = '//fb2:title-info/fb2:sequence[1]'
xp_pi = '//fb2:publish-info/fb2:sequence[1]'
elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
if elms_sequence:
mi.series = elms_sequence[0].get('name', None)
if mi.series:
mi.series_index = elms_sequence[0].get('number', None)
def _parse_isbn(root, mi):
    """Store the ``publish-info`` ISBN on *mi* when ``check_isbn`` accepts it."""
    raw_isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    # Some people put several ISBNs in this field, which is not allowed;
    # stick to the first one (everything before the first comma).
    first_isbn = raw_isbn.partition(',')[0]
    if check_isbn(first_isbn):
        mi.isbn = first_isbn
def _parse_comments(root, mi):
    """Set ``mi.comments`` to the text of the first annotation found.

    ``<title-info>`` is searched first and ``<src-title-info>`` is the
    fallback; only one section is ever used.
    """
    # pick up annotation but only from 1 section <title-info>; fallback: <src-title-info>
    for section in ('title-info', 'src-title-info'):
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if not annotations:
            continue
        mi.comments = tostring(annotations[0])
        # TODO: tags i18n, xslt?
        return
def _parse_publisher(root, mi):
    """Copy the ``publish-info/publisher`` text onto ``mi.publisher`` if non-empty."""
    publisher_name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if not publisher_name:
        return
    mi.publisher = publisher_name
def _parse_pubdate(root, mi):
    """Set ``mi.pubdate`` from the ``publish-info/year`` element.

    The year is read through XPath ``number()``; ``mi.pubdate`` is assigned
    only when that value is a whole number.
    """
    year_value = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    if not float.is_integer(year_value):
        return
    # only year is available, so use 1-st of Jan
    mi.pubdate = datetime.date(int(year_value), 1, 1)
def _parse_timestamp(root, mi):
    """Set ``mi.timestamp`` from the ``document-info/date`` element.

    Both the ``value`` attribute and the element text are selected; the
    string value of that union is passed to ``parse_date`` when non-empty.
    """
    #<date value="1996-12-03">03.12.1996</date>
    date_nodes = ('//fb2:document-info/fb2:date/@value|'
                  '//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % date_nodes)(root)
    if not docdate:
        return
    mi.timestamp = parse_date(docdate)
def _parse_language(root, mi):
    """Copy the ``title-info/lang`` text to ``mi.language`` and ``mi.languages``."""
    lang_code = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if not lang_code:
        return
    mi.language = lang_code
    mi.languages = [ lang_code ]
def _parse_uuid(root, mi):
    """Set ``mi.uuid`` from the ``document-info/id`` element, if present.

    The whitespace-normalized text of the id element is used; ``mi`` is left
    untouched when that text is empty.
    """
    # The document-info step needs the fb2: prefix like every other query in
    # this module; an unprefixed step only matches elements in no namespace.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def _get_fbroot(stream):
    """Read *stream* fully and return the parsed XML root element.

    The raw data goes through ``xml_to_unicode`` (with
    ``strip_encoding_pats=True``) and is then parsed by an lxml parser
    created with ``recover=True`` and ``no_network=True``.
    """
    text = xml_to_unicode(stream.read(), strip_encoding_pats=True)[0]
    fb2_parser = etree.XMLParser(recover=True, no_network=True)
    return etree.fromstring(text, parser=fb2_parser)

View File

@ -591,8 +591,10 @@ class BooksView(QTableView): # {{{
fmt = prefs['output_format'] fmt = prefs['output_format']
def url_for_id(i): def url_for_id(i):
ans = db.format(i, fmt, index_is_id=True, as_path=True, try:
preserve_filename=True) ans = db.format_path(i, fmt, index_is_id=True)
except:
ans = None
if ans is None: if ans is None:
fmts = db.formats(i, index_is_id=True) fmts = db.formats(i, index_is_id=True)
if fmts: if fmts:
@ -600,13 +602,15 @@ class BooksView(QTableView): # {{{
else: else:
fmts = [] fmts = []
for f in fmts: for f in fmts:
ans = db.format(i, f, index_is_id=True, as_path=True, try:
preserve_filename=True) ans = db.format_path(i, f, index_is_id=True)
except:
ans = None
if ans is None: if ans is None:
ans = db.abspath(i, index_is_id=True) ans = db.abspath(i, index_is_id=True)
return QUrl.fromLocalFile(ans) return QUrl.fromLocalFile(ans)
md.setUrls([url_for_id(i) for i in selected[:25]]) md.setUrls([url_for_id(i) for i in selected])
drag = QDrag(self) drag = QDrag(self)
col = self.selectionModel().currentIndex().column() col = self.selectionModel().currentIndex().column()
md.column_name = self.column_map[col] md.column_name = self.column_map[col]

View File

@ -149,7 +149,8 @@ class TagsView(QTreeView): # {{{
hidden_categories=self.hidden_categories, hidden_categories=self.hidden_categories,
search_restriction=None, search_restriction=None,
drag_drop_finished=self.drag_drop_finished, drag_drop_finished=self.drag_drop_finished,
collapse_model=self.collapse_model) collapse_model=self.collapse_model,
state_map={})
self.pane_is_visible = True # because TagsModel.init did a recount self.pane_is_visible = True # because TagsModel.init did a recount
self.sort_by = sort_by self.sort_by = sort_by
self.tag_match = tag_match self.tag_match = tag_match
@ -173,6 +174,7 @@ class TagsView(QTreeView): # {{{
self.made_connections = True self.made_connections = True
self.refresh_signal_processed = True self.refresh_signal_processed = True
db.add_listener(self.database_changed) db.add_listener(self.database_changed)
self.expanded.connect(self.item_expanded)
def database_changed(self, event, ids): def database_changed(self, event, ids):
if self.refresh_signal_processed: if self.refresh_signal_processed:
@ -541,6 +543,10 @@ class TagsView(QTreeView): # {{{
return self.isExpanded(idx) return self.isExpanded(idx)
def recount(self, *args): def recount(self, *args):
'''
Rebuild the category tree, expand any categories that were expanded,
reset the search states, and reselect the current node.
'''
if self.disable_recounting or not self.pane_is_visible: if self.disable_recounting or not self.pane_is_visible:
return return
self.refresh_signal_processed = True self.refresh_signal_processed = True
@ -548,18 +554,23 @@ class TagsView(QTreeView): # {{{
if not ci.isValid(): if not ci.isValid():
ci = self.indexAt(QPoint(10, 10)) ci = self.indexAt(QPoint(10, 10))
path = self.model().path_for_index(ci) if self.is_visible(ci) else None path = self.model().path_for_index(ci) if self.is_visible(ci) else None
try: expanded_categories, state_map = self.model().get_state()
if not self.model().refresh(): # categories changed! self.set_new_model(state_map=state_map)
self.set_new_model() for category in expanded_categories:
path = None self.expand(self.model().index_for_category(category))
except: #Database connection could be closed if an integrity check is happening
pass
self._model.show_item_at_path(path) self._model.show_item_at_path(path)
# If the number of user categories changed, if custom columns have come or def item_expanded(self, idx):
# gone, or if columns have been hidden or restored, we must rebuild the '''
# model. Reason: it is much easier than reconstructing the browser tree. Called by the expanded signal
def set_new_model(self, filter_categories_by=None): '''
self.setCurrentIndex(idx)
def set_new_model(self, filter_categories_by=None, state_map={}):
'''
There are cases where we need to rebuild the category tree without
attempting to reposition the current node.
'''
try: try:
old = getattr(self, '_model', None) old = getattr(self, '_model', None)
if old is not None: if old is not None:
@ -569,7 +580,8 @@ class TagsView(QTreeView): # {{{
search_restriction=self.search_restriction, search_restriction=self.search_restriction,
drag_drop_finished=self.drag_drop_finished, drag_drop_finished=self.drag_drop_finished,
filter_categories_by=filter_categories_by, filter_categories_by=filter_categories_by,
collapse_model=self.collapse_model) collapse_model=self.collapse_model,
state_map=state_map)
self.setModel(self._model) self.setModel(self._model)
except: except:
# The DB must be gone. Set the model to None and hope that someone # The DB must be gone. Set the model to None and hope that someone
@ -752,7 +764,8 @@ class TagsModel(QAbstractItemModel): # {{{
def __init__(self, db, parent, hidden_categories=None, def __init__(self, db, parent, hidden_categories=None,
search_restriction=None, drag_drop_finished=None, search_restriction=None, drag_drop_finished=None,
filter_categories_by=None, collapse_model='disable'): filter_categories_by=None, collapse_model='disable',
state_map={}):
QAbstractItemModel.__init__(self, parent) QAbstractItemModel.__init__(self, parent)
# must do this here because 'QPixmap: Must construct a QApplication # must do this here because 'QPixmap: Must construct a QApplication
@ -776,10 +789,10 @@ class TagsModel(QAbstractItemModel): # {{{
self.filter_categories_by = filter_categories_by self.filter_categories_by = filter_categories_by
self.collapse_model = collapse_model self.collapse_model = collapse_model
# get_node_tree cannot return None here, because row_map is empty. Note # Note that _get_category_nodes can indirectly change the
# that get_node_tree can indirectly change the user_categories dict. # user_categories dict.
data = self.get_node_tree(config['sort_tags_by']) data = self._get_category_nodes(config['sort_tags_by'])
gst = db.prefs.get('grouped_search_terms', {}) gst = db.prefs.get('grouped_search_terms', {})
self.root_item = TagTreeItem(icon_map=self.icon_state_map) self.root_item = TagTreeItem(icon_map=self.icon_state_map)
self.category_nodes = [] self.category_nodes = []
@ -844,7 +857,7 @@ class TagsModel(QAbstractItemModel): # {{{
category_node_map[key] = node category_node_map[key] = node
last_category_node = node last_category_node = node
self.category_nodes.append(node) self.category_nodes.append(node)
self.refresh(data=data) self._create_node_tree(data, state_map)
def break_cycles(self): def break_cycles(self):
self.root_item.break_cycles() self.root_item.break_cycles()
@ -1121,8 +1134,10 @@ class TagsModel(QAbstractItemModel): # {{{
def set_search_restriction(self, s): def set_search_restriction(self, s):
self.search_restriction = s self.search_restriction = s
def get_node_tree(self, sort): def _get_category_nodes(self, sort):
old_row_map_len = len(self.row_map) '''
Called by __init__. Do not directly call this method.
'''
self.row_map = [] self.row_map = []
self.categories = {} self.categories = {}
@ -1176,20 +1191,28 @@ class TagsModel(QAbstractItemModel): # {{{
if category in data: # The search category can come and go if category in data: # The search category can come and go
self.row_map.append(category) self.row_map.append(category)
self.categories[category] = tb_categories[category]['name'] self.categories[category] = tb_categories[category]['name']
if old_row_map_len != 0 and old_row_map_len != len(self.row_map):
# A category has been added or removed. We must force a rebuild of
# the model
return None
return data return data
def refresh(self, data=None): def refresh(self, data=None):
sort_by = config['sort_tags_by'] '''
if data is None: Here to trap usages of refresh in the old architecture. Can eventually
data = self.get_node_tree(sort_by) # get category data be removed.
if data is None: '''
print 'TagsModel: refresh called!'
traceback.print_stack()
return False return False
def _create_node_tree(self, data, state_map):
'''
Called by __init__. Do not directly call this method.
'''
sort_by = config['sort_tags_by']
if data is None:
print '_create_node_tree: no data!'
traceback.print_stack()
return
collapse = gprefs['tags_browser_collapse_at'] collapse = gprefs['tags_browser_collapse_at']
collapse_model = self.collapse_model collapse_model = self.collapse_model
if collapse == 0: if collapse == 0:
@ -1354,29 +1377,23 @@ class TagsModel(QAbstractItemModel): # {{{
# }}} # }}}
for category in self.category_nodes: for category in self.category_nodes:
if len(category.children) > 0: process_one_node(category, state_map.get(category.py_name, {}))
child_map = category.children
def get_state(self):
state_map = {}
expanded_categories = []
for row, category in enumerate(self.category_nodes):
if self.tags_view.isExpanded(self.index(row, 0, QModelIndex())):
expanded_categories.append(category.py_name)
states = [c.tag.state for c in category.child_tags()] states = [c.tag.state for c in category.child_tags()]
names = [(c.tag.name, c.tag.category) for c in category.child_tags()] names = [(c.tag.name, c.tag.category) for c in category.child_tags()]
state_map = dict(izip(names, states)) state_map[category.py_name] = dict(izip(names, states))
# temporary sub-categories (the partitioning ones) must follow return expanded_categories, state_map
# the permanent sub-categories. This will happen naturally if
# the temp ones are added by process_node
ctags = [c for c in child_map if
c.type == TagTreeItem.CATEGORY and not c.temporary]
start = len(ctags)
self.beginRemoveRows(self.createIndex(category.row(), 0, category),
start, len(child_map)-1)
category.children = ctags
for i in range(start, len(child_map)):
child_map[i].break_cycles()
child_map = None
self.endRemoveRows()
else:
state_map = {}
process_one_node(category, state_map) def index_for_category(self, name):
return True for row, category in enumerate(self.category_nodes):
if category.py_name == name:
return self.index(row, 0, QModelIndex())
def columnCount(self, parent): def columnCount(self, parent):
return 1 return 1
@ -1476,7 +1493,7 @@ class TagsModel(QAbstractItemModel): # {{{
self.tags_view.tag_item_renamed.emit() self.tags_view.tag_item_renamed.emit()
item.tag.name = val item.tag.name = val
self.rename_item_in_all_user_categories(name, key, val) self.rename_item_in_all_user_categories(name, key, val)
self.refresh() # Should work, because no categories can have disappeared self.refresh_required.emit()
self.show_item_at_path(path) self.show_item_at_path(path)
return True return True
@ -1789,19 +1806,22 @@ class TagsModel(QAbstractItemModel): # {{{
return v return v
return None return None
def show_item_at_path(self, path, box=False): def show_item_at_path(self, path, box=False,
position=QTreeView.PositionAtCenter):
''' '''
Scroll the browser and open categories to show the item referenced by Scroll the browser and open categories to show the item referenced by
path. If possible, the item is placed in the center. If box=True, a path. If possible, the item is placed in the center. If box=True, a
box is drawn around the item. box is drawn around the item.
''' '''
if path: if path:
self.show_item_at_index(self.index_for_path(path), box) self.show_item_at_index(self.index_for_path(path), box=box,
position=position)
def show_item_at_index(self, idx, box=False): def show_item_at_index(self, idx, box=False,
position=QTreeView.PositionAtCenter):
if idx.isValid(): if idx.isValid():
self.tags_view.setCurrentIndex(idx) self.tags_view.setCurrentIndex(idx)
self.tags_view.scrollTo(idx, QTreeView.PositionAtCenter) self.tags_view.scrollTo(idx, position)
if box: if box:
tag_item = idx.internalPointer() tag_item = idx.internalPointer()
tag_item.boxed = True tag_item.boxed = True

View File

@ -1144,6 +1144,20 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
break break
return sha.hexdigest() return sha.hexdigest()
def format_path(self, index, fmt, index_is_id=False):
    '''
    Return the path that format_abspath reports for format `fmt` of the
    given book.

    This method is intended to be used only in those rare situations, like
    Drag'n Drop, when you absolutely need the path to the original file.
    Otherwise, use format(..., as_path=True).

    Raises NoSuchFormat when format_abspath returns None.

    NOTE(review): the earlier docstring said a networked backend will
    always return None; as written, a None path raises NoSuchFormat
    instead -- confirm which is intended.
    '''
    path = self.format_abspath(index, fmt, index_is_id=index_is_id)
    if path is not None:
        return path
    id_ = index if index_is_id else self.id(index)
    raise NoSuchFormat('Record %d has no format: %s'%(id_, fmt))
def format_abspath(self, index, format, index_is_id=False): def format_abspath(self, index, format, index_is_id=False):
''' '''
Return absolute path to the ebook file of format `format` Return absolute path to the ebook file of format `format`

View File

@ -633,6 +633,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec
:guilabel:`Formatting Style: None` :guilabel:`Formatting Style: None`
Applies no special formatting to the text, the document is converted to html with no other changes. Applies no special formatting to the text, the document is converted to html with no other changes.
.. _pdfconversion:
Convert PDF documents Convert PDF documents
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -35,29 +35,11 @@ What are the best source formats to convert?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order of decreasing preference: LIT, MOBI, EPUB, FB2, HTML, PRC, RTF, PDB, TXT, PDF In order of decreasing preference: LIT, MOBI, EPUB, FB2, HTML, PRC, RTF, PDB, TXT, PDF
Why does the PDF conversion lose some images/tables? I converted a PDF file, but the result has various problems?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The PDF conversion tries to extract the text and images from the PDF file and convert them to and HTML based ebook. Some PDF files have images in a format that cannot be extracted (vector images). All tables
are also represented as vector diagrams, thus they cannot be extracted.
How do I convert a collection of HTML files in a specific order? PDF is a terrible format to convert from. For a list of the various issues you will encounter when converting PDF, see: :ref:`pdfconversion`.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order to convert a collection of HTML files in a specific oder, you have to create a table of contents file. That is, another HTML file that contains links to all the other files in the desired order. Such a file looks like::
<html>
<body>
<h1>Table of Contents</h1>
<p style="text-indent:0pt">
<a href="file1.html">First File</a><br/>
<a href="file2.html">Second File</a><br/>
.
.
.
</p>
</body>
</html>
Then just add this HTML file to the GUI and use the convert button to create your ebook.
.. _char-encoding-faq: .. _char-encoding-faq:
@ -85,6 +67,26 @@ If you have a hand edited TOC in the input document, you can use the TOC detecti
Finally, I encourage you to ditch the content TOC and only have a metadata TOC in your ebooks. Metadata TOCs will give the people reading your ebooks a much superior navigation experience (except on the Kindle, where they are essentially the same as a content TOC). Finally, I encourage you to ditch the content TOC and only have a metadata TOC in your ebooks. Metadata TOCs will give the people reading your ebooks a much superior navigation experience (except on the Kindle, where they are essentially the same as a content TOC).
How do I convert a collection of HTML files in a specific order?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In order to convert a collection of HTML files in a specific order, you have to create a table of contents file. That is, another HTML file that contains links to all the other files in the desired order. Such a file looks like::
<html>
<body>
<h1>Table of Contents</h1>
<p style="text-indent:0pt">
<a href="file1.html">First File</a><br/>
<a href="file2.html">Second File</a><br/>
.
.
.
</p>
</body>
</html>
Then just add this HTML file to the GUI and use the convert button to create your ebook.
How do I use some of the advanced features of the conversion tools? How do I use some of the advanced features of the conversion tools?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can get help on any individual feature of the converters by mousing over it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal. A good place to start is to look at the following demo files that demonstrate some of the advanced features: You can get help on any individual feature of the converters by mousing over it in the GUI or running ``ebook-convert dummy.html .epub -h`` at a terminal. A good place to start is to look at the following demo files that demonstrate some of the advanced features:

File diff suppressed because it is too large Load Diff

View File

@ -208,6 +208,8 @@ def gc_histogram():
def diff_hists(h1, h2): def diff_hists(h1, h2):
"""Prints differences between two results of gc_histogram().""" """Prints differences between two results of gc_histogram()."""
for k in h1: for k in h1:
if k not in h2:
h2[k] = 0
if h1[k] != h2[k]: if h1[k] != h2[k]:
print "%s: %d -> %d (%s%d)" % ( print "%s: %d -> %d (%s%d)" % (
k, h1[k], h2[k], h2[k] > h1[k] and "+" or "", h2[k] - h1[k]) k, h1[k], h2[k], h2[k] > h1[k] and "+" or "", h2[k] - h1[k])