diff --git a/resources/recipes/freenature.recipe b/resources/recipes/freenature.recipe index cf06e7163d..0b287842ec 100644 --- a/resources/recipes/freenature.recipe +++ b/resources/recipes/freenature.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag import re class NatureNews(BasicNewsRecipe): @@ -10,17 +11,76 @@ class NatureNews(BasicNewsRecipe): max_articles_per_feed = 50 no_stylesheets = True - remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'}) - remove_tags_after = dict(name='h2', attrs={'id':'comments'}) + keep_only_tags = [dict(name='div', attrs={'id':'content'})] +# remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'}) +# remove_tags_after = dict(name='h2', attrs={'id':'comments'}) remove_tags = [ dict(name='h2', attrs={'id':'comments'}), dict(attrs={'alt':'Advertisement'}), dict(name='div', attrs={'class':'ad'}), - ] + dict(attrs={'class':'Z3988'}), + dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}), + dict(name='a', attrs={'href':'#comments'}), + dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'}) + ] preprocess_regexps = [ (re.compile(r'
ADVERTISEMENT
', re.DOTALL|re.IGNORECASE), lambda match: ''), ] + extra_css = ''' + .author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; } + .imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .imagecredit { font-size: x-small; font-style: normal; font-weight: bold} + ''' + feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')] + def preprocess_html(self,soup): + # The author name is slightly buried - dig it up + author = soup.find('p', {'class':'byline'}) + if author: + # Find out the author's name + authornamediv = author.find('span',{'class':'author fn'}) + authornamelink = authornamediv.find('a') + if authornamelink: + authorname = authornamelink.contents[0] + else: + authorname = authornamediv.contents[0] + # Stick the author's name in the byline tag + tag = Tag(soup,'div') + tag['class'] = 'author' + tag.insert(0,authorname.strip()) + author.replaceWith(tag) + + # Change the intro from a p to a div + intro = soup.find('p',{'class':'intro'}) + if intro: + tag = Tag(soup,'div') + tag['class'] = 'intro' + tag.insert(0,intro.contents[0]) + intro.replaceWith(tag) + + # Change span class=imagedescription to div + descr = soup.find('span',{'class':'imagedescription'}) + if descr: + tag = Tag(soup,'div') + tag['class'] = 'imagedescription' + tag.insert(0,descr.renderContents()) + descr.replaceWith(tag) + + # The references are in a list, let's make them simpler + reflistcont = soup.find('ul',{'id':'article-refrences'}) + if reflistcont: + reflist = reflistcont.li.renderContents() + tag = Tag(soup,'div') + tag['class'] = 'article-references' + tag.insert(0,reflist) + reflistcont.replaceWith(tag) + + # Within the id=content div, we need to remove all the stuff after the end of the class=entry-content + entrycontent = soup.find('div',{'class':'entry-content'}) + for nextSibling in 
entrycontent.findNextSiblings(): + nextSibling.extract() + + return soup diff --git a/resources/recipes/nikkei_sub_economy.recipe b/resources/recipes/nikkei_sub_economy.recipe index 2dd8f1add8..8e7a68dfe7 100644 --- a/resources/recipes/nikkei_sub_economy.recipe +++ b/resources/recipes/nikkei_sub_economy.recipe @@ -27,6 +27,9 @@ class NikkeiNet_sub_economy(BasicNewsRecipe): {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, {'class':"cmn-article_keyword cmn-clearfix"}, {'class':"cmn-print_headline cmn-clearfix"}, + {'class':"cmn-article_list"}, + dict(id="ABOUT-NIKKEI"), + {'class':"cmn-sub_market"}, ] remove_tags_after = {'class':"cmn-pr_list"} diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py index b019873d39..3d3ec69833 100644 --- a/src/calibre/ebooks/fb2/input.py +++ b/src/calibre/ebooks/fb2/input.py @@ -104,13 +104,17 @@ class FB2Input(InputFormatPlugin): entries = [(f, guess_type(f)[0]) for f in os.listdir('.')] opf.create_manifest(entries) opf.create_spine(['index.xhtml']) - - for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): - href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) - if href is not None: - if href.startswith('#'): - href = href[1:] - opf.guide.set_cover(os.path.abspath(href)) + if mi.cover_data and mi.cover_data[1]: + with open('fb2_cover_calibre_mi.jpg', 'wb') as f: + f.write(mi.cover_data[1]) + opf.guide.set_cover(os.path.abspath('fb2_cover_calibre_mi.jpg')) + else: + for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): + href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) + if href is not None: + if href.startswith('#'): + href = href[1:] + opf.guide.set_cover(os.path.abspath(href)) opf.render(open('metadata.opf', 'wb')) return os.path.join(os.getcwd(), 'metadata.opf') diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index e07418f41c..2f397006a1 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ 
b/src/calibre/ebooks/mobi/reader.py @@ -542,7 +542,17 @@ class MobiReader(object): elif tag.tag == 'img': tag.set('height', height) else: - styles.append('margin-top: %s' % self.ensure_unit(height)) + if tag.tag == 'div' and not tag.text and \ + (not tag.tail or not tag.tail.strip()) and \ + not len(list(tag.iterdescendants())): + # Paragraph spacer + # Insert nbsp so that the element is never + # discarded by a renderer + tag.text = u'\u00a0' # nbsp + styles.append('height: %s' % + self.ensure_unit(height)) + else: + styles.append('margin-top: %s' % self.ensure_unit(height)) if attrib.has_key('width'): width = attrib.pop('width').strip() if width and re.search(r'\d+', width): diff --git a/src/calibre/ebooks/oeb/iterator.py b/src/calibre/ebooks/oeb/iterator.py index 6820709b3e..08b4369078 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -227,7 +227,7 @@ class EbookIterator(object): self.log.warn('Missing spine item:', repr(spath)) cover = self.opf.cover - if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover: + if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf', 'fb2') and cover: cfile = os.path.join(self.base, 'calibre_iterator_cover.html') chtml = (TITLEPAGE%os.path.relpath(cover, self.base).replace(os.sep, '/')).encode('utf-8') diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 10e5871d31..a0814ee0dd 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -34,18 +34,15 @@ class PML_HTMLizer(object): 'ra', 'c', 'r', - 't', 's', 'l', 'k', - 'T', 'FN', 'SB', ] STATES_VALUE_REQ = [ 'a', - 'T', 'FN', 'SB', ] @@ -96,8 +93,6 @@ class PML_HTMLizer(object): 'Sb': 'sb', 'c': 'c', 'r': 'r', - 't': 't', - 'T': 'T', 'i': 'i', 'I': 'i', 'u': 'u', @@ -133,8 +128,6 @@ class PML_HTMLizer(object): DIV_STATES = [ 'c', 'r', - 't', - 'T', 'FN', 'SB', ] @@ -255,8 +248,6 @@ class PML_HTMLizer(object): for key, val in self.state.items(): if 
val[0]: - if key == 'T': - self.state['T'][0] = False if key in self.DIV_STATES: div.append(key) elif key in self.SPAN_STATES: @@ -506,6 +497,9 @@ class PML_HTMLizer(object): self.toc = TOC() self.file_name = file_name + indent_state = {'t': False, 'T': False} + adv_indent_val = '' + for s in self.STATES: self.state[s] = [False, '']; @@ -515,6 +509,8 @@ class PML_HTMLizer(object): parsed = [] empty = True + basic_indent = indent_state['t'] + adv_indent = indent_state['T'] # Must use StringIO, cStringIO does not support unicode line = StringIO.StringIO(line) @@ -527,7 +523,7 @@ class PML_HTMLizer(object): if c == '\\': c = line.read(1) - if c in 'qcrtTiIuobBlk': + if c in 'qcriIuobBlk': text = self.process_code(c, line) elif c in 'FS': l = line.read(1) @@ -574,6 +570,15 @@ class PML_HTMLizer(object): elif c == 'w': empty = False text = ''+
_('%s has been updated to version %s. '
'See the new features. Visit the download pa'
- 'ge?')%(__appname__, version))
+ '">new features.')%(__appname__, version))
self.label.setOpenExternalLinks(True)
self.label.setWordWrap(True)
self.setWindowTitle(_('Update available!'))
@@ -94,13 +93,13 @@ class UpdateMixin(object):
type=Qt.QueuedConnection)
self.update_checker.start()
- def update_found(self, version):
+ def update_found(self, version, force=False):
+ # Always announce `version` and its download URL on the status bar.
+ # The UpdateNotification dialog is shown when `force` is true, or when
+ # the 'new_version_notification' config option is set and the dynamic
+ # 'update to version <version>' flag (default True) is still true.
os = 'windows' if iswindows else 'osx' if isosx else 'linux'
url = 'http://calibre-ebook.com/download_%s'%os
self.status_bar.new_version_available(version, url)
- if config.get('new_version_notification') and \
- dynamic.get('update to version %s'%version, True):
+ if force or (config.get('new_version_notification') and \
+ dynamic.get('update to version %s'%version, True)):
self._update_notification__ = UpdateNotification(version,
parent=self)
self._update_notification__.show()
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index f2ff783a76..0bb5ee7634 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -426,46 +426,47 @@ class EnLineEdit(LineEditECM, QLineEdit):
pass
-class TagsCompleter(QCompleter):
+class ItemsCompleter(QCompleter):
'''
A completer object that completes a list of tags. It is used in conjunction
with a CompleterLineEdit.
'''
+ # NOTE(review): the docstring above predates the rename; the class now
+ # completes arbitrary items and the companion widget in this patch is
+ # named CompleteLineEdit.
- def __init__(self, parent, all_tags):
- QCompleter.__init__(self, all_tags, parent)
- self.all_tags = set(all_tags)
+ def __init__(self, parent, all_items):
+ # Keep the full candidate list as a set so update() can subtract the
+ # items that are already present in the edited text.
+ QCompleter.__init__(self, all_items, parent)
+ self.all_items = set(all_items)
- def update(self, text_tags, completion_prefix):
- tags = list(self.all_tags.difference(text_tags))
- model = QStringListModel(tags, self)
+ def update(self, text_items, completion_prefix):
+ # Offer only the candidates not listed in text_items, set the
+ # completion prefix, and open the completer unless the prefix is
+ # blank.
+ items = list(self.all_items.difference(text_items))
+ model = QStringListModel(items, self)
self.setModel(model)
self.setCompletionPrefix(completion_prefix)
if completion_prefix.strip() != '':
self.complete()
- def update_tags_cache(self, tags):
- self.all_tags = set(tags)
- model = QStringListModel(tags, self)
+ def update_items_cache(self, items):
+ # Replace the candidate set and rebuild the model from the new items.
+ self.all_items = set(items)
+ model = QStringListModel(items, self)
self.setModel(model)
-class TagsLineEdit(EnLineEdit):
+class CompleteLineEdit(EnLineEdit):
'''
A QLineEdit that can complete parts of text separated by separator.
'''
- def __init__(self, parent=0, tags=[]):
+ def __init__(self, parent=0, complete_items=[], sep=',', space_before_sep=False):
EnLineEdit.__init__(self, parent)
- self.separator = ','
+ self.separator = sep
+ self.space_before_sep = space_before_sep
self.connect(self, SIGNAL('textChanged(QString)'), self.text_changed)
- self.completer = TagsCompleter(self, tags)
+ self.completer = ItemsCompleter(self, complete_items)
self.completer.setCaseSensitivity(Qt.CaseInsensitive)
self.connect(self,
@@ -476,32 +477,43 @@ class TagsLineEdit(EnLineEdit):
self.completer.setWidget(self)
- def update_tags_cache(self, tags):
- self.completer.update_tags_cache(tags)
+ def update_items_cache(self, complete_items):
+ # Forward the new candidate list to the attached completer.
+ self.completer.update_items_cache(complete_items)
+
+ def set_separator(self, sep):
+ # `sep` is the string used to split the text into items in
+ # text_changed() and complete_text().
+ self.separator = sep
+
+ def set_space_before_sep(self, space_before):
+ # When true, complete_text() inserts a space between the completed
+ # item and the separator.
+ self.space_before_sep = space_before
def text_changed(self, text):
+ # Split the whole text on the separator and emit the distinct,
+ # non-empty items together with the prefix being typed: the last
+ # separator-delimited piece of the text left of the cursor, stripped.
all_text = unicode(text)
text = all_text[:self.cursorPosition()]
- prefix = text.split(',')[-1].strip()
+ prefix = text.split(self.separator)[-1].strip()
- text_tags = []
+ text_items = []
for t in all_text.split(self.separator):
t1 = unicode(t).strip()
if t1 != '':
- text_tags.append(t)
- text_tags = list(set(text_tags))
+ # Collect the stripped form: an item typed after a separator
+ # carries leading whitespace and would otherwise neither
+ # de-duplicate in the set() below nor equal a candidate name
+ # (presumably the emitted list is subtracted from the
+ # completer's candidates - the connection is outside this
+ # hunk).
+ text_items.append(t1)
+ text_items = list(set(text_items))
self.emit(SIGNAL('text_changed(PyQt_PyObject, PyQt_PyObject)'),
- text_tags, prefix)
+ text_items, prefix)
def complete_text(self, text):
+ # Replace the partially typed item left of the cursor with `text`,
+ # append the separator (optionally preceded by a space) and a trailing
+ # space, then place the cursor right after the inserted text.
cursor_pos = self.cursorPosition()
before_text = unicode(self.text())[:cursor_pos]
after_text = unicode(self.text())[cursor_pos:]
- prefix_len = len(before_text.split(',')[-1].strip())
- self.setText('%s%s%s %s' % (before_text[:cursor_pos - prefix_len],
- text, self.separator, after_text))
- self.setCursorPosition(cursor_pos - prefix_len + len(text) + 2)
+ prefix_len = len(before_text.split(self.separator)[-1].strip())
+ # len_extra counts what follows the completed item: the optional
+ # space, the separator itself (which may be longer than one
+ # character now that it is configurable) and the trailing space.
+ if self.space_before_sep:
+ complete_text_pat = '%s%s %s %s'
+ len_extra = len(self.separator) + 2
+ else:
+ complete_text_pat = '%s%s%s %s'
+ len_extra = len(self.separator) + 1
+ self.setText(complete_text_pat % (before_text[:cursor_pos - prefix_len], text, self.separator, after_text))
+ self.setCursorPosition(cursor_pos - prefix_len + len(text) + len_extra)
class EnComboBox(QComboBox):
@@ -528,6 +540,22 @@ class EnComboBox(QComboBox):
idx = 0
self.setCurrentIndex(idx)
+class CompleteComboBox(EnComboBox):
+ '''
+ An EnComboBox whose line edit is a CompleteLineEdit. The methods below
+ forward to that line edit.
+ '''
+
+ def __init__(self, *args):
+ EnComboBox.__init__(self, *args)
+ # Install the completing line edit as this combo box's editor.
+ self.setLineEdit(CompleteLineEdit(self))
+
+ def update_items_cache(self, complete_items):
+ self.lineEdit().update_items_cache(complete_items)
+
+ def set_separator(self, sep):
+ self.lineEdit().set_separator(sep)
+
+ def set_space_before_sep(self, space_before):
+ self.lineEdit().set_space_before_sep(space_before)
+
+
class HistoryLineEdit(QComboBox):
lost_focus = pyqtSignal()
diff --git a/src/calibre/library/database.py b/src/calibre/library/database.py
index 6016dbd03e..2138b2f1eb 100644
--- a/src/calibre/library/database.py
+++ b/src/calibre/library/database.py
@@ -1060,6 +1060,10 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
return [ (i[0], i[1]) for i in \
self.conn.get('SELECT id, name FROM authors')]
+ def all_author_names(self):
+ '''
+ Return the author names with surrounding whitespace stripped and
+ every '|' replaced by ','; names that end up empty are skipped.
+ '''
+ rows = self.conn.get('SELECT name FROM authors')
+ names = [row[0].strip().replace('|', ',') for row in rows]
+ return [name for name in names if name]
+
def all_publishers(self):
return [ (i[0], i[1]) for i in \
self.conn.get('SELECT id, name FROM publishers')]
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 4b2b169d72..71639ca749 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -547,6 +547,7 @@ Some limitations of PDF input are:
* Extraction of vector images and tables from within the document is also not supported.
* Some PDFs use special glyphs to represent ll or ff or fi, etc. Conversion of these may or may not work depending on just how they are represented internally in the PDF.
* Some PDFs store their images upside down with a rotation instruction, |app| currently doesn't support that instruction, so the images will be rotated in the output as well.
+ * Links and Tables of Contents are not supported.
To re-iterate **PDF is a really, really bad** format to use as input. If you absolutely must use PDF, then be prepared for an
output ranging anywhere from decent to unusable, depending on the input PDF.
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 0e8c101620..ee72bf6fdb 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -450,6 +450,11 @@ How do I use purchased EPUB books with |app|?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Most purchased EPUB books have `DRM