From ac8ccceef86423fbacb500fb6f5da842cf785573 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 21 Feb 2010 10:06:40 -0800 Subject: [PATCH 01/11] remove br from top of page in chm conversion --- src/calibre/ebooks/chm/input.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index a2976c944a..784848929d 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype from htmlentitydefs import name2codepoint from pprint import PrettyPrinter -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString from lxml import html, etree from pychm.chm import CHMFile from pychm.chmlib import ( @@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered): return True return False +def check_all_prev_empty(tag): + if tag is None: + return True + if tag.__class__ == NavigableString and not check_empty(tag): + return False + return check_all_prev_empty(tag.previousSibling) + +def check_empty(s, rex = re.compile(r'\S')): + return rex.search(s) is None + + def option_parser(): parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') @@ -160,6 +171,12 @@ class CHMReader(CHMFile): t[-1].extract() # for some very odd reason each page's content appears to be in a table # too. and this table has sub-tables for random asides... grr. + + # remove br at top of page if present after nav bars removed + br = html('br') + if br: + if check_all_prev_empty(br[0].previousSibling): + br[0].extract() # some images seem to be broken in some chm's :/ for img in html('img'): From 91a2881a0c3ede8982c451d2e9a198c371bef79e Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 21 Feb 2010 11:01:12 -0800 Subject: [PATCH 02/11] strip br from top of page in chm conversion --- src/calibre/ebooks/chm/input.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py index ecb54dffdb..3b08854532 100644 --- a/src/calibre/ebooks/chm/input.py +++ b/src/calibre/ebooks/chm/input.py @@ -4,11 +4,11 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ,' \ ' and Alex Bramley .' -import os, shutil, uuid +import os, shutil, uuid, re from tempfile import mkdtemp from mimetypes import guess_type as guess_mimetype -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString from lxml import html from pychm.chm import CHMFile from pychm.chmlib import ( @@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered): return True return False +def check_all_prev_empty(tag): + if tag is None: + return True + if tag.__class__ == NavigableString and not check_empty(tag): + return False + return check_all_prev_empty(tag.previousSibling) + +def check_empty(s, rex = re.compile(r'\S')): + return rex.search(s) is None + + def option_parser(): parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') @@ -155,6 +166,12 @@ class CHMReader(CHMFile): # for some very odd reason each page's content appears to be in a table # too. and this table has sub-tables for random asides... grr. + # remove br at top of page if present after nav bars removed + br = soup('br') + if br: + if check_all_prev_empty(br[0].previousSibling): + br[0].extract() + # some images seem to be broken in some chm's :/ for img in soup('img'): try: From 9f01f0b1264a9313699ff18606dd3550f40f304a Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sat, 6 Mar 2010 11:55:20 -0800 Subject: [PATCH 03/11] catch UnicodeDecodeError exception --- src/calibre/ebooks/chm/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 7386d54658..0ce1f0b07f 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -15,7 +15,10 @@ from calibre.utils.logging import default_log from calibre.ptempfile import TemporaryFile def _clean(s): - return s.replace(u'\u00a0', u' ') + try: + return s.replace(u'\u00a0', u' ') + except UnicodeDecodeError: + return u"" def _detag(tag): str = u"" From 043223eac6a682ba6559a7385c5d5a1ac8061e8d Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 7 Mar 2010 22:03:08 -0800 Subject: [PATCH 04/11] renderContents as unicode --- src/calibre/ebooks/chm/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 0ce1f0b07f..2f0c246d10 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -37,7 +37,7 @@ def _metadata_from_table(soup, searchfor): td = td.parent # there appears to be multiple ways of structuring the metadata # on the home page. cue some nasty special-case hacks... - if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I): + if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(None), flags=re.I): meta = _detag(td.findNextSibling('td')) return re.sub('^:', '', meta).strip() else: @@ -49,7 +49,7 @@ def _metadata_from_span(soup, searchfor): if span is None: return None # this metadata might need some cleaning up still :/ - return _detag(span.renderContents().strip()) + return _detag(span.renderContents(None).strip()) def _get_authors(soup): aut = (_metadata_from_span(soup, r'author') From 43d6a53d7b0de7acc70d7ce67e1eb7b62add8596 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 7 Mar 2010 22:21:55 -0800 Subject: [PATCH 05/11] renderContents as unicode --- src/calibre/ebooks/chm/metadata.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index 2f0c246d10..d6a1d24024 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -15,10 +15,7 @@ from calibre.utils.logging import default_log from calibre.ptempfile import TemporaryFile def _clean(s): - try: - return s.replace(u'\u00a0', u' ') - except UnicodeDecodeError: - return u"" + return s.replace(u'\u00a0', u' ') def _detag(tag): str = u"" From 450e9ef176cb91b553a1aae11e73f04a948501a1 Mon Sep 17 00:00:00 2001 From: James Ralston <> Date: Sun, 7 Mar 2010 23:18:26 -0800 Subject: [PATCH 06/11] detag handle None --- src/calibre/ebooks/chm/metadata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py index d6a1d24024..28e307df95 100644 --- a/src/calibre/ebooks/chm/metadata.py +++ b/src/calibre/ebooks/chm/metadata.py @@ -19,6 +19,8 @@ def _clean(s): def _detag(tag): str = u"" + if tag is None: + return str for elem in tag: if hasattr(elem, "contents"): str += _detag(elem) From a38243345c80bcad7c3605f88ad263a0a238c2e5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Mar 2010 09:41:00 -0700 Subject: [PATCH 07/11] Iriver story driver: Put uploads into the Books directory --- src/calibre/devices/iriver/driver.py | 1 + src/calibre/translations/ru.po | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/iriver/driver.py b/src/calibre/devices/iriver/driver.py index 7373996213..ca570eed7a 100644 --- a/src/calibre/devices/iriver/driver.py +++ b/src/calibre/devices/iriver/driver.py @@ -32,6 +32,7 @@ class IRIVER_STORY(USBMS): MAIN_MEMORY_VOLUME_LABEL = 'Story Main Memory' STORAGE_CARD_VOLUME_LABEL = 'Story Storage Card' + EBOOK_DIR_MAIN = 'Book' SUPPORTS_SUB_DIRS = True diff --git a/src/calibre/translations/ru.po b/src/calibre/translations/ru.po index 22808b177f..723ab7fd41 100644 --- a/src/calibre/translations/ru.po +++ b/src/calibre/translations/ru.po @@ -6492,7 +6492,7 @@ msgstr "" "
    \n" "
  1. Отключите устройство. Дождитесь завершения создания " "базы данных (т.е. дождитесь ее готовности к использованию). Подключите " -"устройство. Теперь должно все заработать в приложении %(apps)s. Если этого " +"устройство. Теперь должно все заработать в приложении %(app)s. Если этого " "не произошло, то переходите к следующему пункту.
  2. \n" "
  3. Закройте %(app)s приложение. Найдите файл media.xml в " "основной памяти устройства. Удалите его. Отключите устройство. Подождите " From d4e611508a9d4fb2360e952c10e419819454319c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Mar 2010 10:03:20 -0700 Subject: [PATCH 08/11] Elsevier.nl by Darko Miletic --- resources/images/news/elsevier.png | Bin 0 -> 568 bytes resources/recipes/elsevier.recipe | 65 +++++++++++++++++++++++++++++ src/calibre/manual/faq.rst | 6 +++ 3 files changed, 71 insertions(+) create mode 100644 resources/images/news/elsevier.png create mode 100644 resources/recipes/elsevier.recipe diff --git a/resources/images/news/elsevier.png b/resources/images/news/elsevier.png new file mode 100644 index 0000000000000000000000000000000000000000..373b39cd7c2b3d75f5250fe83f786086c047995a GIT binary patch literal 568 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87#I^hT^vI!PA{EwR@)^}j;_mT~d9q+j_ zarwQome2owEt#h*l9^oT8g0XG^5fnHrI^k{1535|(-V^%c0a#3Gm0s`@z5HlP*wHI zkB>fH+Tf$d{xUp(@zQ(-eeZMphZq+qFe%+p6S}iQPwj)??Ps@2PcbqyI`y7T+|zik zv+$w#%uADM9JJu{7|X zmA&E2oN86UZhPHKv5$Gy(X8IZ+cen~eoAl9T^A<4qvl)yRK;r>Op^B)Q#xEi!HBC=nH1Y5)(jg8?b zS16mZJ3^$U;a30oB$3`*4;zwCKdMpdTy!*AG21Kr8{4&a1)(fOlYaq2UbVzEq9i4; zB-JXpC>2OC7#SEE=o%R78d!uF8dw<^SQ(q?8khqa6Ce30p=ij>PsvQHMAKkoX=P{u V(GYe=Vk=MsgQu&X%Q~loCIBh--C+O# literal 0 HcmV?d00001 diff --git a/resources/recipes/elsevier.recipe b/resources/recipes/elsevier.recipe new file mode 100644 index 0000000000..389ce3f74d --- /dev/null +++ b/resources/recipes/elsevier.recipe @@ -0,0 +1,65 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +elsevier.nl +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Pagina12(BasicNewsRecipe): + title = 'Elsevier.nl' + __author__ = 'Darko Miletic' + description = 'News from Denmark' + publisher = 'elsevier.nl' + category = 'news, politics, Denmark' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'nl' + country = 'NL' + remove_empty_feeds = True + masthead_url = 'http://www.elsevier.nl/static/elsevier/stdimg/logo.gif' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = dict(attrs={'id':'artikel_container'}) + remove_tags_before = dict(attrs={'id':'breadcrumb_container'}) + remove_tags_after = dict(attrs={'class':'author_link'}) + remove_tags = [ + dict(attrs={'id':'breadcrumb_container'}) + ,dict(name='div',attrs={'class':'pullout_vak'}) + ] + remove_attributes = ['width','height'] + + feeds = [ + (u'Laatste nieuws' , u'http://www.elsevier.nl/web/RSS/Homepage-RSS.htm?output=xml' ) + ,(u'Nederland' , u'http://www.elsevier.nl/web/RSS/Nederland-RSS.htm?output=xml' ) + ,(u'Politiek' , u'http://www.elsevier.nl/web/RSS/Politiek-RSS.htm?output=xml' ) + ,(u'Europese Unie' , u'http://www.elsevier.nl/web/RSS/Europese-Unie-RSS.htm?output=xml' ) + ,(u'Buitenland' , u'http://www.elsevier.nl/web/RSS/Buitenland-RSS.htm?output=xml' ) + ,(u'Economie' , u'http://www.elsevier.nl/web/RSS/Economie-RSS.htm?output=xml' ) + ,(u'Wetenschap' , u'http://www.elsevier.nl/web/RSS/Wetenschap-RSS.htm?output=xml' ) + ,(u'Cultuur & Televisie' , u'http://www.elsevier.nl/web/RSS/Cultuur-Televisie-RSS.htm?output=xml') + ,(u'Society' , u'http://www.elsevier.nl/web/RSS/Society-RSS.htm?output=xml' ) + ,(u'Internet&/Gadgets' , u'http://www.elsevier.nl/web/RSS/Internet-Gadgets-RSS.htm?output=xml' ) + ,(u'Comentaren' , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml' ) + ] + + def print_version(self, url): + return url + '?print=true' + + def get_article_url(self, article): + return article.get('guid', None).rpartition('?')[0] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index ba186a0c10..c38f4f1d23 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -313,3 +313,9 @@ I want some feature added to |app|. What can I do? You have two choices: 1. Create a patch by hacking on |app| and send it to me for review and inclusion. See `Development `_. 2. `Open a ticket `_ (you have to register and login first) and hopefully I will find the time to implement your feature. + +Can I include |app| on a CD to be distributed with my product/magazine? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `here `_. + + From 81056a21cbf5cef9d0b82f41b1566486aae37c1c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Mar 2010 17:32:10 -0700 Subject: [PATCH 09/11] Taz DigiABO by Lars Jacob --- resources/recipes/taz.recipe | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 resources/recipes/taz.recipe diff --git a/resources/recipes/taz.recipe b/resources/recipes/taz.recipe new file mode 100644 index 0000000000..530fa7d6b7 --- /dev/null +++ b/resources/recipes/taz.recipe @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__license__ = 'GPL v3' +__copyright__ = '2010, Lars Jacob jacob.lars at gmail.com' +__docformat__ = 'restructuredtext de' + +''' +www.taz.de/digiabo +''' +import os, urllib2, zipfile, tempfile +from calibre.web.feeds.news import BasicNewsRecipe + +class TazDigiabo(BasicNewsRecipe): + + title = u'Taz Digiabo' + description = u'Das EPUB DigiAbo der Taz' + language = 'de' + lang = 'de-DE' + + __author__ = 'Lars Jacob' + needs_subscription = True + + conversion_options = { + 'no_default_epub_cover' : True + } + + def build_index(self): + if self.username is not None and self.password is not None: + domain = "http://www.taz.de" + + url = domain + "/epub/" + + auth_handler = urllib2.HTTPBasicAuthHandler() + auth_handler.add_password(realm='TAZ-ABO', + uri=url, + user=self.username, + passwd=self.password) + opener = urllib2.build_opener(auth_handler) + urllib2.install_opener(opener) + + try: + f = urllib2.urlopen(url) + except urllib2.HTTPError: + self.report_progress(0,_('Can\'t login to download issue')) + return + + tmp = tempfile.TemporaryFile() + self.report_progress(0,_('downloading epub')) + tmp.write(f.read()) + + zfile = zipfile.ZipFile(tmp, 'r') + self.report_progress(0,_('extracting epub')) + + zfile.extractall(self.output_dir) + + tmp.close() + index = os.path.join(self.output_dir, 'content.opf') + + self.report_progress(1,_('epub downloaded and extracted')) + + return index From 6085c0886de6140689d2a48402f60f4c389ba68d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Mar 2010 12:47:43 -0700 Subject: [PATCH 10/11] Fix #5092 (Errors in Catalog list) --- src/calibre/library/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 5350d4ba04..e5c48f3911 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -102,7 +102,7 @@ class CSV_XML(CatalogPlugin): if field == 'formats': fmt_list = [] for format in item: - fmt_list.append(format.partition('.')[2]) + fmt_list.append(format.rpartition('.')[2].lower()) item = ', '.join(fmt_list) elif field in ['authors','tags']: item = ', '.join(item) From 4e943c02245d6cfffe0f07431b15f1497e99b9eb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Mar 2010 12:48:27 -0700 Subject: [PATCH 11/11] ... --- src/calibre/devices/hanlin/driver.py | 2 +- src/calibre/ebooks/metadata/rar.py | 2 +- src/calibre/ebooks/metadata/zip.py | 2 +- src/calibre/utils/smtp.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/devices/hanlin/driver.py b/src/calibre/devices/hanlin/driver.py index d88a4d4baa..a69907937c 100644 --- a/src/calibre/devices/hanlin/driver.py +++ b/src/calibre/devices/hanlin/driver.py @@ -115,7 +115,7 @@ class BOOX(HANLINV3): supported_platforms = ['windows', 'osx', 'linux'] # Ordered list of supported formats - FORMATS = ['epub', 'fb2', 'pdf', 'html', 'txt', 'rtf', 'mobi', 'prc', 'chm'] + FORMATS = ['epub', 'fb2', 'djvu', 'pdf', 'html', 'txt', 'rtf', 'mobi', 'prc', 'chm'] VENDOR_ID = [0x0525] PRODUCT_ID = [0xa4a5] diff --git a/src/calibre/ebooks/metadata/rar.py b/src/calibre/ebooks/metadata/rar.py index d23577eab1..a9b5d45546 100644 --- a/src/calibre/ebooks/metadata/rar.py +++ b/src/calibre/ebooks/metadata/rar.py @@ -32,7 +32,7 @@ def get_metadata(stream): if stream_type: stream_type = stream_type[1:] if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', - 'rb', 'imp', 'pdf', 'lrf'): + 'rb', 'imp', 'pdf', 'lrf', 'azw'): with TemporaryDirectory() as tdir: with CurrentDir(tdir): stream = extract_member(path, match=None, name=f, diff --git a/src/calibre/ebooks/metadata/zip.py b/src/calibre/ebooks/metadata/zip.py index 08ac132d53..b8c260bd1f 100644 --- a/src/calibre/ebooks/metadata/zip.py +++ b/src/calibre/ebooks/metadata/zip.py @@ -23,7 +23,7 @@ def get_metadata(stream): if stream_type: stream_type = stream_type[1:] if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', - 'rb', 'imp', 'pdf', 'lrf'): + 'rb', 'imp', 'pdf', 'lrf', 'azw'): with TemporaryDirectory() as tdir: with CurrentDir(tdir): path = zf.extract(f) diff --git a/src/calibre/utils/smtp.py b/src/calibre/utils/smtp.py index 87019ed146..3246810010 100644 --- a/src/calibre/utils/smtp.py +++ b/src/calibre/utils/smtp.py @@ -135,7 +135,7 @@ def option_parser(): 'to SMTP server.')) r=parser.add_option_group('SMTP RELAY', 'Options to use an SMTP relay server to send mail. ' - '%prog will try to send the email directly unless --relay is ' + 'calibre will try to send the email directly unless --relay is ' 'specified.').add_option r('-r', '--relay', help=('An SMTP relay server to use to send mail.')) r('-p', '--port', default=-1,