GwR add metadata editing for DRM files, fix catalog CLI options

2025-08-11 09:13:57 -04:00 · 2010-02-18 04:38:13 -07:00 · 2010-02-18 04:38:13 -07:00 · 7d69048a4b
commit 7d69048a4b
parent d6de6a564c d45da57047
11 changed files with 615 additions and 92 deletions
--- a/resources/images/news/wired_uk.png
+++ b/resources/images/news/wired_uk.png
--- a/resources/kathemerini.recipe
+++ b/resources/kathemerini.recipe
@ -0,0 +1,37 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class Kathimerini(BasicNewsRecipe):
+    title                  = 'Kathimerini'
+    __author__             = 'Pan'
+    description            = 'News from Greece'
+    max_articles_per_feed  = 100
+    oldest_article = 100
+    publisher              = 'Kathimerini'
+    category               = 'news, GR'
+    language               = 'el'
+    no_stylesheets         = True
+    remove_tags_before = dict(name='td',attrs={'class':'news'})
+    remove_tags_after = dict(name='td',attrs={'class':'news'})
+    remove_attributes = ['width', 'src','header','footer']
+
+    feeds = [(u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae',
+  'http://wk.kathimerini.gr/xml_files/politics.xml'),
+ (u'\u0395\u03bb\u03bb\u03ac\u03b4\u03b1',
+  ' http://wk.kathimerini.gr/xml_files/ell.xml'),
+ (u'\u039a\u03cc\u03c3\u03bc\u03bf\u03c2',
+  ' http://wk.kathimerini.gr/xml_files/world.xml'),
+ (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1',
+  'http://wk.kathimerini.gr/xml_files/economy_1.xml'),
+ (u'\u0395\u03c0\u03b9\u03c7\u03b5\u03b9\u03c1\u03ae\u03c3\u03b5\u03b9\u03c2',
+  'http://wk.kathimerini.gr/xml_files/economy_2.xml'),
+ (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae\u03c2 \u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1',
+  'http://wk.kathimerini.gr/xml_files/economy_3.xml'),
+ (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2',
+  'http://wk.kathimerini.gr/xml_files/civ.xml'),
+ (u'\u039c\u03cc\u03bd\u03b9\u03bc\u03b5\u03c2 \u03a3\u03c4\u03ae\u03bb\u03b5\u03c2',
+  'http://wk.kathimerini.gr/xml_files/st.xml')]
+
+    def print_version(self, url):
+        return url.replace('http://news.kathimerini.gr/4dcgi/', 'http://news.kathimerini.gr/4dcgi/4dcgi/')
+
+
--- a/resources/recipes/wired_uk.recipe
+++ b/resources/recipes/wired_uk.recipe
@ -0,0 +1,74 @@
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.wired.co.uk
+'''
+
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Wired_UK(BasicNewsRecipe):
+    title                 = 'Wired Magazine - UK edition'
+    __author__            = 'Darko Miletic'
+    description           = 'Gaming news'
+    publisher             = 'Conde Nast Digital'
+    category              = 'news, games, IT, gadgets'
+    oldest_article        = 32
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    encoding              = 'utf-8'
+    use_embedded_content  = False
+    masthead_url          = 'http://www.wired.co.uk/_/media/wired-logo_UK.gif'
+    language              = 'en_GB'
+    extra_css             = ' body{font-family: Palatino,"Palatino Linotype","Times New Roman",Times,serif} img{margin-bottom: 0.8em } .img-descr{font-family: Tahoma,Arial,Helvetica,sans-serif; font-size: 0.6875em; display: block} '
+    index                 = 'http://www.wired.co.uk/wired-magazine.aspx'
+
+    conversion_options = {
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    keep_only_tags = [dict(name='div', attrs={'class':'article-box'})]
+    remove_tags = [
+                     dict(name=['object','embed','iframe','link'])
+                    ,dict(attrs={'class':['opts','comment','stories']})
+                  ]
+    remove_tags_after = dict(name='div',attrs={'class':'stories'})
+    remove_attributes = ['height','width']
+
+
+    def parse_index(self):
+        totalfeeds = []
+        soup   = self.index_to_soup(self.index)
+        maincontent = soup.find('div',attrs={'class':'main-content'})
+        mfeed = []
+        if maincontent:
+           st = maincontent.find(attrs={'class':'most-wired-box'})
+           if st:
+              for itt in st.findAll('a',href=True):
+               url   = 'http://www.wired.co.uk' + itt['href']
+               title = self.tag_to_string(itt)
+               description = ''
+               date  = strftime(self.timefmt)
+               mfeed.append({
+                                  'title'      :title
+                                 ,'date'       :date
+                                 ,'url'        :url
+                                 ,'description':description
+                                })
+        totalfeeds.append(('Articles', mfeed))
+        return totalfeeds
+
+    def get_cover_url(self):
+        cover_url = None
+        soup = self.index_to_soup(self.index)
+        cover_item = soup.find('span', attrs={'class':'cover'})
+        if cover_item:
+           cover_url = cover_item.img['src']
+        return cover_url
+
+    def print_version(self, url):
+        return url + '?page=all'
--- a/resources/tanea.recipe
+++ b/resources/tanea.recipe
@ -0,0 +1,30 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class TaNea(BasicNewsRecipe):
+    title          = u'Ta Nea'
+    __author__             = 'Pan'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    no_stylesheets         = True
+
+    remove_tags_before = dict(name='div',attrs={'id':'print-body'})
+    remove_tags_after = dict(name='div',attrs={'id':'text'})
+
+    feeds = [
+        (u'\xce\x95\xce\xbb\xce\xbb\xce\xac\xce\xb4\xce\xb1',
+        u'http://www.tanea.gr/default.asp?pid=66&la=1'),
+        (u'\xce\x9a\xcf\x8c\xcf\x83\xce\xbc\xce\xbf\xcf\x82',
+        u'http://www.tanea.gr/default.asp?pid=67&la=1'),
+        (u'\xce\x9f\xce\xb9\xce\xba\xce\xbf\xce\xbd\xce\xbf\xce\xbc\xce\xaf\xce\xb1',
+        u'http://www.tanea.gr/default.asp?pid=68&la=1'),
+        (u'\xce\xa0\xce\xbf\xce\xbb\xce\xb9\xcf\x84\xce\xb9\xcf\x83\xce\xbc\xcf\x8c\xcf\x82',
+        u'http://www.tanea.gr/default.asp?pid=69&la=1'),
+        (u'\xce\x93\xce\xbd\xcf\x8e\xce\xbc\xce\xb5\xcf\x82',
+        u'http://www.tanea.gr/default.asp?pid=79&la=1'),
+        (u'\xce\xa1\xce\xb9\xcf\x80\xce\xad\xcf\x82',
+        u'http://www.tanea.gr/default.asp?pid=80&la=1'),
+        (u'\xce\x91\xce\xb9\xcf\x87\xce\xbc\xce\xad\xcf\x82',
+        u'http://www.tanea.gr/default.asp?pid=81&la=1')]
+
+    def print_version(self, url):
+        return url.replace('http://www.tanea.gr/default.asp?pid=2', 'http://www.tanea.gr/default.asp?pid=96')
--- a/src/calibre/customize/init.py
+++ b/src/calibre/customize/init.py
@ -249,6 +249,7 @@ class CatalogPlugin(Plugin):
    #:                       dest = 'catalog_title',
    #:                       help = (_('Title of generated catalog. \nDefault:') + " '" +
    #:                       '%default' + "'"))]
+    #: cli_options parsed in library.cli:catalog_option_parser()

    cli_options = []

@ -275,9 +276,10 @@ class CatalogPlugin(Plugin):
    def get_output_fields(self, opts):
        # Return a list of requested fields, with opts.sort_by first
        all_fields = set(
-                          ['author_sort','authors','comments','cover','formats',                           'id','isbn','pubdate','publisher','rating',
-                          'series_index','series','size','tags','timestamp',
-                          'title','uuid'])
+                          ['author_sort','authors','comments','cover','formats',
+                           'id','isbn','pubdate','publisher','rating',
+                           'series_index','series','size','tags','timestamp',
+                           'title','uuid'])

        fields = all_fields
        if opts.fields != 'all':
--- a/src/calibre/ebooks/chm/init.py
+++ b/src/calibre/ebooks/chm/init.py
@ -0,0 +1,8 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Used for chm input
+'''
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@ -0,0 +1,384 @@
+from __future__ import with_statement
+''' CHM File decoding support '''
+__license__ = 'GPL v3'
+__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
+                 ' and Alex Bramley <a.bramley at gmail.com>.'
+
+import os, shutil, uuid
+from tempfile import mkdtemp
+from mimetypes import guess_type as guess_mimetype
+
+from BeautifulSoup import BeautifulSoup
+from lxml import html
+from pychm.chm import CHMFile
+from pychm.chmlib import (
+  CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
+  chm_enumerate,
+)
+
+from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.utils.config import OptionParser
+from calibre.ebooks.metadata.toc import TOC
+from calibre.utils.localization import get_lang
+from calibre.utils.filenames import ascii_filename
+
+
+def match_string(s1, s2_already_lowered):
+    if s1 is not None and s2_already_lowered is not None:
+        if s1.lower()==s2_already_lowered:
+            return True
+    return False
+
+def option_parser():
+    '''
+    Build and return the command line OptionParser for CHM handling.
+
+    NOTE(review): several help strings below mention "LRF file" and some
+    options (--bookid, --font-delta, --get-thumbnail) look like they were
+    carried over from the LRF tool; confirm which of them actually apply to
+    CHM before relying on them.
+    '''
+    parser = OptionParser(usage=_('%prog [options] mybook.chm'))
+    # Output location and verbosity
+    parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
+    parser.add_option('--verbose', default=False, action='store_true', dest='verbose')
+    # Title / author metadata
+    parser.add_option("-t", "--title", action="store", type="string", \
+                    dest="title", help=_("Set the book title"))
+    parser.add_option('--title-sort', action='store', type='string', default=None,
+                      dest='title_sort', help=_('Set sort key for the title'))
+    parser.add_option("-a", "--author", action="store", type="string", \
+                    dest="author", help=_("Set the author"))
+    parser.add_option('--author-sort', action='store', type='string', default=None,
+                      dest='author_sort', help=_('Set sort key for the author'))
+    parser.add_option("-c", "--category", action="store", type="string", \
+                    dest="category", help=_("The category this book belongs"
+                    " to. E.g.: History"))
+    parser.add_option("--thumbnail", action="store", type="string", \
+                    dest="thumbnail", help=_("Path to a graphic that will be"
+                    " set as this files' thumbnail"))
+    # Note that --comment is stored under the destination name 'freetext'
+    parser.add_option("--comment", action="store", type="string", \
+                    dest="freetext", help=_("Path to a txt file containing a comment."))
+    parser.add_option("--get-thumbnail", action="store_true", \
+                    dest="get_thumbnail", default=False, \
+                    help=_("Extract thumbnail from LRF file"))
+    # The next options give no dest, so optparse derives it from the long
+    # option name (e.g. --get-cover -> get_cover)
+    parser.add_option('--publisher', default=None, help=_('Set the publisher'))
+    parser.add_option('--classification', default=None, help=_('Set the book classification'))
+    parser.add_option('--creator', default=None, help=_('Set the book creator'))
+    parser.add_option('--producer', default=None, help=_('Set the book producer'))
+    parser.add_option('--get-cover', action='store_true', default=False,
+                      help=_('Extract cover from LRF file. Note that the LRF format has no defined cover, so we use some heuristics to guess the cover.'))
+    parser.add_option('--bookid', action='store', type='string', default=None,
+                      dest='book_id', help=_('Set book ID'))
+    parser.add_option('--font-delta', action='store', type='int', default=0,
+                      dest='font_delta', help=_('Set font delta'))
+    return parser
+
+class CHMError(Exception):
+    ''' Raised by CHMReader when a CHM file cannot be opened, or when an
+    object inside it cannot be located or is zero bytes long. '''
+    pass
+
+class CHMReader(CHMFile):
+    def __init__(self, input, log):
+        '''
+        Open the CHM file and initialise reader state.
+
+        :param input: Value handed to ``LoadCHM`` (the caller in this file
+                      passes the path of the CHM file).
+        :param log: Logger object kept on ``self.log``.
+        :raises CHMError: if ``LoadCHM`` reports failure.
+        '''
+        CHMFile.__init__(self)
+        if not self.LoadCHM(input):
+            raise CHMError("Unable to open CHM file '%s'"%(input,))
+        self.log = log
+        self._sourcechm = input
+        # Cache for Contents(); None means "not enumerated yet"
+        self._contents = None
+        # Running play order counter used while building a TOC
+        self._playorder = 0
+        self._metadata = False
+        # Set to True once ExtractFiles() has completed
+        self._extracted = False
+
+        # location of '.hhc' file, which is the CHM TOC.
+        # self.topics is presumably populated by CHMFile.LoadCHM -- confirm.
+        # Its leading '/' and extension are dropped and '.hhc' is appended.
+        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
+        self.hhc_path = self.root + ".hhc"
+
+
+    def _parse_toc(self, ul, basedir=os.getcwdu()):
+        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
+        self._playorder += 1
+        for li in ul('li', recursive=False):
+            href = li.object('param', {'name': 'Local'})[0]['value']
+            if href.count('#'):
+                href, frag = href.split('#')
+            else:
+                frag = None
+            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
+            #print "========>", name
+            toc.add_item(href, frag, name, play_order=self._playorder)
+            self._playorder += 1
+            if li.ul:
+               child = self._parse_toc(li.ul)
+               child.parent = toc
+               toc.append(child)
+        #print toc
+        return toc
+
+
+    def GetFile(self, path):
+        # have to have abs paths for ResolveObject, but Contents() deliberately
+        # makes them relative. So we don't have to worry, re-add the leading /.
+        # note this path refers to the internal CHM structure
+        if path[0] != '/':
+            path = '/' + path
+        res, ui = self.ResolveObject(path)
+        if res != CHM_RESOLVE_SUCCESS:
+            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
+        size, data = self.RetrieveObject(ui)
+        if size == 0:
+            raise CHMError("'%s' is zero bytes in length!"%(path,))
+        return data
+
+    def ExtractFiles(self, output_dir=os.getcwdu()):
+        for path in self.Contents():
+            lpath = os.path.join(output_dir, path)
+            self._ensure_dir(lpath)
+            data = self.GetFile(path)
+            with open(lpath, 'wb') as f:
+                if guess_mimetype(path)[0] == ('text/html'):
+                    data = self._reformat(data)
+                f.write(data)
+        #subprocess.call(['extract_chmLib.exe', self._sourcechm, output_dir])
+        self._extracted = True
+
+    def _reformat(self, data):
+        '''
+        Clean up one HTML page extracted from the CHM file.
+
+        Removes <script> elements, drops the first and/or last <table> when
+        it has at most one sibling before/after it, normalises <img> src
+        attributes, and returns the prettified markup. If BeautifulSoup
+        raises UnicodeEncodeError the data is returned untouched.
+        '''
+        try:
+            soup = BeautifulSoup(data)
+        except UnicodeEncodeError:
+            # hit some strange encoding problems...
+            print "Unable to parse html for cleaning, leaving it :("
+            return data
+        # Remove all javascript; the list comprehension is used only for the
+        # side effect of extract()
+        [s.extract() for s in soup('script')]
+        # Remove forward and back navigation bars from the top/bottom of each
+        # page: they interrupt the flow of the text and waste space.
+        # Since we can't use [a,b] syntax to select arbitrary items from a
+        # list we have to handle the first and last table separately.
+        t = soup('table')
+        if t:
+            # First table: dropped when nothing, or only a single node,
+            # precedes it
+            if (t[0].previousSibling is None
+              or t[0].previousSibling.previousSibling is None):
+                t[0].extract()
+            # Last table: dropped when nothing, or only a single node,
+            # follows it
+            if (t[-1].nextSibling is None
+              or t[-1].nextSibling.nextSibling is None):
+                t[-1].extract()
+        # For some odd reason each page's content appears to be in a table
+        # too, and this table has sub-tables for asides.
+
+        # some images seem to be broken in some chm's
+        for img in soup('img'):
+            try:
+                # Some src values claim to be relative to a parent directory
+                # but are not: strip every leading '../'
+                while img['src'].startswith('../'): img['src'] = img['src'][3:]
+                # some have ";<junk>" at the end.
+                img['src'] = img['src'].split(';')[0]
+            except KeyError:
+                # <img> without a src attribute: nothing to fix
+                pass
+        # now give back some pretty html.
+        return soup.prettify()
+
+    def Contents(self):
+        '''
+        Return the list of file paths stored in the CHM file.
+
+        Paths are relative (leading '/' stripped) and entries whose path ends
+        with '/' are left out. The list is computed on the first call and
+        cached in ``self._contents`` afterwards.
+        '''
+        if self._contents is not None:
+            return self._contents
+        paths = []
+        # Callback handed to chm_enumerate; presumably invoked once per entry
+        # with the entry's unit info as ``ui`` -- confirm against chmlib.
+        def get_paths(chm, ui, ctx):
+            # skip directories
+            # note this path refers to the internal CHM structure
+            if ui.path[-1] != '/':
+                # and make paths relative
+                paths.append(ui.path.lstrip('/'))
+        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
+        self._contents = paths
+        return self._contents
+
+    def _ensure_dir(self, path):
+        dir = os.path.dirname(path)
+        if not os.path.isdir(dir):
+            os.makedirs(dir)
+
+    def extract_content(self, output_dir=os.getcwdu()):
+        self.ExtractFiles(output_dir=output_dir)
+
+
+class CHMInput(InputFormatPlugin):
+
+    name        = 'CHM Input'
+    author      = 'Kovid Goyal and Alex Bramley'
+    description = 'Convert CHM files to OEB'
+    file_types  = set(['chm'])
+
+    options = set([
+        OptionRecommendation(name='dummy_option', recommended_value=False,
+            help=_('dummy option until real options are determined.')),
+    ])
+
+    def _chmtohtml(self, output_dir, chm_path, no_images, log):
+        log.debug('Opening CHM file')
+        rdr = CHMReader(chm_path, log)
+        log.debug('Extracting CHM to %s' % output_dir)
+        rdr.extract_content(output_dir)
+        return rdr.hhc_path
+
+
+    def convert(self, stream, options, file_ext, log, accelerators):
+        from calibre.ebooks.metadata.chm import get_metadata_
+
+        log.debug('Processing CHM...')
+        tdir = mkdtemp(prefix='chm2oeb_')
+        from calibre.customize.ui import plugin_for_input_format
+        html_input = plugin_for_input_format('html')
+        for opt in html_input.options:
+            setattr(options, opt.option.name, opt.recommended_value)
+        options.input_encoding = 'utf-8'
+        no_images = False #options.no_images
+        chm_name = stream.name
+        #chm_data = stream.read()
+
+        #closing stream so CHM can be opened by external library
+        stream.close()
+        log.debug('tdir=%s' % tdir)
+        log.debug('stream.name=%s' % stream.name)
+        mainname = self._chmtohtml(tdir, chm_name, no_images, log)
+        mainpath = os.path.join(tdir, mainname)
+
+        metadata = get_metadata_(tdir)
+
+        odi = options.debug_pipeline
+        options.debug_pipeline = None
+        # try a custom conversion:
+        #oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
+        # try using html converter:
+        htmlpath = self._create_html_root(mainpath, log)
+        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
+        options.debug_pipeline = odi
+        #log.debug('DEBUG: Not removing tempdir %s' % tdir)
+        shutil.rmtree(tdir)
+        return oeb
+
+    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
+        # use HTMLInput plugin to generate book
+        from calibre.ebooks.html.input import HTMLInput
+        opts.breadth_first = True
+        htmlinput = HTMLInput(None)
+        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
+        return oeb
+
+
+    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
+        '''
+        Custom conversion path: build an OEB book directly from the .hhc
+        file instead of going through the HTML input plugin.
+
+        NOTE(review): the only call to this method in convert() is commented
+        out, so it appears to be unused at the moment.
+
+        :param hhcpath: Path of the .hhc table of contents file.
+        :param basedir: Directory the CHM was extracted into.
+        :param opts: Conversion options; ``input_encoding`` is read.
+        :param mi: Metadata object; title, authors, publisher and isbn are
+                   copied into the book when set.
+        :return: The populated OEB book.
+        '''
+        from calibre.ebooks.conversion.plumber import create_oebbook
+        from calibre.ebooks.oeb.base import DirContainer
+        oeb = create_oebbook(log, None, opts, self,
+                encoding=opts.input_encoding, populate=False)
+        self.oeb = oeb
+
+        # Copy whatever metadata is available, then fill in defaults for the
+        # fields that are still empty
+        metadata = oeb.metadata
+        if mi.title:
+            metadata.add('title', mi.title)
+        if mi.authors:
+            for a in mi.authors:
+                metadata.add('creator', a, attrib={'role':'aut'})
+        if mi.publisher:
+            metadata.add('publisher', mi.publisher)
+        if mi.isbn:
+            metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})
+        if not metadata.language:
+            oeb.logger.warn(u'Language not specified')
+            metadata.add('language', get_lang())
+        if not metadata.creator:
+            oeb.logger.warn('Creator not specified')
+            metadata.add('creator', _('Unknown'))
+        if not metadata.title:
+            oeb.logger.warn('Title not specified')
+            metadata.add('title', _('Unknown'))
+
+        # Always add a freshly generated uuid identifier
+        bookid = str(uuid.uuid4())
+        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
+        # NOTE(review): when any identifier carries an 'id' attribute the uid
+        # is set to the *first* identifier, not to the matching one --
+        # confirm this is intended.
+        for ident in metadata.identifier:
+            if 'id' in ident.attrib:
+                self.oeb.uid = metadata.identifier[0]
+                break
+
+        hhcdata = self._read_file(hhcpath)
+        hhcroot = html.fromstring(hhcdata)
+        chapters = self._process_nodes(hhcroot)
+        #print "============================="
+        #print "Printing hhcroot"
+        #print etree.tostring(hhcroot, pretty_print=True)
+        #print "============================="
+        log.debug('Found %d section nodes' % len(chapters))
+
+        if len(chapters) > 0:
+            # Every chapter is looked up in the directory of the first one
+            path0 = chapters[0][1]
+            subpath = os.path.dirname(path0)
+            htmlpath = os.path.join(basedir, subpath)
+
+            oeb.container = DirContainer(htmlpath, log)
+            for chapter in chapters:
+                title = chapter[0]
+                basename = os.path.basename(chapter[1])
+                self._add_item(oeb, title, basename)
+
+            # The container is assigned a second time, now with oeb.log
+            oeb.container = DirContainer(htmlpath, oeb.log)
+        return oeb
+
+    def _create_html_root(self, hhcpath, log):
+        hhcdata = self._read_file(hhcpath)
+        hhcroot = html.fromstring(hhcdata)
+        chapters = self._process_nodes(hhcroot)
+        #print "============================="
+        #print "Printing hhcroot"
+        #print etree.tostring(hhcroot, pretty_print=True)
+        #print "============================="
+        log.debug('Found %d section nodes' % len(chapters))
+        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
+        f = open(htmlpath, 'wb')
+        f.write("<HTML><HEAD></HEAD><BODY>\r\n")
+
+        if chapters:
+            path0 = chapters[0][1]
+            subpath = os.path.dirname(path0)
+
+            for chapter in chapters:
+                title = chapter[0]
+                rsrcname = os.path.basename(chapter[1])
+                rsrcpath = os.path.join(subpath, rsrcname)
+                # title should already be url encoded
+                url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\r\n"
+                f.write(url)
+
+        f.write("</BODY></HTML>")
+        f.close()
+        return htmlpath
+
+
+    def _read_file(self, name):
+        f = open(name, 'rb')
+        data = f.read()
+        f.close()
+        return data
+
+    def _visit_node(self, node, chapters, depth):
+        # check that node is a normal node (not a comment, DOCTYPE, etc.)
+        # (normal nodes have string tags)
+        if isinstance(node.tag, basestring):
+            if match_string(node.tag, 'object') and match_string(node.attrib['type'], 'text/sitemap'):
+                for child in node:
+                    if match_string(child.tag,'param') and match_string(child.attrib['name'], 'name'):
+                        chapter_title = child.attrib['value']
+                    if match_string(child.tag,'param') and match_string(child.attrib['name'],'local'):
+                        chapter_path = child.attrib['value']
+                if chapter_title is not None and chapter_path is not None:
+                    chapter = [chapter_title, chapter_path, depth]
+                    chapters.append(chapter)
+            if node.tag=="UL":
+                depth = depth + 1
+            if node.tag=="/UL":
+                depth = depth - 1
+
+    def _process_nodes(self, root):
+        chapters = []
+        depth = 0
+        for node in root.iter():
+            self._visit_node(node, chapters, depth)
+        return chapters
+
+    def _add_item(self, oeb, title, path):
+        bname = os.path.basename(path)
+        id, href = oeb.manifest.generate(id='html',
+                href=ascii_filename(bname))
+        item = oeb.manifest.add(id, href, 'text/html')
+        item.html_input_href = bname
+        oeb.spine.add(item, True)
+        oeb.toc.add(title, item.href)
+
--- a/src/calibre/ebooks/metadata/mobi.py
+++ b/src/calibre/ebooks/metadata/mobi.py
@ -107,7 +107,13 @@ class MetadataUpdater(object):
        self.cover_record = self.thumbnail_record = None
        self.timestamp = None
        self.pdbrecords = self.get_pdbrecords()
-        self.drm_block = self.fetchDRMdata()
+
+        self.drm_block = None
+        if self.encryption_type != 0:
+            if self.have_exth:
+                self.drm_block = self.fetchDRMdata()
+            else:
+                raise MobiError('Unable to set metadata on DRM file without EXTH header')

        self.original_exth_records = {}
        if not have_exth:
@ -117,38 +123,14 @@ class MetadataUpdater(object):
        self.fetchEXTHFields()

    def fetchDRMdata(self):
-        ''' Grab everything between end of EXTH and title '''
-        '''
-        if False and self.have_exth:
-            print "incoming file has EXTH header"
-            # 20:24 = mobiHeaderLength, 16=PDBHeader size, 4 = len('EXTH')
-            exth_off = int(unpack('>I', self.record0[20:24])[0] + 16)
-            print "exth_off = 0x%x" % exth_off
-            exth_len_offset = exth_off + 4
-            print "exth_len_offset = 0x%x" % exth_len_offset
-            exth_len = int(unpack('>I', self.record0[exth_len_offset:exth_len_offset+4])[0])
-            print "len(EXTH) = 0x%x" % exth_len
-            title_offset = int(unpack('>I', self.record0[0x54:0x58])[0])
-            print "offset of full title = 0x%x" % title_offset
-            drm_off = exth_off + exth_len
-            print "DRM data begins at 0x%x" % drm_off
-            print "DRM len is 0x%x bytes" % (title_offset - drm_off)
-            return self.record0[drm_off:drm_off + (title_offset - drm_off)]
-        else:
-        '''
-        if True:
-            drm_offset = int(unpack('>I', self.record0[0xa8:0xac])[0])
-            self.drm_key_count = int(unpack('>I', self.record0[0xac:0xb0])[0])
-            drm_string = ''
-            for x in range(self.drm_key_count):
-                base_addr = drm_offset + (x * self.DRM_KEY_SIZE)
-                drm_string += self.record0[base_addr:base_addr + self.DRM_KEY_SIZE]
-            return drm_string
-        else:
-            drm_offset = int(unpack('>I', self.record0[0xa8:0xac])[0])
-            title_offset = int(unpack('>I', self.record0[0x54:0x58])[0])
-            drm_blocklen = title_offset - drm_offset
-            return self.record0[drm_offset:drm_offset + drm_blocklen]
+        ''' Fetch the DRM keys '''
+        drm_offset = int(unpack('>I', self.record0[0xa8:0xac])[0])
+        self.drm_key_count = int(unpack('>I', self.record0[0xac:0xb0])[0])
+        drm_keys = ''
+        for x in range(self.drm_key_count):
+            base_addr = drm_offset + (x * self.DRM_KEY_SIZE)
+            drm_keys += self.record0[base_addr:base_addr + self.DRM_KEY_SIZE]
+        return drm_keys

    def fetchEXTHFields(self):
        stream = self.stream
@ -224,7 +206,8 @@ class MetadataUpdater(object):

    def create_exth(self, new_title=None, exth=None):
        # Add an EXTH block to record 0, rewrite the stream
-        # self.hexdump(self.record0)
+        if isinstance(new_title, unicode):
+            new_title = new_title.encode(self.codec, 'replace')

        # Fetch the existing title
        title_offset, = unpack('>L', self.record0[0x54:0x58])
@ -248,12 +231,13 @@ class MetadataUpdater(object):
            exth = ['EXTH', pack('>II', 12, 0), pad]
            exth = ''.join(exth)

-        # Update drm_offset
-        self.record0[0xa8:0xac] = pack('>L', 0x10 + mobi_header_length + len(exth))
-        if True:
+        # Update drm_offset(0xa8), title_offset(0x54)
+        if self.encryption_type != 0:
+            self.record0[0xa8:0xac] = pack('>L', 0x10 + mobi_header_length + len(exth))
            self.record0[0xb0:0xb4] = pack('>L', len(self.drm_block))
-        # Update title_offset
-        self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth) + len(self.drm_block))
+            self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth) + len(self.drm_block))
+        else:
+            self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth))

        if new_title:
            self.record0[0x58:0x5c] = pack('>L', len(new_title))
@ -262,21 +246,15 @@ class MetadataUpdater(object):
        new_record0 = StringIO()
        new_record0.write(self.record0[:0x10 + mobi_header_length])
        new_record0.write(exth)
-        new_record0.write(self.drm_block)
-        if new_title:
-            #new_record0.write(new_title.encode(self.codec, 'replace'))
-            new_title = (new_title or _('Unknown')).encode(self.codec, 'replace')
-            new_record0.write(new_title)
-        else:
-            new_record0.write(title_in_file)
+        if self.encryption_type != 0:
+            new_record0.write(self.drm_block)
+        new_record0.write(new_title if new_title else title_in_file)

        # Pad to a 4-byte boundary
        trail = len(new_record0.getvalue()) % 4
        pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte
        new_record0.write(pad)

-        #self.hexdump(new_record0.getvalue())
-
        # Rebuild the stream, update the pdbrecords pointers
        self.patchSection(0,new_record0.getvalue())

@ -386,11 +364,7 @@ class MetadataUpdater(object):
            recs.append((202, pack('>I', self.thumbnail_rindex)))
            pop_exth_record(202)

-        if getattr(self, 'encryption_type', -1) != 0:
-            prints(u"Setting metadata for '%s' (DRM)" % mi.title)
-            # raise MobiError('Setting metadata in DRMed MOBI files is not supported.')
-
-        # Restore any original EXTH fields that weren't modified/updated
+        # Restore any original EXTH fields that weren't updated
        for id in sorted(self.original_exth_records):
            recs.append((id, self.original_exth_records[id]))
        recs = sorted(recs, key=lambda x:(x[0],x[0]))
--- a/src/calibre/ebooks/mobi/writer.py
+++ b/src/calibre/ebooks/mobi/writer.py
@ -1376,7 +1376,7 @@ class MobiWriter(object):
            self._text_length,
            self._text_nrecords-1, RECORD_SIZE, 0, 0)) # 0 - 15 (0x0 - 0xf)
        uid = random.randint(0, 0xffffffff)
-        title = str(metadata.title[0])
+        title = unicode(metadata.title[0]).encode('utf-8')
        # The MOBI Header

        # 0x0 - 0x3
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -1,4 +1,4 @@
-import datetime, htmlentitydefs, os, re, shutil, time
+import datetime, htmlentitydefs, os, re, shutil

 from collections import namedtuple
 from copy import deepcopy
@ -21,7 +21,7 @@ FIELDS = ['all', 'author_sort', 'authors', 'comments',
 class CSV_XML(CatalogPlugin):
    'CSV/XML catalog generator'

-    Option = namedtuple('Option', 'option, default, dest, help')
+    Option = namedtuple('Option', 'option, default, dest, action, help')

    name = 'Catalog_CSV_XML'
    description = 'CSV/XML catalog generator'
@ -34,6 +34,7 @@ class CSV_XML(CatalogPlugin):
            Option('--fields',
                default = 'all',
                dest = 'fields',
+                action = None,
                help = _('The fields to output when cataloging books in the '
                    'database.  Should be a comma-separated list of fields.\n'
                    'Available fields: %s.\n'
@ -43,6 +44,7 @@ class CSV_XML(CatalogPlugin):
            Option('--sort-by',
                default = 'id',
                dest = 'sort_by',
+                action = None,
                help = _('Output field to sort on.\n'
                'Available fields: author_sort, id, rating, size, timestamp, title.\n'
                "Default: '%default'\n"
@ -241,7 +243,7 @@ class CSV_XML(CatalogPlugin):
 class EPUB_MOBI(CatalogPlugin):
    'ePub catalog generator'

-    Option = namedtuple('Option', 'option, default, dest, help')
+    Option = namedtuple('Option', 'option, default, dest, action, help')

    name = 'Catalog_EPUB_MOBI'
    description = 'EPUB/MOBI catalog generator'
@ -254,12 +256,14 @@ class EPUB_MOBI(CatalogPlugin):
    cli_options = [Option('--catalog-title',
                          default = 'My Books',
                          dest = 'catalog_title',
+                          action = None,
                          help = _('Title of generated catalog used as title in metadata.\n'
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--debug-pipeline',
                           default=None,
                           dest='debug_pipeline',
+                           action = None,
                           help=_("Save the output from different stages of the conversion "
                           "pipeline to the specified "
                           "directory. Useful if you are unsure at which stage "
@ -269,48 +273,56 @@ class EPUB_MOBI(CatalogPlugin):
                   Option('--exclude-genre',
                          default='\[[\w ]*\]',
                          dest='exclude_genre',
+                          action = None,
                          help=_("Regex describing tags to exclude as genres.\n" "Default: '%default' excludes bracketed tags, e.g. '[<tag>]'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--exclude-tags',
                          default=('~,'+_('Catalog')),
                          dest='exclude_tags',
+                          action = None,
                          help=_("Comma-separated list of tag words indicating book should be excluded from output.  Case-insensitive.\n"
                          "--exclude-tags=skip will match 'skip this book' and 'Skip will like this'.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--generate-titles',
-                          default=True,
+                          default=False,
                          dest='generate_titles',
+                          action = 'store_true',
                          help=_("Include 'Titles' section in catalog.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--generate-recently-added',
-                          default=True,
+                          default=False,
                          dest='generate_recently_added',
+                          action = 'store_true',
                          help=_("Include 'Recently Added' section in catalog.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--note-tag',
                          default='*',
                          dest='note_tag',
+                          action = None,
                          help=_("Tag prefix for user notes, e.g. '*Jeff might enjoy reading this'.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--numbers-as-text',
                          default=False,
                          dest='numbers_as_text',
+                          action = None,
                          help=_("Sort titles with leading numbers as text, e.g.,\n'2001: A Space Odyssey' sorts as \n'Two Thousand One: A Space Odyssey'.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--output-profile',
                          default=None,
                          dest='output_profile',
+                          action = None,
                          help=_("Specifies the output profile.  In some cases, an output profile is required to optimize the catalog for the device.  For example, 'kindle' or 'kindle_dx' creates a structured Table of Contents with Sections and Articles.\n"
                          "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                   Option('--read-tag',
                          default='+',
                          dest='read_tag',
+                          action = None,
                          help=_("Tag indicating book has been read.\n" "Default: '%default'\n"
                          "Applies to: ePub, MOBI output formats")),
                          ]
@ -3418,13 +3430,12 @@ class EPUB_MOBI(CatalogPlugin):
    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        opts.log = log = Log()
        opts.fmt = self.fmt = path_to_output.rpartition('.')[2]
-        self.opts = opts

        # Add local options
        opts.creator = "calibre"

        # Finalize output_profile
-        op = self.opts.output_profile
+        op = opts.output_profile
        if op is None:
            op = 'default'
        if opts.connected_device['name'] and 'kindle' in opts.connected_device['name'].lower():
@ -3434,13 +3445,30 @@ class EPUB_MOBI(CatalogPlugin):
                op = "kindle"
        opts.descriptionClip = 380 if op.endswith('dx') or 'kindle' not in op else 100
        opts.authorClip = 100 if op.endswith('dx') or 'kindle' not in op else 60
-        self.opts.output_profile = op
+        opts.output_profile = op

        opts.basename = "Catalog"
        opts.cli_environment = not hasattr(opts,'sync')
        # GwR *** hardwired to sort by author, could be an option if passed in opts
        opts.sort_descriptions_by_author = True

+        # If exclude_genre is blank, assume user wants all genre tags included
+        if opts.exclude_genre.strip() == '':
+            opts.exclude_genre = '\[^.\]'
+            log(" converting empty exclude_genre to '\[^.\]'")
+
+        if opts.connected_device['name']:
+            if opts.connected_device['serial']:
+                log(" connected_device: '%s' #%s%s " % \
+                    (opts.connected_device['name'],
+                     opts.connected_device['serial'][0:4],
+                     'x' * (len(opts.connected_device['serial']) - 4)))
+            else:
+                log(" connected_device: '%s'" % opts.connected_device['name'])
+                for storage in opts.connected_device['storage']:
+                    if storage:
+                        log("  mount point: %s" % storage)
+
        if opts.verbose:
            opts_dict = vars(opts)
            log(u"%s(): Generating %s %sin %s environment" %
@ -3458,26 +3486,6 @@ class EPUB_MOBI(CatalogPlugin):
                sections_list.append('Genres')
            log(u"Creating Sections for %s" % ', '.join(sections_list))

-            # If exclude_genre is blank, assume user wants all genre tags included
-            if opts.exclude_genre.strip() == '':
-                opts.exclude_genre = '\[^.\]'
-                log(" converting empty exclude_genre to '\[^.\]'")
-
-            if opts.connected_device['name']:
-                if opts.connected_device['serial']:
-                    log(" connected_device: '%s' #%s%s " % \
-                        (opts.connected_device['name'],
-                         opts.connected_device['serial'][0:4],
-                         'x' * (len(opts.connected_device['serial']) - 4)))
-                else:
-                    log(" connected_device: '%s'" % opts.connected_device['name'])
-
-                for storage in opts.connected_device['storage']:
-                    if storage:
-                        log("  mount point: %s" % storage)
-#                 for book in opts.connected_device['books']:
-#                     log("%s: %s" % (book.title, book.path))
-
            # Display opts
            keys = opts_dict.keys()
            keys.sort()
@ -3488,6 +3496,8 @@ class EPUB_MOBI(CatalogPlugin):
                           'search_text','sort_by','sort_descriptions_by_author','sync']:
                    log("  %s: %s" % (key, opts_dict[key]))

+        self.opts = opts
+
        # Launch the Catalog builder
        catalog = self.CatalogBuilder(db, opts, self, report_progress=notification)
        if opts.verbose:
--- a/src/calibre/library/cli.py
+++ b/src/calibre/library/cli.py
@ -587,9 +587,6 @@ def command_export(args, dbpath):
    do_export(get_db(dbpath, opts), ids, dir, opts)
    return 0

-
-#   GR additions
-
 def catalog_option_parser(args):
    from calibre.customize.ui import available_catalog_formats, plugin_for_catalog_format
    from calibre.utils.logging import Log
@ -599,10 +596,17 @@ def catalog_option_parser(args):
        # Fetch the extension-specific CLI options from the plugin
        plugin = plugin_for_catalog_format(fmt)
        for option in plugin.cli_options:
-            parser.add_option(option.option,
-                              default=option.default,
-                              dest=option.dest,
-                              help=option.help)
+            if option.action:
+                parser.add_option(option.option,
+                                  default=option.default,
+                                  dest=option.dest,
+                                  action=option.action,
+                                  help=option.help)
+            else:
+                parser.add_option(option.option,
+                                  default=option.default,
+                                  dest=option.dest,
+                                  help=option.help)

        return plugin