GwR incremental patch to support pseudo-list strings from composite columns

2025-07-09 03:04:10 -04:00 · 2011-01-12 16:06:27 -07:00 · 2011-01-12 16:06:27 -07:00 · 8212ab2a85
commit 8212ab2a85
parent 3407f7cf0d 2f08bc5086
25 changed files with 1159 additions and 873 deletions
--- a/resources/images/news/zerohedge.png
+++ b/resources/images/news/zerohedge.png
--- a/resources/recipes/expansion_spanish.recipe
+++ b/resources/recipes/expansion_spanish.recipe
@ -1,59 +1,79 @@
 #!/usr/bin/env  python
-# -*- coding: utf-8 -*-
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__author__    = 'Gerardo Diez'
+__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
+description   = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
+__docformat__ = 'restructuredtext en'
+
 '''
-www.expansion.com
+expansion.es
 '''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+class Publico(BasicNewsRecipe):
+    # Recipe for expansion.com (Spanish daily, category: finances):
+    # download settings, the page elements to keep/remove, and the RSS
+    # feeds grouped by site section.
+    title               =u'Expansion.com'
+    __author__      ='Gerardo Diez'
+    publisher       =u'Unidad Editorial Información Económica, S.L.'
+    category                ='finances, catalunya'
+    oldest_article      =1
+    max_articles_per_feed   =100
+    simultaneous_downloads  =10
+    cover_url       =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
+    timefmt         ='[%A, %d %B, %Y]'
+    encoding        ='latin'
+    language        ='es'
+    remove_javascript   =True
+    no_stylesheets      =True
+    keep_only_tags      =dict(name='div', attrs={'class':['noticia primer_elemento']})
+    remove_tags         =[
+                dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
+                dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
+                dict(name='span', attrs={'class':['comentarios']}),
+                dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
+                dict(name='div', attrs={'id':['comentarios_lectores_listado']})
+                            ]
+    feeds               =[
+                (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
+                (u'Portada: Bolsas', u'http://estaticos.expansion.com/rss/mercados.xml'),
+                (u'Divisas', u'http://estaticos.expansion.com/rss/mercadosdivisas.xml'),
+                (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
+                (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
+                (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),

-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+                (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
+                (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
+                (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
+                (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
+                (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
+                (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),

-class Expansion(BasicNewsRecipe):
-    title                 = 'Diario Expansion'
-    __author__            = 'Darko Miletic'
-    description           = 'Lider de informacion de mercados, economica y politica'
-    publisher             = 'expansion.com'
-    category              = 'news, politics, Spain'
-    oldest_article        = 2
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    use_embedded_content  = False
-    delay                 = 1
-    encoding              = 'iso-8859-15'
-    language = 'es'
+                (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
+                (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
+                (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
+                (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
+                (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
+                (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
+                (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
+                (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
+                # No leading whitespace inside the URL literal: it is part
+                # of the address handed to the downloader.
+                (u'Deporte y Negocio', u'http://estaticos.expansion.com/rss/empresasdeporte.xml'),
+                (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
+                (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
+                (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),

-    direction             = 'ltr'
+                (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
+                (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
+                (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),

-    html2lrf_options = [
-                          '--comment'  , description
-                        , '--category' , category
-                        , '--publisher', publisher
-                        ]
+                (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
+                (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
+                (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),

-    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+                (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
+                (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
+                (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
+                (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),

-    feeds              = [
-                            (u'Ultimas noticias', u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-                           ,(u'Temas del dia'   , u'http://rss.expansion.com/rss/descarga.htm?data2=178')
-                         ]
-
-
-    keep_only_tags = [dict(name='div', attrs={'id':'principal'})]
-
-    remove_tags        = [
-                             dict(name=['object','link','script'])
-                            ,dict(name='div', attrs={'class':['utilidades','tit_relacionadas']})
-                         ]
-
-    remove_tags_after = [dict(name='div', attrs={'class':'tit_relacionadas'})]
-
-    def preprocess_html(self, soup):
-        soup.html['dir' ] = self.direction
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mcharset)
-        for item in soup.findAll(style=True):
-            del item['style']
-        return soup
+                (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
+                # Feed titles are plain text like the other accented titles
+                # above, so use the literal character, not an HTML entity.
+                (u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
+                (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
+                ]

--- a/resources/recipes/msnbc.recipe
+++ b/resources/recipes/msnbc.recipe
@ -1,10 +1,9 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 msnbc.msn.com
 '''

-import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class MsNBC(BasicNewsRecipe):
@ -19,7 +18,16 @@ class MsNBC(BasicNewsRecipe):
    publisher              = 'msnbc.com'
    category               = 'news, USA, world'
    language               = 'en'
-    extra_css              = ' body{ font-family: sans-serif } .head{font-family: serif; font-size: xx-large; font-weight: bold; color: #CC0000} .abstract{font-weight: bold} .source{font-size: small} .updateTime{font-size: small} '
+    extra_css              = """
+                                body{ font-family: Georgia,Times,serif }
+                                .hide{display: none}
+                                .caption{font-family: Arial,sans-serif; font-size: x-small}
+                                .entry-summary{font-family: Arial,sans-serif}
+                                .copyright{font-size: 0.95em; font-style: italic}
+                                .source-org{font-size: small; font-family: Arial,sans-serif}
+                                img{display: block; margin-bottom: 0.5em}
+                                span.byline{display: none}
+                            """

    conversion_options = {
                             'comments' : description
@ -28,14 +36,20 @@ class MsNBC(BasicNewsRecipe):
                            ,'publisher': publisher
                         }

-    preprocess_regexps = [
-        (re.compile(r'</style></head>', re.DOTALL|re.IGNORECASE),lambda match: '</style>')
-       ,(re.compile(r'<div class="head">', re.DOTALL|re.IGNORECASE),lambda match: '</head><body><div class="head">'),
-    ]
+    remove_tags_before = dict(name='h1', attrs={'id':'headline'})
+    remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
+    keep_only_tags=[
+                      dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
+                     ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
+                   ]
+    remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
+
+    remove_tags      = [
+                          dict(name=['iframe','object','link','embed','meta','table'])
+                         ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
+                         ,dict(name='div', attrs={'class':'social'})
+                       ]

-    remove_tags_before = dict(name='div', attrs={'class':'head'})
-    remove_tags_after = dict(name='div', attrs={'class':'copyright'})
-    remove_tags      = [dict(name=['iframe','object','link','script','form'])]

    feeds = [
               (u'US News'       , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml'      )
@ -48,11 +62,26 @@ class MsNBC(BasicNewsRecipe):
              ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml'      )
            ]

-    def print_version(self, url):
-        return url + 'print/1/displaymode/1098/'
-
    def preprocess_html(self, soup):
-        for item in soup.head.findAll('div'):
-            item.extract()
+        for item in soup.body.findAll('html'):
+            item.name='div'
+        for item in soup.body.findAll('div'):
+            if item.has_key('id') and item['id'].startswith('vine-'):
+               item.extract()
+            if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
+               item.extract()
+        for item in soup.body.findAll('img'):
+            if not item.has_key('alt'):
+               item['alt'] = 'image'
+        for item in soup.body.findAll('ol'):
+            if item.has_key('class') and item['class'].startswith('grid'):
+               item.extract()
+        for item in soup.body.findAll('span'):
+            if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
+               item.extract()
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+               tstr = alink.string
+               alink.replaceWith(tstr)
        return soup

--- a/resources/recipes/technology_review.recipe
+++ b/resources/recipes/technology_review.recipe
@ -35,7 +35,6 @@ class TechnologyReview(BasicNewsRecipe):
    def get_article_url(self, article):
        return article.get('guid', article.get('id', None))

-
    def print_version(self, url):
        baseurl='http://www.technologyreview.com/printer_friendly_article.aspx?id='
        split1 = string.split(url,"/")
@ -43,3 +42,25 @@ class TechnologyReview(BasicNewsRecipe):
        split2= string.split(xxx,"/")
        s =  baseurl + split2[0]
        return s
+
+
+    def postprocess_html(self, soup, first_fetch):
+        '''
+        Strip page furniture from the printer-friendly article and flatten
+        its links to plain text.
+
+        :param soup: parsed article page; modified in place.
+        :param first_fetch: second positional argument supplied by the
+            caller; not used here. It was previously named ``True``, which
+            shadowed the builtin and made every ``soup.find(True, ...)``
+            below search by the flag's value instead of "any tag".
+        :return: the same ``soup`` object.
+        '''
+        # Header picture, close button and banner advertisement. Not every
+        # page is guaranteed to contain all three, so skip missing ones
+        # instead of calling replaceWith() on None.
+        for css_class in ('header', 'close', 'bannerad'):
+            tag = soup.find(True, {'class': css_class})
+            if tag is not None:
+                tag.replaceWith("")
+
+        #thanks kiklop74!  This code removes all links from the text
+        for alink in soup.findAll('a'):
+            if alink.string is not None:
+                alink.replaceWith(alink.string)
+
+        return soup
--- a/resources/recipes/wired_daily.recipe
+++ b/resources/recipes/wired_daily.recipe
@ -2,8 +2,10 @@
 __license__   = 'GPL v3'
 __docformat__ = 'restructuredtext en'

+import re

 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.chardet import xml_to_unicode

 class Wired_Daily(BasicNewsRecipe):

@ -15,30 +17,43 @@ class Wired_Daily(BasicNewsRecipe):

    no_stylesheets = True

+    preprocess_regexps = [(re.compile(r'<head.*</head>', re.DOTALL), lambda m:
+        '<head></head>')]
+
    remove_tags_before = dict(name='div', id='content')
-    remove_tags = [dict(id=['social_tools', 'outerWrapper', 'sidebar',
-        'footer', 'advertisement', 'blog_subscription_unit',
-        'brightcove_component']),
-        {'class':'entryActions'},
-        dict(name=['noscript', 'script'])]
+    remove_tags = [dict(id=['header', 'commenting_module', 'post_nav',
+        'social_tools', 'sidebar', 'footer', 'social_wishlist', 'pgwidget',
+        'outerWrapper', 'inf_widget']),
+        {'class':['entryActions', 'advertisement', 'entryTags']},
+        dict(name=['noscript', 'script']),
+        dict(name='h4', attrs={'class':re.compile(r'rat\d+')}),
+        {'class':lambda x: x and x.startswith('contentjump')},
+        dict(name='li', attrs={'class':['entryCategories', 'entryEdit']})]
+

    feeds = [
        ('Top News', 'http://feeds.wired.com/wired/index'),
-        ('Culture', 'http://feeds.wired.com/wired/culture'),
-        ('Software', 'http://feeds.wired.com/wired/software'),
-        ('Mac', 'http://feeds.feedburner.com/cultofmac/bFow'),
-        ('Gadgets', 'http://feeds.wired.com/wired/gadgets'),
-        ('Cars', 'http://feeds.wired.com/wired/cars'),
-        ('Entertainment', 'http://feeds.wired.com/wired/entertainment'),
-        ('Gaming', 'http://feeds.wired.com/wired/gaming'),
-        ('Science', 'http://feeds.wired.com/wired/science'),
-        ('Med Tech', 'http://feeds.wired.com/wired/medtech'),
-        ('Politics', 'http://feeds.wired.com/wired/politics'),
-        ('Tech Biz', 'http://feeds.wired.com/wired/techbiz'),
-        ('Commentary', 'http://feeds.wired.com/wired/commentary'),
+        ('Product Reviews',
+            'http://www.wired.com/reviews/feeds/latestProductsRss'),
+        ('Autopia', 'http://www.wired.com/autopia/feed/'),
+        ('Danger Room', 'http://www.wired.com/dangerroom/feed/'),
+        ('Epicenter', 'http://www.wired.com/epicenter/feed/'),
+        ('Gadget Lab', 'http://www.wired.com/gadgetlab/feed/'),
+        ('Geek Dad', 'http://www.wired.com/geekdad/feed/'),
+        ('Playbook', 'http://www.wired.com/playbook/feed/'),
+        ('Rawfile', 'http://www.wired.com/rawfile/feed/'),
+        ('This Day in Tech', 'http://www.wired.com/thisdayintech/feed/'),
+        ('Threat Level', 'http://www.wired.com/threatlevel/feed/'),
+        ('Underwire', 'http://www.wired.com/underwire/feed/'),
+        ('Web Monkey', 'http://www.webmonkey.com/feed/'),
+        ('Science', 'http://www.wired.com/wiredscience/feed/'),
        ]

+    def populate_article_metadata(self, article, soup, first):
+        '''
+        Resolve entities in the article's plain-text summary.
+
+        When ``article.text_summary`` is non-empty it is replaced by the
+        first element of the value returned by
+        ``xml_to_unicode(..., resolve_entities=True)``. ``soup`` and
+        ``first`` are accepted but not used here.
+        '''
+        if article.text_summary:
+            article.text_summary = xml_to_unicode(article.text_summary,
+                    resolve_entities=True)[0]
+
    def print_version(self, url):
-        return url.replace('http://www.wired.com/', 'http://www.wired.com/print/')
-
+        return url + '/all/1'

--- a/resources/recipes/zerohedge.recipe
+++ b/resources/recipes/zerohedge.recipe
@ -0,0 +1,33 @@
+__license__   = 'GPL v3'
+__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.zerohedge.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class ZeroHedge(BasicNewsRecipe):
+    # Settings-only recipe for www.zerohedge.com: no methods are
+    # overridden, everything is configured through class attributes.
+    title                  = 'Zero Hedge'
+    __author__             = 'Darko Miletic'
+    description            = 'On a long enough timeline the survival rate for everyone drops to zero'
+    oldest_article         = 10
+    max_articles_per_feed  = 100
+    no_stylesheets         = True
+    use_embedded_content   = True
+    encoding               = 'utf8'
+    publisher              = 'zero hedge'
+    category               = 'news, USA, world, economy, politics'
+    language               = 'en'
+    masthead_url           = 'http://www.zerohedge.com/themes/newsflash/logo.png'
+    publication_type       = 'blog'
+    extra_css              = 'body{ font-family: sans-serif }'
+
+    # Reuse the attributes above so the conversion metadata cannot drift
+    # from the recipe's own description/category/language/publisher.
+    conversion_options = {
+                             'comments' : description
+                            ,'tags'     : category
+                            ,'language' : language
+                            ,'publisher': publisher
+                         }
+
+
+    # Single (title, url) feed.
+    feeds = [(u'Articles', u'http://feeds.feedburner.com/zerohedge/feed')]
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -459,6 +459,18 @@ def force_unicode(obj, enc=preferred_encoding):
                        obj = obj.decode('utf-8')
    return obj

+def as_unicode(obj, enc=preferred_encoding):
+    """
+    Return a text representation of obj.
+
+    Byte strings are handed straight to force_unicode(). Anything else is
+    first converted with unicode(obj), falling back to str(obj) and then
+    repr(obj) if the previous conversion raises.
+
+    :param obj: the object to convert
+    :param enc: encoding passed through to force_unicode()
+    :return: the result of force_unicode() on the converted object
+    """
+    if not isbytestring(obj):
+        try:
+            obj = unicode(obj)
+        except Exception:
+            # Catch Exception rather than everything so KeyboardInterrupt
+            # and SystemExit still propagate.
+            try:
+                obj = str(obj)
+            except Exception:
+                obj = repr(obj)
+    return force_unicode(obj, enc=enc)
+
+

 def human_readable(size):
    """ Convert a size in bytes into a human readable form """
--- a/src/calibre/ebooks/metadata/rtf.py
+++ b/src/calibre/ebooks/metadata/rtf.py
@ -10,7 +10,8 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors
 title_pat    = re.compile(r'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
 author_pat   = re.compile(r'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
 comment_pat  = re.compile(r'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
-category_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+tags_pat = re.compile(r'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
+publisher_pat = re.compile(r'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)

 def get_document_info(stream):
    """
@ -82,61 +83,73 @@ def decode(raw, codec):

 def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
-    title, author, comment, category = None, None, None, None
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
-        return MetaInformation(None, None)
+        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
-    if title_match:
+    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
+    else:
+        title = _('Unknown')
    author_match = author_pat.search(block)
-    if author_match:
+    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
-    comment_match = comment_pat.search(block)
-    if comment_match:
-        comment = decode(comment_match.group(1).strip(), cpg)
-    category_match = category_pat.search(block)
-    if category_match:
-        category = decode(category_match.group(1).strip(), cpg)
-    mi = MetaInformation(title, author)
+    else:
+        author = None
+    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)
-    mi.comments = comment
-    mi.category = category
+
+    # Optional fields: only set on mi when the \info block contains them.
+    comment_match = comment_pat.search(block)
+    if comment_match is not None:
+        comment = decode(comment_match.group(1).strip(), cpg)
+        mi.comments = comment
+    tags_match = tags_pat.search(block)
+    if tags_match is not None:
+        tags = decode(tags_match.group(1).strip(), cpg)
+        # create_metadata()/set_metadata() write tags as one ', '-joined
+        # string, so split it back into a list of non-empty tag names
+        # instead of storing the raw string.
+        mi.tags = [t.strip() for t in tags.split(',') if t.strip()]
+    publisher_match = publisher_pat.search(block)
+    if publisher_match is not None:
+        publisher = decode(publisher_match.group(1).strip(), cpg)
+        mi.publisher = publisher
+
    return mi

-
 def create_metadata(stream, options):
-    md = r'{\info'
+    md = [r'{\info']
    if options.title:
        title = options.title.encode('ascii', 'ignore')
-        md += r'{\title %s}'%(title,)
+        md.append(r'{\title %s}'%(title,))
    if options.authors:
        au = options.authors
        if not isinstance(au, basestring):
            au = u', '.join(au)
        author = au.encode('ascii', 'ignore')
-        md += r'{\author %s}'%(author,)
-    if options.get('category', None):
-        category = options.category.encode('ascii', 'ignore')
-        md += r'{\category %s}'%(category,)
+        md.append(r'{\author %s}'%(author,))
    comp = options.comment if hasattr(options, 'comment') else options.comments
    if comp:
        comment = comp.encode('ascii', 'ignore')
-        md += r'{\subject %s}'%(comment,)
-    if len(md) > 6:
-        md += '}'
+        md.append(r'{\subject %s}'%(comment,))
+    if options.publisher:
+        publisher = options.publisher.encode('ascii', 'ignore')
+        md.append(r'{\manager %s}'%(publisher,))
+    if options.tags:
+        tags = u', '.join(options.tags)
+        tags = tags.encode('ascii', 'ignore')
+        md.append(r'{\category %s}'%(tags,))
+    if len(md) > 1:
+        md.append('}')
        stream.seek(0)
        src   = stream.read()
-        ans = src[:6] + md + src[6:]
+        ans = src[:6] + u''.join(md) + src[6:]
        stream.seek(0)
        stream.write(ans)

@ -156,7 +169,7 @@ def set_metadata(stream, options):

        base_pat = r'\{\\name(.*?)(?<!\\)\}'
        title = options.title
-        if title != None:
+        if title is not None:
            title = title.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'title'), re.DOTALL)
            if pat.search(src):
@ -164,7 +177,7 @@ def set_metadata(stream, options):
            else:
                src = add_metadata_item(src, 'title', title)
        comment = options.comments
-        if comment != None:
+        if comment is not None:
            comment = comment.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'subject'), re.DOTALL)
            if pat.search(src):
@ -172,7 +185,7 @@ def set_metadata(stream, options):
            else:
                src = add_metadata_item(src, 'subject', comment)
        author = options.authors
-        if author != None:
+        if author is not None:
            author =  ', '.join(author)
            author = author.encode('ascii', 'ignore')
            pat = re.compile(base_pat.replace('name', 'author'), re.DOTALL)
@ -180,14 +193,23 @@ def set_metadata(stream, options):
                src = pat.sub(r'{\\author ' + author + r'}', src)
            else:
                src = add_metadata_item(src, 'author', author)
-        category = options.get('category', None)
-        if category != None:
-            category = category.encode('ascii', 'replace')
+        tags = options.tags
+        if tags is not None:
+            tags =  ', '.join(tags)
+            tags = tags.encode('ascii', 'replace')
            pat = re.compile(base_pat.replace('name', 'category'), re.DOTALL)
            if pat.search(src):
-                src = pat.sub(r'{\\category ' + category + r'}', src)
+                src = pat.sub(r'{\\category ' + tags + r'}', src)
            else:
-                src = add_metadata_item(src, 'category', category)
+                src = add_metadata_item(src, 'category', tags)
+        publisher = options.publisher
+        if publisher is not None:
+            publisher = publisher.encode('ascii', 'replace')
+            pat = re.compile(base_pat.replace('name', 'manager'), re.DOTALL)
+            if pat.search(src):
+                src = pat.sub(r'{\\manager ' + publisher + r'}', src)
+            else:
+                src = add_metadata_item(src, 'manager', publisher)
        stream.seek(pos + olen)
        after = stream.read()
        stream.seek(pos)
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+__license__   = 'GPL v3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+

 '''
 Read content from txt file.
@ -10,10 +14,7 @@ from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
 from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
-
-__license__   = 'GPL v3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
+from calibre.utils.cleantext import clean_ascii_chars

 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

@ -33,9 +34,7 @@ def clean_txt(txt):
    # Remove excessive line breaks.
    txt = re.sub('\n{3,}', '\n\n', txt)
    #remove ASCII invalid chars : 0 to 8 and 11-14 to 24
-    chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19))
-    illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
-    txt = illegal_chars.sub('', txt)
+    txt = clean_ascii_chars(txt)

    return txt

--- a/src/calibre/gui2/catalog/catalog_bibtex.py
+++ b/src/calibre/gui2/catalog/catalog_bibtex.py
@ -27,14 +27,17 @@ class PluginWidget(QWidget, Ui_Form):
    def __init__(self, parent=None):
        QWidget.__init__(self, parent)
        self.setupUi(self)
-        from calibre.library.catalog import FIELDS
-        self.all_fields = []
-        for x in FIELDS :
-            if x != 'all':
-                self.all_fields.append(x)
-                QListWidgetItem(x, self.db_fields)

    def initialize(self, name, db): #not working properly to update
+        from calibre.library.catalog import FIELDS
+
+        self.all_fields = [x for x in FIELDS if x != 'all']
+        #add custom columns
+        self.all_fields.extend([x for x in sorted(db.custom_field_keys())])
+        #populate
+        for x in self.all_fields:
+            QListWidgetItem(x, self.db_fields)
+
        self.name = name
        fields = gprefs.get(name+'_db_fields', self.all_fields)
        # Restore the activated db_fields from last use
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@ -15,7 +15,7 @@ from calibre.ebooks.metadata import string_to_authors, authors_to_string
 from calibre.ebooks.metadata.book.base import composite_formatter
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.gui2.custom_column_widgets import populate_metadata_page
-from calibre.gui2 import error_dialog
+from calibre.gui2 import error_dialog, ResizableDialog
 from calibre.gui2.progress_indicator import ProgressIndicator
 from calibre.utils.config import dynamic
 from calibre.utils.titlecase import titlecase
@ -49,7 +49,7 @@ def get_cover_data(path):



-class MyBlockingBusy(QDialog):
+class MyBlockingBusy(QDialog): # {{{

    do_one_signal = pyqtSignal()

@ -241,8 +241,9 @@ class MyBlockingBusy(QDialog):
        self.current_index += 1
        self.do_one_signal.emit()

+    # }}}

-class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
+class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog):

    s_r_functions = {       ''              : lambda x: x,
                            _('Lower Case') : lambda x: icu_lower(x),
@ -261,9 +262,8 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
                        ]

    def __init__(self, window, rows, model, tab):
-        QDialog.__init__(self, window)
+        ResizableDialog.__init__(self, window)
        Ui_MetadataBulkDialog.__init__(self)
-        self.setupUi(self)
        self.model = model
        self.db = model.db
        self.ids = [self.db.id(r) for r in rows]
--- a/src/calibre/gui2/dialogs/metadata_bulk.ui
+++ b/src/calibre/gui2/dialogs/metadata_bulk.ui
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@ -823,7 +823,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
                                if book.series_index is not None:
                                    self.series_index.setValue(book.series_index)
                        if book.has_cover:
-                            if d.opt_auto_download_cover.isChecked() and book.has_cover:
+                            if d.opt_auto_download_cover.isChecked():
                                self.fetch_cover()
                            else:
                                self.fetch_cover_button.setFocus(Qt.OtherFocusReason)
--- a/src/calibre/gui2/layout.py
+++ b/src/calibre/gui2/layout.py
@ -8,9 +8,9 @@ __docformat__ = 'restructuredtext en'
 from functools import partial

 from PyQt4.Qt import QIcon, Qt, QWidget, QToolBar, QSize, \
-    pyqtSignal, QToolButton, QPushButton, \
-    QObject, QVBoxLayout, QSizePolicy, QLabel, QHBoxLayout, QActionGroup, \
-    QMenu
+    pyqtSignal, QToolButton, QMenu, QCheckBox, \
+    QObject, QVBoxLayout, QSizePolicy, QLabel, QHBoxLayout, QActionGroup
+

 from calibre.constants import __appname__
 from calibre.gui2.search_box import SearchBox2, SavedSearchBox
@ -178,7 +178,9 @@ class SearchBar(QWidget): # {{{
        x.setToolTip(_("<p>Search the list of books by title, author, publisher, tags, comments, etc.<br><br>Words separated by spaces are ANDed"))
        l.addWidget(x)

-        self.search_button = QPushButton(_('&Go!'))
+        self.search_button = QToolButton()
+        self.search_button.setToolButtonStyle(Qt.ToolButtonTextOnly)
+        self.search_button.setText(_('&Go!'))
        l.addWidget(self.search_button)
        self.search_button.setSizePolicy(QSizePolicy.Minimum,
                QSizePolicy.Minimum)
@ -192,6 +194,12 @@ class SearchBar(QWidget): # {{{
        l.addWidget(x)
        x.setToolTip(_("Reset Quick Search"))

+        x = parent.search_highlight_only = QCheckBox()
+        x.setText(_('&Highlight'))
+        x.setToolTip(_('Highlight matched books in the book list, instead '
+            'of restricting the book list to the matches.'))
+        l.addWidget(x)
+
        x = parent.saved_search = SavedSearchBox(self)
        x.setMaximumSize(QSize(150, 16777215))
        x.setMinimumContentsLength(15)
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@ -10,7 +10,7 @@ from contextlib import closing
 from operator import attrgetter

 from PyQt4.Qt import QAbstractTableModel, Qt, pyqtSignal, QIcon, QImage, \
-        QModelIndex, QVariant, QDate
+        QModelIndex, QVariant, QDate, QColor

 from calibre.gui2 import NONE, config, UNDEFINED_QDATE
 from calibre.utils.pyparsing import ParseException
@ -93,6 +93,9 @@ class BooksModel(QAbstractTableModel): # {{{
        self.bool_no_icon = QIcon(I('list_remove.png'))
        self.bool_blank_icon = QIcon(I('blank.png'))
        self.device_connected = False
+        self.rows_matching = set()
+        self.lowest_row_matching = None
+        self.highlight_only = False
        self.read_config()

    def change_alignment(self, colname, alignment):
@ -229,9 +232,27 @@ class BooksModel(QAbstractTableModel): # {{{
            self.endInsertRows()
            self.count_changed()

+    def set_highlight_only(self, toWhat):
+        '''
+        Store the highlight-only flag and re-run the last search, if any.
+
+        :param toWhat: New value for ``self.highlight_only``; it is stored
+                       unchanged and only its truthiness is tested by
+                       :meth:`search`.
+
+        When ``self.last_search`` is non-empty, ``self.research()`` is
+        called afterwards (presumably to re-apply that search under the
+        new mode -- research() is defined outside this hunk).
+        '''
+        self.highlight_only = toWhat
+        if not self.last_search:
+            # Nothing has been searched for, so there is nothing to redo
+            return
+        self.research()
+
    def search(self, text, reset=True):
        try:
-            self.db.search(text)
+            if self.highlight_only:
+                self.db.search('')
+                if not text:
+                    self.rows_matching = set()
+                    self.lowest_row_matching = None
+                else:
+                    self.rows_matching = self.db.search(text, return_matches=True)
+                    if self.rows_matching:
+                        self.lowest_row_matching = self.db.row(self.rows_matching[0])
+                    self.rows_matching = set(self.rows_matching)
+            else:
+                self.rows_matching = set()
+                self.lowest_row_matching = None
+                self.db.search(text)
        except ParseException as e:
            self.searched.emit(e.msg)
            return
@ -337,8 +358,9 @@ class BooksModel(QAbstractTableModel): # {{{
            name, val = mi.format_field(key)
            if mi.metadata_for_field(key)['datatype'] == 'comments':
                name += ':html'
-            if val:
+            if val and name not in data:
                data[name] = val
+
        return data


@ -651,6 +673,9 @@ class BooksModel(QAbstractTableModel): # {{{
            return NONE
        if role in (Qt.DisplayRole, Qt.EditRole):
            return self.column_to_dc_map[col](index.row())
+        elif role == Qt.BackgroundColorRole:
+            if self.id(index) in self.rows_matching:
+                return QColor('lightgreen')
        elif role == Qt.DecorationRole:
            if self.column_to_dc_decorator_map[col] is not None:
                return self.column_to_dc_decorator_map[index.column()](index.row())
--- a/src/calibre/gui2/library/views.py
+++ b/src/calibre/gui2/library/views.py
@ -680,8 +680,14 @@ class BooksView(QTableView): # {{{
    def set_editable(self, editable, supports_backloading):
        self._model.set_editable(editable)

+    def search_proxy(self, txt):
+        '''
+        Run a search on the model, select the row the model reports in
+        ``lowest_row_matching`` (when it is not None) and give this view
+        the keyboard focus.
+
+        :param txt: The search text; handed unchanged to the model's
+                    ``search()``.
+        '''
+        model = self._model
+        model.search(txt)
+        first_row = model.lowest_row_matching
+        if first_row is not None:
+            # The value is handed over as a row number, not as a book id
+            self.select_rows([first_row], using_ids=False)
+        self.setFocus(Qt.OtherFocusReason)
+
    def connect_to_search_box(self, sb, search_done):
-        sb.search.connect(self._model.search)
+        sb.search.connect(self.search_proxy)
        self._search_done = search_done
        self._model.searched.connect(self.search_done)

--- a/src/calibre/gui2/preferences/toolbar.py
+++ b/src/calibre/gui2/preferences/toolbar.py
@ -37,7 +37,10 @@ class BaseModel(QAbstractListModel):
                    dont_remove_from=set(['toolbar-device']))
        if name is None:
            return FakeAction('--- '+_('Separator')+' ---', None)
-        return gui.iactions[name]
+        try:
+            return gui.iactions[name]
+        except:
+            return None

    def rowCount(self, parent):
        return len(self._data)
@ -124,7 +127,8 @@ class CurrentModel(BaseModel):
        BaseModel.__init__(self)
        self.gprefs_name = 'action-layout-'+key
        current = gprefs[self.gprefs_name]
-        self._data =  [self.name_to_action(x, gui) for x in current]
+        self._data = [self.name_to_action(x, gui) for x in current]
+        self._data = [x for x in self._data if x is not None]
        self.key = key
        self.gui = gui

--- a/src/calibre/gui2/search_box.py
+++ b/src/calibre/gui2/search_box.py
@ -16,6 +16,7 @@ from calibre.gui2 import config
 from calibre.gui2.dialogs.confirm_delete import confirm
 from calibre.gui2.dialogs.saved_search_editor import SavedSearchEditor
 from calibre.gui2.dialogs.search import SearchDialog
+from calibre.utils.config import dynamic
 from calibre.utils.search_query_parser import saved_searches
 from calibre.utils.icu import sort_key

@ -375,6 +376,9 @@ class SearchBoxMixin(object): # {{{
            unicode(self.search.toolTip())))
        self.advanced_search_button.setStatusTip(self.advanced_search_button.toolTip())
        self.clear_button.setStatusTip(self.clear_button.toolTip())
+        self.search_highlight_only.stateChanged.connect(self.highlight_only_changed)
+        self.search_highlight_only.setChecked(
+                            dynamic.get('search_highlight_only', False))

    def focus_search_box(self, *args):
        self.search.setFocus(Qt.OtherFocusReason)
@ -401,6 +405,11 @@ class SearchBoxMixin(object): # {{{
    def focus_to_library(self):
        self.current_view().setFocus(Qt.OtherFocusReason)

+    def highlight_only_changed(self, toWhat):
+        '''
+        Slot for the ``stateChanged`` signal of the highlight checkbox.
+
+        Saves the new state under ``'search_highlight_only'`` in
+        ``dynamic``, passes it to the model of the current view and then
+        moves the focus to the library view.
+
+        :param toWhat: The value delivered by the signal; stored and
+                       forwarded unchanged.
+        '''
+        dynamic.set('search_highlight_only', toWhat)
+        model = self.current_view().model()
+        model.set_highlight_only(toWhat)
+        self.focus_to_library()
+
    # }}}

 class SavedSearchBoxMixin(object): # {{{
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@ -26,6 +26,7 @@ from calibre.gui2.search_box import SearchBox2
 from calibre.ebooks.metadata import MetaInformation
 from calibre.customize.ui import available_input_formats
 from calibre.gui2.viewer.dictionary import Lookup
+from calibre import as_unicode

 class TOCItem(QStandardItem):

@ -632,7 +633,7 @@ class EbookViewer(MainWindow, Ui_EbookViewer):
            else:
                r = getattr(worker.exception, 'reason', worker.exception)
                error_dialog(self, _('Could not open ebook'),
-                        unicode(r), det_msg=worker.traceback, show=True)
+                        as_unicode(r), det_msg=worker.traceback, show=True)
            self.close_progress_indicator()
        else:
            self.metadata.show_opf(self.iterator.opf, os.path.splitext(pathtoebook)[1][1:])
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@ -411,7 +411,8 @@ class ResultCache(SearchQueryParser): # {{{
            if isinstance(location, list):
                if allow_recursion:
                    for loc in location:
-                        matches |= self.get_matches(loc, query, allow_recursion=False)
+                        matches |= self.get_matches(loc, query, candidates,
+                                                    allow_recursion=False)
                    return matches
                raise ParseException(query, len(query), 'Recursive query group detected', self)

@ -419,11 +420,11 @@ class ResultCache(SearchQueryParser): # {{{
                fm = self.field_metadata[location]
                # take care of dates special case
                if fm['datatype'] == 'datetime':
-                    return self.get_dates_matches(location, query.lower())
+                    return self.get_dates_matches(location, query.lower(), candidates)

                # take care of numbers special case
                if fm['datatype'] in ('rating', 'int', 'float'):
-                    return self.get_numeric_matches(location, query.lower())
+                    return self.get_numeric_matches(location, query.lower(), candidates)

                # take care of the 'count' operator for is_multiples
                if fm['is_multiple'] and \
@ -431,7 +432,8 @@ class ResultCache(SearchQueryParser): # {{{
                        query[1:1] in '=<>!':
                    vf = lambda item, loc=fm['rec_index'], ms=fm['is_multiple']:\
                            len(item[loc].split(ms)) if item[loc] is not None else 0
-                    return self.get_numeric_matches(location, query[1:], val_func=vf)
+                    return self.get_numeric_matches(location, query[1:],
+                                                    candidates, val_func=vf)

            # everything else, or 'all' matches
            matchkind = CONTAINS_MATCH
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@ -1531,10 +1531,23 @@ class EPUB_MOBI(CatalogPlugin):
                                        self.opts.header_note_source_field,
                                        index_is_id=True)
                    if notes:
-                        if field_md['datatype'] == 'text' and isinstance(notes,list):
-                            notes = ' &middot; '.join(notes)
+                        if field_md['datatype'] == 'text':
+                            print "\n inner notes: %s\n" % repr(notes)
+                            if isinstance(notes,list):
+                                notes = ' &middot; '.join(notes)
                        elif field_md['datatype'] == 'datetime':
                            notes = format_date(notes,'dd MMM yyyy')
+                        elif field_md['datatype'] == 'composite':
+                            if re.match(r'\[(.*)\]$', notes):
+                                # Sniff for special pseudo-list string "[<item, item>]"
+                                bracketed_content = re.match(r'\[(.*)\]$', notes).group(1)
+                                if re.search(',',bracketed_content):
+                                    # Recast the comma-separated items as a list
+                                    items = bracketed_content.split(',')
+                                    items = [i.strip() for i in items]
+                                    notes = ' &middot; '.join(items)
+                                else:
+                                    notes = bracketed_content
                        this_title['notes'] = {'source':field_md['name'],
                                                   'content':notes}

--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@ -341,10 +341,6 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        self.has_id  = self.data.has_id
        self.count   = self.data.count

-        # Count times get_metadata is called, and how many times in the cache
-        self.gm_count  = 0
-        self.gm_missed = 0
-
        for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn',
                     'publisher', 'rating', 'series', 'series_index', 'tags',
                     'title', 'timestamp', 'uuid', 'pubdate', 'ondevice'):
@ -710,6 +706,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
        formats = row[fm['formats']]
        if not formats:
            formats = None
+        else:
+            formats = formats.split(',')
        mi.formats = formats
        tags = row[fm['tags']]
        if tags:
--- a/src/calibre/trac/bzr_commit_plugin.py
+++ b/src/calibre/trac/bzr_commit_plugin.py
@ -110,6 +110,7 @@ class cmd_commit(_cmd_commit):
            suffix = 'The fix will be in the next release.'
        action = action+'ed'
        msg = '%s in branch %s. %s'%(action, nick, suffix)
+        msg = msg.replace('Fixesed', 'Fixed')
        server = xmlrpclib.ServerProxy(url)
        server.ticket.update(int(bug), msg,
                             {'status':'closed', 'resolution':'fixed'},
--- a/src/calibre/utils/cleantext.py
+++ b/src/calibre/utils/cleantext.py
@ -3,7 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 __docformat__ = 'restructuredtext en'

-import re
+import re, htmlentitydefs

 _ascii_pat = None

@ -21,3 +21,32 @@ def clean_ascii_chars(txt, charlist=None):
        pat = re.compile(u'|'.join(map(unichr, charlist)))
    return pat.sub('', txt)

+##
+# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
+# Removes HTML or XML character references and entities from a text string.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+
+def unescape(text, rm=False, rchar=u''):
+    '''
+    Decode HTML/XML character references and named entities in `text`.
+
+    :param text: The HTML (or XML) source text.
+    :param rm: When False (the default) anything that cannot be decoded is
+               left in the text as is. When True, character references
+               that cannot be decoded and *all* named entities are
+               replaced by `rchar`; character references that decode
+               successfully are still returned as their character.
+    :param rchar: The replacement used when `rm` is True.
+    :return: The processed text.
+
+    NOTE(review): with ``rm=True`` a known entity such as ``&amp;`` is
+    replaced by `rchar` while ``&#38;`` is decoded -- confirm that this
+    asymmetry is intended before relying on it.
+    '''
+    def fixup(m, rm=rm, rchar=rchar):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except (ValueError, OverflowError):
+                # ValueError: bad digits or code point out of range.
+                # OverflowError: number too large to be converted at all,
+                # e.g. "&#99999999999999999999999;" in malformed input.
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        if rm:
+            return rchar #replace by char
+        return text # leave as is
+    return re.sub(r"&#?\w+;", fixup, text)
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@ -18,6 +18,24 @@ class _Parser(object):
    LEX_NUM = 4
    LEX_EOF = 5

+    def _python(self, func):
+        '''
+        Template function ``python``: execute the source text in `func`
+        and return the result of the ``evaluate`` function it defines.
+
+        :param func: Python source that must define ``evaluate``; that
+                     function is called with ``self.parent.kwargs`` as its
+                     only argument.
+        :return: The value returned by ``evaluate``. ints, floats and byte
+                 strings are converted with ``unicode()``, lists are
+                 joined with commas, anything else is returned unchanged.
+
+        A missing ``evaluate`` function and any exception raised while
+        calling it are reported through ``self.error()``.
+        '''
+        # NOTE(review): exec runs arbitrary code taken from the template;
+        # templates must therefore come from a trusted source.
+        namespace = {}
+        exec(func, namespace)
+        if 'evaluate' not in namespace:
+            self.error('no evaluate function in python')
+        try:
+            result = namespace['evaluate'](self.parent.kwargs)
+            if isinstance(result, (float, int)):
+                result = unicode(result)
+            elif isinstance(result, list):
+                result = ','.join(result)
+            elif isinstance(result, str):
+                result = unicode(result)
+            return result
+        except Exception as e:
+            # Exceptions in general have no .msg attribute; reading it here
+            # would replace the real failure with an AttributeError.
+            try:
+                detail = unicode(e)
+            except UnicodeError:
+                # Message holds non-ASCII bytes that cannot be decoded
+                detail = repr(e)
+            self.error('python function threw exception: ' + detail)
+
+
    def _strcmp(self, x, y, lt, eq, gt):
        v = strcmp(x, y)
        if v < 0:
@ -79,6 +97,7 @@ class _Parser(object):
            'field'    : (1, lambda s, x: s.parent.get_value(x, [], s.parent.kwargs)),
            'multiply' : (2, partial(_math, op='*')),
            'print'    : (-1, _print),
+            'python'   : (1, _python),
            'strcat'   : (-1, _concat),
            'strcmp'   : (5, _strcmp),
            'substr'   : (3, lambda s, x, y, z: x[int(y): len(x) if int(z) == 0 else int(z)]),
@ -362,7 +381,7 @@ class TemplateFormatter(string.Formatter):
                (r'\'.*?((?<!\\)\')',   lambda x,t: (3, t[1:-1])),
                (r'\n#.*?(?=\n)',       None),
                (r'\s',                 None)
-        ])
+        ], flags=re.DOTALL)

    def _eval_program(self, val, prog):
        # keep a cache of the lex'ed program under the theory that re-lexing