merge from trunk, minor preprocess tweaks

2025-12-14 09:05:16 -05:00 · 2011-01-13 17:02:44 +08:00 · 2011-01-13 17:02:44 +08:00 · 8d02ad93a7
commit 8d02ad93a7
parent 9832b7118b 187331af81
18 changed files with 370 additions and 68 deletions
--- a/resources/recipes/cicero.recipe
+++ b/resources/recipes/cicero.recipe
@ -0,0 +1,35 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 class Cicero(BasicNewsRecipe):
    '''
    Calibre news recipe for the German magazine Cicero (www.cicero.de).
    Articles are collected from the site's RSS feeds and downloaded in
    their print layout (see print_version).
    '''
    timefmt               = ' [%Y-%m-%d]'
    title                 = u'Cicero'
    __author__            = 'mad@sharktooth.de'
    description           = u'Magazin f\xfcr politische Kultur'
    oldest_article        = 7
    language              = 'de'
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    publisher             = 'Ringier Publishing'
    category              = 'news, politics, Germany'
    encoding              = 'iso-8859-1'
    publication_type      = 'magazine'
    masthead_url          = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
    # Only the two aggregate feeds are enabled. The commented-out entries
    # are the individual section/column feeds; enable them instead of the
    # aggregate ones to restrict the download to specific sections.
    feeds                 = [
        (u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
        #(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
        #(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
        #(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
        #(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
        #(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
        #(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
        #(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
        #(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
        #(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
        (u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
        #(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
        #(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
    ]
    def print_version(self, url):
        '''
        Return the print-page URL for the article at `url`.

        Everything after the last '?' in `url` is appended to the
        page_print.php address. If `url` contains no '?', the whole of
        `url` is appended.
        '''
        return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
--- a/resources/recipes/cnetjapan.recipe
+++ b/resources/recipes/cnetjapan.recipe
@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
                      (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
                        ]
    language       = 'ja'
-    encoding       = 'Shift_JIS'
+    encoding       = 'utf-8'
    remove_javascript = True
    preprocess_regexps = [
--- a/resources/recipes/el_correo.recipe
+++ b/resources/recipes/el_correo.recipe
@ -0,0 +1,122 @@
 #!/usr/bin/env  python
 __license__     = 'GPL v3'
 __copyright__   = '08 January 2011, desUBIKado'
 __author__      = 'desUBIKado'
 __description__ = 'Daily newspaper from Biscay'
 __version__     = 'v0.08'
 __date__        = '08, January 2011'
 '''
 [url]http://www.elcorreo.com/[/url]
 '''
 import time
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class heraldo(BasicNewsRecipe):
    '''
    Calibre news recipe for El Correo, built from the newspaper's
    Vizcaya RSS feeds.
    '''
    # NOTE(review): the class name is the same as the one in
    # heraldo.recipe; presumably it was carried over from that recipe.
    # It is kept so the recipe's public name does not change.
    __author__            = 'desUBIKado'
    description           = 'Daily newspaper from Biscay'
    title                 = u'El Correo'
    publisher             = 'Vocento'
    category              = 'News, politics, culture, economy, general interest'
    oldest_article        = 2
    delay                 = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    language              = 'es'
    timefmt               = '[%a, %d %b, %Y]'
    encoding              = 'iso-8859-1'
    remove_empty_feeds    = True
    # Presumably left enabled because preprocess_regexps below rewrites
    # fragments of the page's inline scripts -- TODO confirm.
    remove_javascript     = False
    feeds              = [
                           (u'Portada',       u'http://www.elcorreo.com/vizcaya/portada.xml'),
                           (u'Local',         u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
                           (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
                           (u'Econom\xeda',   u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
                           (u'Pol\xedtica',   u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
                           (u'Opini\xf3n',    u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
                           (u'Deportes',      u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
                           (u'Sociedad',      u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
                           (u'Cultura',       u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
                           (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
                           (u'Gente',         u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml')
                         ]
    keep_only_tags     = [
                          dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
                          dict(name='div' , attrs={'id':['articulo','story-texto','story-entradilla']})
                         ]
    remove_tags        = [
                          dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
                          dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
                          dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
                          dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
                          dict(name='div', attrs={'class':['modulo-especial','publiEspecial']}),
                          dict(name='div', attrs={'id':['articulopina']}),
                          dict(name='br', attrs={'class':'clear'}),
                          dict(name='form', attrs={'name':'frm_conversor2'})
                         ]
    remove_tags_before = dict(name='div' , attrs={'class':'articulo  '})
    remove_tags_after  = dict(name='div' , attrs={'class':'comentarios'})
    def get_cover_url(self):
        '''
        Return the URL to use as the cover image.

        The URL of today's front-page PDF is built from the local date
        (day, month, year, zero padded) and opened once to check that it
        is reachable. If opening it fails, the failure is logged and the
        URL of a static newspaper logo is returned instead.
        '''
        # Front page address pattern, e.g.
        #   http://info.elcorreo.com/pdf/06012011-viz.pdf
        # An alternative source that is not used here:
        #   http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
        cover = 'http://info.elcorreo.com/pdf/' + time.strftime('%d%m%Y') + '-viz.pdf'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Any download problem means falling back to the logo; unlike
            # a bare except this lets KeyboardInterrupt/SystemExit through.
            # The log message reads "front page not available".
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover
    extra_css = '''
                    h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
                    h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
                    h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
                    h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
                    img{margin-bottom: 0.4em}
                '''
    # Each entry is (compiled pattern, replacement function) and is applied
    # to the raw page source.
    preprocess_regexps = [
                           # To present the image of the embedded video
                           (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
                           # The dot is escaped so only a literal ".jpg" is rewritten
                           (re.compile(r'\.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
                           (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),
                           # To separate paragraphs with a blank line
                           (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),
                           # To put a blank line between the subtitle and the date and time of the news
                           (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),
                           # To put a blank line between the intro of the embedded videos and the previous text
                           (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),
                           # To view photos from the first when these are presented as a gallery
                           (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
                           (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),
                           # To remove the link of the title
                           (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
                           (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),
                         ]
--- a/resources/recipes/heraldo.recipe
+++ b/resources/recipes/heraldo.recipe
@ -3,13 +3,14 @@ __license__     = 'GPL v3'
 __copyright__   = '04 December 2010, desUBIKado'
 __author__      = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__     = 'v0.03'
+__version__     = 'v0.04'
-__date__        = '11, December 2010'
+__date__        = '6, January 2011'
 '''
 [url]http://www.heraldo.es/[/url]
 '''
 import time
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class heraldo(BasicNewsRecipe):
@ -20,12 +21,13 @@ class heraldo(BasicNewsRecipe):
    category       = 'News, politics, culture, economy, general interest'
    language       = 'es'
    timefmt        = '[%a, %d %b, %Y]'
-    oldest_article = 1
+    oldest_article = 2
    delay          = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    remove_javascript = True
    no_stylesheets = True
-    recursion      = 10
+
    feeds          = [
                        (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
@ -37,7 +39,8 @@ class heraldo(BasicNewsRecipe):
    remove_tags        = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
                          dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
-                          dict(name='form', attrs={'class':'form'})]
+                          dict(name='form', attrs={'class':'form'}),
                          dict(name='ul', attrs={'id':['cont-tags','pag-1']})]
    remove_tags_before = dict(name='div' , attrs={'id':'dts'})
    remove_tags_after  = dict(name='div' , attrs={'id':'com'})
@ -59,7 +62,16 @@ class heraldo(BasicNewsRecipe):
       return cover
    extra_css = '''
-                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+                    .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
                    .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
                    .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
                    .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
                    img{margin-bottom: 0.4em}
                '''
    preprocess_regexps = [
 # To separate the comments with a blank line
                           (re.compile(r'<div id="com"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div id="com"')
                         ]
--- a/resources/recipes/tyzden.recipe
+++ b/resources/recipes/tyzden.recipe
@ -0,0 +1,80 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2011, Miroslav Vasko zemiak@gmail.com'
 '''
 .tyzden, a weekly news magazine (a week old issue)
 '''
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import date
 import re
 class TyzdenRecipe(BasicNewsRecipe):
    '''
    Calibre news recipe for the Slovak weekly .tyzden.

    The issue to download is addressed by year and ISO week number, both
    computed once when the class body is executed. The issue index page
    is scanned for section links and the article links that follow them.
    '''
    __license__  = 'GPL v3'
    __author__ = 'zemiak'
    language = 'sk'
    version = 1
    publisher = u'www.tyzden.sk'
    category = u'Magazine'
    description = u'A conservative weekly magazine. The latest free issue'
    today = date.today()
    iso = today.isocalendar()
    year = iso[0]
    weeknum = iso[1]
    # Step back to the previous week's issue. In week 1 the number is left
    # unchanged, so the current week's issue is requested instead.
    if weeknum > 1:
        weeknum -= 1
    title = u'.tyzden ' + str(weeknum) + '/' + str(year)
    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
    base_url = base_url_path + '.html'
    oldest_article = 20
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content    = False
    no_stylesheets = True
    keep_only_tags = [
        dict(name = 'h1'),
        dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}),
        dict(name = 'div', attrs = {'class': 'text_block'}),
    ]
    remove_tags_after = [dict(name = 'div', attrs = {'class': 'text_block'})]
    def find_sections(self):
        '''
        Yield (section title, anchor tag) for every section link on the
        issue index page.

        As a side effect, self.cover_url is set when the index page has an
        <img> inside a <div class="foto">.
        '''
        soup = self.index_to_soup(self.base_url)
        # find cover pic
        imgdiv = soup.find('div', attrs = {'class': 'foto'})
        if imgdiv is not None:
            img = imgdiv.find('img')
            if img is not None:
                self.cover_url = 'http://www.tyzden.sk/' + img['src']
        # end find cover pic
        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
            yield (self.tag_to_string(s), s)
    def find_articles(self, soup):
        '''
        Yield one article dict for each <a> that follows `soup` in the
        document, stopping at the first link whose href does not start
        with 'casopis/'.

        `soup` is the section anchor tag produced by find_sections.
        '''
        for art in soup.findAllNext('a'):
            # An anchor without an href ends the run of article links
            # instead of raising KeyError and aborting the whole download.
            url = art.get('href', '')
            if not url.startswith('casopis/'):
                break
            title = self.tag_to_string(art)
            yield {
                    'title': title, 'url':self.base_url_path + '/' + url, 'description':title,
                    'date' : strftime('%a, %d %b'),
                    }
    def parse_index(self):
        '''
        Return the list of (section title, list of article dicts) for the
        issue, one entry per section found on the index page.
        '''
        return [(title, list(self.find_articles(soup)))
                for title, soup in self.find_sections()]
--- a/setup/build_environment.py
+++ b/setup/build_environment.py
@ -117,7 +117,6 @@ if iswindows:
    poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
            r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[1]+r'\qt4']
    poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
    popplerqt4_lib_dirs = poppler_lib_dirs
    poppler_libs = ['poppler']
@ -131,7 +130,6 @@ elif isosx:
    fc_lib = '/sw/lib'
    poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
            '/sw/build/poppler-0.14.5/poppler:/sw/build/poppler-0.14.5')
    popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
    poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
            '/sw/lib')
    poppler_libs = ['poppler']
@ -150,9 +148,6 @@ else:
    # Include directories
    poppler_inc_dirs = pkgconfig_include_dirs('poppler',
        'POPPLER_INC_DIR', '/usr/include/poppler')
    popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
    if not popplerqt4_inc_dirs:
        popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
    png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
        '/usr/include')
    magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
@ -187,20 +182,17 @@ if not poppler_inc_dirs or not os.path.exists(
    poppler_error = \
    ('Poppler not found on your system. Various PDF related',
    ' functionality will not work. Use the POPPLER_INC_DIR and',
-    ' POPPLER_LIB_DIR environment variables.')
+    ' POPPLER_LIB_DIR environment variables. calibre requires '
-
+    ' the poppler XPDF headers. If your distro does not '
-popplerqt4_error = None
+    ' include them you will have to re-compile poppler '
-if not popplerqt4_inc_dirs or not os.path.exists(
+    ' by hand with --enable-xpdf-headers')
        os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
    popplerqt4_error = \
            ('Poppler Qt4 bindings not found on your system.')
 magick_error = None
 if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
    'wand')):
    magick_error = ('ImageMagick not found on your system. '
            'Try setting the environment variables MAGICK_INC '
-            'and MAGICK_LIB to help calibre locate the inclue and libbrary '
+            'and MAGICK_LIB to help calibre locate the include and library '
            'files.')
 podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
--- a/src/calibre/devices/android/driver.py
+++ b/src/calibre/devices/android/driver.py
@ -53,6 +53,9 @@ class ANDROID(USBMS):
            # LG
            0x1004 : { 0x61cc : [0x100] },
            # Archos
            0x0e79 : { 0x1420 : [0x0216]},
            }
    EBOOK_DIR_MAIN = ['eBooks/import', 'wordplayer/calibretransfer', 'Books']
    EXTRA_CUSTOMIZATION_MESSAGE = _('Comma separated list of directories to '
@ -61,18 +64,19 @@ class ANDROID(USBMS):
    EXTRA_CUSTOMIZATION_DEFAULT = ', '.join(EBOOK_DIR_MAIN)
    VENDOR_NAME      = ['HTC', 'MOTOROLA', 'GOOGLE_', 'ANDROID', 'ACER',
-            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE']
+            'GT-I5700', 'SAMSUNG', 'DELL', 'LINUX', 'GOOGLE', 'ARCHOS']
    WINDOWS_MAIN_MEM = ['ANDROID_PHONE', 'A855', 'A853', 'INC.NEXUS_ONE',
            '__UMS_COMPOSITE', '_MB200', 'MASS_STORAGE', '_-_CARD', 'SGH-I897',
            'GT-I9000', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID',
            'SCH-I500_CARD', 'SPH-D700_CARD', 'MB810', 'GT-P1000', 'DESIRE',
-            'SGH-T849', '_MB300']
+            'SGH-T849', '_MB300', 'A70S']
    WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897',
-            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD']
+            'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD',
            'A70S']
-    OSX_MAIN_MEM = 'HTC Android Phone Media'
+    OSX_MAIN_MEM = 'Android Device Main Memory'
-    MAIN_MEMORY_VOLUME_LABEL  = 'Android Phone Internal Memory'
+    MAIN_MEMORY_VOLUME_LABEL  = 'Android Device Main Memory'
    SUPPORTS_SUB_DIRS = True
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@ -76,12 +76,23 @@ class PRS505(USBMS):
                'sending DRMed books in which you cannot change the cover.'
                ' WARNING: This option should only be used with newer '
                'SONY readers: 350, 650, 950 and newer.'),
            _('Refresh separate covers when using automatic management (newer readers)') +
                ':::' +
                _('Set this option to have separate book covers uploaded '
                  'every time you connect your device. Unset this option if '
                  'you have so many books on the reader that performance is '
                  'unacceptable.')
    ]
    EXTRA_CUSTOMIZATION_DEFAULT = [
                ', '.join(['series', 'tags']),
                False,
                False
    ]
    OPT_COLLECTIONS    = 0
    OPT_UPLOAD_COVERS  = 1
    OPT_REFRESH_COVERS = 2
    plugboard = None
    plugboard_func = None
@ -171,7 +182,7 @@ class PRS505(USBMS):
        opts = self.settings()
        if opts.extra_customization:
            collections = [x.strip() for x in
-                    opts.extra_customization[0].split(',')]
+                    opts.extra_customization[self.OPT_COLLECTIONS].split(',')]
        else:
            collections = []
        debug_print('PRS505: collection fields:', collections)
@ -183,6 +194,20 @@ class PRS505(USBMS):
        c.update(blists, collections, pb)
        c.write()
        if opts.extra_customization[self.OPT_REFRESH_COVERS]:
            debug_print('PRS505: uploading covers in sync_booklists')
            for idx,bl in blists.items():
                prefix = self._card_a_prefix if idx == 1 else \
                                self._card_b_prefix if idx == 2 \
                                    else self._main_prefix
                for book in bl:
                    p = os.path.join(prefix, book.lpath)
                    self._upload_cover(os.path.dirname(p),
                                      os.path.splitext(os.path.basename(p))[0],
                                      book, p)
        else:
            debug_print('PRS505: NOT uploading covers in sync_booklists')
        USBMS.sync_booklists(self, booklists, end_session=end_session)
        debug_print('PRS505: finished sync_booklists')
@ -199,11 +224,14 @@ class PRS505(USBMS):
    def upload_cover(self, path, filename, metadata, filepath):
        opts = self.settings()
-        if not opts.extra_customization[1]:
+        if not opts.extra_customization[self.OPT_UPLOAD_COVERS]:
            # Building thumbnails disabled
-            debug_print('PRS505: not uploading covers')
+            debug_print('PRS505: not uploading cover')
            return
-        debug_print('PRS505: uploading covers')
+        debug_print('PRS505: uploading cover')
        self._upload_cover(path, filename, metadata, filepath)
    def _upload_cover(self, path, filename, metadata, filepath):
        if metadata.thumbnail and metadata.thumbnail[-1]:
            path = path.replace('/', os.sep)
            is_main = path.startswith(self._main_prefix)
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -480,7 +480,7 @@ class HTMLPreProcessor(object):
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )
        for rule in self.PREPROCESS + start_rules:
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -186,7 +186,7 @@ class PreProcessor(object):
    def punctuation_unwrap(self, length, content, format):
        # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
                        u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            if self.opts.preprocess_html:
                preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-                res = preprocessor(res)
+                res = preprocessor(res.decode('utf-8')).encode('utf-8')
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@ -219,6 +219,10 @@ class TXTMLizer(object):
        if tag in SPACE_TAGS:
            text.append(u' ')
        # Scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)
--- a/src/calibre/gui2/preferences/plugboard.py
+++ b/src/calibre/gui2/preferences/plugboard.py
@ -5,11 +5,11 @@ __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-from PyQt4 import QtGui
+from PyQt4.Qt import Qt, QLineEdit, QComboBox, SIGNAL, QListWidgetItem
 from PyQt4.Qt import Qt
 from calibre.gui2 import error_dialog
 from calibre.gui2.device import device_name_for_plugboards
 from calibre.gui2.dialogs.template_dialog import TemplateDialog
 from calibre.gui2.preferences import ConfigWidgetBase, test_widget
 from calibre.gui2.preferences.plugboard_ui import Ui_Form
 from calibre.customize.ui import metadata_writers, device_plugins
@ -17,6 +17,27 @@ from calibre.library.save_to_disk import plugboard_any_format_value, \
                        plugboard_any_device_value, plugboard_save_to_disk_value
 from calibre.utils.formatter import validation_formatter
 class LineEditWithTextBox(QLineEdit):
    '''
    A QLineEdit whose context menu has an additional 'Open Editor' entry
    that edits the current text in a TemplateDialog.
    '''
    def contextMenuEvent(self, event):
        # Start from the standard menu and append the extra entry below a
        # separator.
        context_menu = self.createStandardContextMenu()
        context_menu.addSeparator()
        editor_action = context_menu.addAction(_('Open Editor'))
        self.connect(editor_action, SIGNAL('triggered()'), self.open_editor)
        context_menu.exec_(event.globalPos())
    def open_editor(self):
        # Show the dialog on the current text; the text is replaced only
        # when exec_() returns a true value.
        dialog = TemplateDialog(self, self.text())
        if not dialog.exec_():
            return
        self.setText(dialog.textbox.toPlainText())
 class ConfigWidget(ConfigWidgetBase, Ui_Form):
    def genesis(self, gui):
@ -72,10 +93,10 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
        self.source_widgets = []
        self.dest_widgets = []
        for i in range(0, len(self.dest_fields)-1):
-            w = QtGui.QLineEdit(self)
+            w = LineEditWithTextBox(self)
            self.source_widgets.append(w)
            self.fields_layout.addWidget(w, 5+i, 0, 1, 1)
-            w = QtGui.QComboBox(self)
+            w = QComboBox(self)
            self.dest_widgets.append(w)
            self.fields_layout.addWidget(w, 5+i, 1, 1, 1)
@ -297,7 +318,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
                for op in self.current_plugboards[f][d]:
                    ops.append('([' + op[0] + '] -> ' + op[1] + ')')
                txt = '%s:%s = %s\n'%(f, d, ', '.join(ops))
-                item = QtGui.QListWidgetItem(txt)
+                item = QListWidgetItem(txt)
                item.setData(Qt.UserRole, (f, d))
                self.existing_plugboards.addItem(item)
        self.refilling = False
--- a/src/calibre/library/custom_columns.py
+++ b/src/calibre/library/custom_columns.py
@ -151,6 +151,8 @@ class CustomColumns(object):
            return v
        def adapt_number(x, d):
            if x is None:
                return None
            if isinstance(x, (str, unicode, bytes)):
                if x.lower() == 'none':
                    return None
@ -195,8 +197,8 @@ class CustomColumns(object):
            data = self.custom_column_num_map[num]
        row = self.data._data[idx] if index_is_id else self.data[idx]
        ans = row[self.FIELD_MAP[data['num']]]
-        if ans and data['is_multiple'] and data['datatype'] == 'text':
+        if data['is_multiple'] and data['datatype'] == 'text':
-            ans = ans.split('|')
+            ans = ans.split('|') if ans else []
            if data['display'].get('sort_alpha', False):
                ans.sort(cmp=lambda x,y:cmp(x.lower(), y.lower()))
        return ans
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@ -756,7 +756,7 @@ class BrowseServer(object):
        sort = self.browse_sort_book_list(items, list_sort)
        ids = [x[0] for x in items]
        html = render_book_list(ids, self.opts.url_prefix,
-                suffix=_('in search')+': '+query)
+                suffix=_('in search')+': '+xml(query))
        return self.browse_template(sort, category=False, initial_search=query).format(
                title=_('Matching books'),
                script='booklist();', main=html)
--- a/src/calibre/library/sqlite.py
+++ b/src/calibre/library/sqlite.py
@ -98,9 +98,10 @@ class AumSortedConcatenate(object):
    def finalize(self):
        keys = self.ans.keys()
-        if len(keys) == 0:
+        l = len(keys)
-            return None
+        if l == 0:
-        if len(keys) == 1:
+            return 'Unknown:::Unknown'
        if l == 1:
            return self.ans[keys[0]]
        return ':#:'.join([self.ans[v] for v in sorted(keys)])
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@ -98,9 +98,10 @@ class _Parser(object):
        m = 'Formatter: ' + message + _(' near ')
        if self.lex_pos > 0:
            m = '{0} {1}'.format(m, self.prog[self.lex_pos-1][1])
-        m = '{0} {1}'.format(m, self.prog[self.lex_pos][1])
+        elif self.lex_pos < len(self.prog):
        if self.lex_pos < len(self.prog):
            m = '{0} {1}'.format(m, self.prog[self.lex_pos+1][1])
        else:
            m = '{0} {1}'.format(m, _('end of program'))
        raise ValueError(m)
    def token(self):