Merge from trunk

Sengian 2010-11-21 19:11:31 +01:00
commit 26a8583887
11 changed files with 261 additions and 69 deletions


@@ -1,18 +1,22 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 mode: python -*-
+# Find the newest version of this recipe here:
+# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
 __license__ = 'GPL v3'
-__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
-__version__ = '0.95'
+__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
+__version__ = '0.96'
 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
 import string
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class BrandEins(BasicNewsRecipe):
-    title = u'Brand Eins'
+    title = u'brand eins'
     __author__ = 'Constantin Hofstetter'
     description = u'Wirtschaftsmagazin'
     publisher ='brandeins.de'
@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'utf-8'
     language = 'de'
+    publication_type = 'magazine'
+    needs_subscription = True

     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
     # 3 is one before 2 etc.
-    which_ausgabe = 2
+    # This value can be set via the username field.
+    default_issue = 2

     keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):
         return soup

+    def get_cover(self, soup):
+        cover_url = None
+        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
+        if cover_item:
+            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
+        return cover_url
+
     def parse_index(self):
         feeds = []
         archive = "http://www.brandeins.de/archiv.html"

+        issue = self.default_issue
+        if self.username:
+            try:
+                issue = int(self.username)
+            except:
+                pass
+
         soup = self.index_to_soup(archive)
         latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
         url = pre_latest_issue.get('href', False)
         # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
-        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+        self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url

         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):
     def brand_eins_parse_latest_issue(self, url):
         soup = self.index_to_soup(url)
+        self.cover_url = self.get_cover(soup)
         article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

         titles_and_articles = []
@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
                 current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
             titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
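Note on the brand eins change: needs_subscription is used here only to carry an issue selector. Whatever is typed into the username field is parsed as an integer (1 = newest but incomplete issue, 2 = last complete issue, 3 = the one before that), falling back to default_issue on anything else. A minimal sketch of that selection logic, plus an assumed command line (ebook-convert accepts --username for recipes that declare needs_subscription; treat the exact invocation as illustrative):

    # Sketch of the issue-selection logic introduced above (not the recipe itself).
    def pick_issue(username, default_issue=2):
        try:
            return int(username) if username else default_issue
        except ValueError:
            return default_issue

    print pick_issue(None)      # 2 -> last complete magazine
    print pick_issue('3')       # 3 -> the issue before that
    print pick_issue('consti')  # 2 -> non-numeric input falls back to the default

    # Assumed command line for fetching a specific issue with this recipe:
    #   ebook-convert brandeins.recipe brandeins.epub --username 3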


@@ -0,0 +1,125 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+class NikkeiNet_subscription(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+    remove_javascript = False
+    temp_files = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+        {'class':"cmn-article_keyword cmn-clearfix"},
+        {'class':"cmn-print_headline cmn-clearfix"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            #print "----------------------------get login form--------------------------------------------"
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            #print "----------------------------get login form---------------------------------------------"
+            #print "----------------------------set login form---------------------------------------------"
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
+            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email'] = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            #print "----------------------------send login form---------------------------------------------"
+            #print "----------------------------open news main page-----------------------------------------"
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            #print "----------------------------www.nikkei.com BODY --------------------------------------"
+            #print response2.get_data()
+            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            #print response3.geturl()
+            raw = response3.get_data()
+            #print "---------------------------response to form --------------------------------------------"
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
+
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+            ]
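Note on the Nikkei login above: the paywall sets its redirectFlag cookie from JavaScript, which mechanize never executes, so the recipe scrapes the value out of the page and injects it by writing an LWP "Set-Cookie3" file and loading it into the cookie jar. A self-contained sketch of just that trick (example.com and the cookie value are placeholders, not the real site's names):

    import mechanize
    from calibre.ptempfile import PersistentTemporaryFile

    # Build a cookie jar, then hand-craft a cookie the site would normally set
    # via JavaScript and load it from an LWP-format cookies file.
    cj = mechanize.LWPCookieJar()
    f = PersistentTemporaryFile('_cookies.txt')
    f.write("#LWP-Cookies-2.0\n")
    f.write('Set-Cookie3: redirectFlag=1; domain=".example.com"; path="/"; '
            'path_spec; secure; expires="2029-12-21 05:07:59Z"; version=0\n')
    f.close()
    cj.load(f.name)

    br = mechanize.Browser()
    br.set_cookiejar(cj)   # subsequent requests now send the injected cookie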


@@ -483,8 +483,8 @@ from calibre.devices.kobo.driver import KOBO
 from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing, Fictionwise
 from calibre.ebooks.metadata.douban import DoubanBooks
-from calibre.ebooks.metadata.nicebooks import NiceBooks
 from calibre.ebooks.metadata.fictionwise import Fictionwise
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
     LibraryThingCovers, DoubanCovers
-from calibre.ebooks.metadata.nicebooks import NiceBooksCovers
@@ -493,8 +493,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck

 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
-           LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-           Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers]
+           LibraryThing, DoubanBooks, NiceBooks, Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+           Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+           NiceBooksCovers]
 plugins += [
     ComicInput,
     EPUBInput,


@@ -120,9 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers',
-    'NiceBooks', 'NiceBooksCovers',
-    'Fictionwise'
+    'Douban Books', 'Douban.com covers', 'Fictionwise', 'Nicebooks', 'Nicebooks covers'
 ])

 def is_disabled(plugin):


@@ -220,13 +220,13 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
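Note on the Dehyphenator change: the literal hyphen becomes optional, `(-|)`, so the same patterns also rejoin words that were split across tags without a visible hyphen (for example after soft-hyphen removal), and the html variant now tolerates up to three blank paragraphs between the two halves. A tiny, self-contained sketch of the idea on simplified markup (not calibre's Dehyphenator class, and a much simpler pattern):

    # -*- coding: utf-8 -*-
    # Toy illustration: rejoin "exam-</p> <p>ple" style breaks across plain <p> tags.
    import re

    sample = '<p>This is an exam-</p> <p>ple of a wrapped word.</p>'
    pattern = re.compile(r'(?P<firstpart>\w+)(-|)\s*</p>\s*<p>\s*(?P<secondpart>\w+)')

    def dehyphenate(match):
        # A real implementation would check the joined word against a word list
        # before deciding whether to keep or drop the hyphen.
        return match.group('firstpart') + match.group('secondpart')

    print pattern.sub(dehyphenate, sample)
    # -> <p>This is an example of a wrapped word.</p>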


@@ -22,12 +22,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
@@ -83,12 +83,30 @@ class PreProcessor(object):
         if min_lns > tot_htm_ends:
             return True

+    def dump(self, raw, where):
+        import os
+        dp = getattr(self.extra_opts, 'debug_pipeline', None)
+        if dp and os.path.exists(dp):
+            odir = os.path.join(dp, 'preprocess')
+            if not os.path.exists(odir):
+                os.makedirs(odir)
+            if os.path.exists(odir):
+                odir = os.path.join(odir, where)
+                if not os.path.exists(odir):
+                    os.makedirs(odir)
+                name, i = None, 0
+                while not name or os.path.exists(os.path.join(odir, name)):
+                    i += 1
+                    name = '%04d.html'%i
+                with open(os.path.join(odir, name), 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)

         ###### Check Markup ######
         #
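Note on the <p> normalization above: the old pattern only matched bare <p> tags, so paragraphs carrying a class or style attribute never received the line feed the later heuristics rely on; the new pattern captures the attribute string and writes it back. A quick standalone check of the difference (not pipeline code):

    import re

    html = '<body> <p class="calibre1" style="margin:0"> Some text</p> </body>'

    # Old behaviour: the bare "<p>" pattern leaves the attributed tag untouched.
    old = re.sub(r"\s*<p>\s*", "\n<p>", html)
    # New behaviour: attributes are captured and re-emitted after the line feed.
    new = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p" + "\g<style>" + ">", html)

    print repr(old)  # unchanged: no bare <p> present
    print repr(new)  # '<body>\n<p class="calibre1" style="margin:0">Some text</p> </body>'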
@@ -150,52 +168,61 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
-        lookahead = "(?=<(p|div))"
+        init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"

         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
             blank_lines = ""
         opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        title_header_open = "(?P<title>"
-        title_header_close = ")\s*"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"

-        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
-        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
-        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"

-        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
+        min_chapters = 10
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        #
-        # Start with most typical chapter headings, get more aggressive until one works
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect2.sub(self.chapter_head, html)
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+        ]

         # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+            html = chapdetect.sub(self.chapter_head, html)

         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
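Note on the chapter-detection rework above: the three hard-coded passes become a table, chapter_types, where each entry is [pattern, use-IGNORECASE, log message]; the loop walks from the most specific pattern to the most aggressive one and stops once min_chapters headings have been marked. n_lookahead reuses the whole chapter expression inside a negative lookahead (the re.sub over "ou|in|cha" just renames the named groups so they do not collide), which keeps two consecutive heading-like lines from being swallowed by a single match. A stripped-down sketch of the same control flow, with invented patterns and threshold:

    import re

    # Invented patterns and threshold, purely to illustrate the loop structure.
    def mark_chapters(html, min_chapters=2):
        found = [0]
        def to_heading(match):
            found[0] += 1
            return '<h2>' + match.group('chap') + '</h2>\n'

        chapter_types = [
            [r"(?P<chap>Chapter\s+\d+)", True,  "searching for common chapter headings"],
            [r"(?P<chap>[IVXLC]+\.)",    False, "searching for roman numeral headings"],
        ]
        for [pattern, ignorecase, log_message] in chapter_types:
            if found[0] >= min_chapters:
                break  # enough structure found, no need to get more aggressive
            flags = re.IGNORECASE if ignorecase else re.UNICODE
            html = re.compile(pattern, flags).sub(to_heading, html)
        return html

    print mark_chapters('chapter 1 Call me Ishmael ... Chapter 2 More text')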
@@ -232,6 +259,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html', length)
             self.log("Done dehyphenating")
         # Unwrap lines using punctation and line length
+        #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
         unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         html = unwrap.sub(' ', html)
         #check any remaining hyphens, but only unwrap if there is a match
@@ -248,10 +276,10 @@ class PreProcessor(object):
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 10:
+        if self.html_preprocess_sections < 5:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
@@ -262,4 +290,7 @@ class PreProcessor(object):
         # put back non-breaking spaces in empty paragraphs to preserve original formatting
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

+        # Center separator lines
+        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+
         return html
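Note on the last addition: scene-break lines made only of *, # or • are wrapped in a centered paragraph, and the new (?!([*#•]+\s*)+) lookaheads elsewhere keep those same lines from being mistaken for chapter headings. A quick standalone check of the centering substitution (sketch, simplified <p> tags only):

    # -*- coding: utf-8 -*-
    import re

    html = u'<p>end of scene.</p><p> * * * </p><p>Next scene begins.</p>'
    html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>',
                  '<p style="text-align:center">' + '\g<break>' + '</p>', html)
    print html
    # -> <p>end of scene.</p><p style="text-align:center">* * * </p><p>Next scene begins.</p>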


@@ -8,7 +8,6 @@ from urllib import urlencode
 from math import ceil
 from copy import deepcopy

-from lxml import html
 from lxml.html import soupparser

 from calibre.utils.date import parse_date, utcnow
@@ -107,7 +106,7 @@ class Query(object):
         assert (max_results < 21)

         self.max_results = int(max_results)

         if isbn is not None:
             q = isbn
         else:
@@ -121,7 +120,7 @@ class Query(object):
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
             print 'Query:', self.BASE_URL+self.urldata

         try:
             raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
         except Exception, e:
@@ -138,14 +137,14 @@ class Query(object):
             feed = soupparser.fromstring(raw)
         except:
             return

         #nb of page to call
         try:
             nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
         except:
             #direct hit
             return [feed]

         nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
         pages =[feed]
         if nbpagetoquery > 1:
@@ -164,7 +163,7 @@ class Query(object):
             except:
                 continue
             pages.append(feed)

         results = []
         for x in pages:
             results.extend([i.find_class('title')[0].get('href') \
@@ -172,9 +171,9 @@ class Query(object):
         return results[:self.max_results]

 class ResultList(list):

     BASE_URL = 'http://fr.nicebooks.com'

     def __init__(self):
         self.repub = re.compile(u'\s*.diteur\s*', re.I)
         self.reauteur = re.compile(u'\s*auteur.*', re.I)
@@ -208,8 +207,8 @@ class ResultList(list):
         except:
             report(verbose)
             return None

-    def get_book_info(self, entry, mi):
+    def get_book_info(self, entry, mi, verbose):
         entry = entry.find("dl[@title='Informations sur le livre']")
         for x in entry.getiterator('dt'):
             if x.text == 'ISBN':
@@ -240,7 +239,7 @@ class ResultList(list):
         # mi.pubdate = self.get_date(entry, verbose)
         # mi.isbn = self.get_ISBN(entry)
         # mi.language = self.get_language(entry)
-        return self.get_book_info(entry, mi)
+        return self.get_book_info(entry, mi, verbose)

     def get_individual_metadata(self, browser, linkdata, verbose):
         try:
@@ -343,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
     br = browser()
     entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
         keywords=keywords, max_results=max_results)(br, verbose)

     if entries is None or len(entries) == 0:
         return
@@ -390,6 +389,7 @@ def option_parser():
     return parser

 def main(args=sys.argv):
+    import os
     parser = option_parser()
     opts, args = parser.parse_args(args)
     try:
@@ -421,4 +421,4 @@ def main(args=sys.argv):
         print

 if __name__ == '__main__':
     sys.exit(main())


@@ -9,6 +9,7 @@ import os
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+from calibre.ebooks.conversion.utils import PreProcessor

 class PDBInput(InputFormatPlugin):
@@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())

         return opf
+
+    def preprocess_html(self, options, html):
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)


@@ -114,7 +114,7 @@ class RTFInput(InputFormatPlugin):
             group_borders = 1,
             # Write or do not write paragraphs. Default is 0.
-            empty_paragraphs = 0,
+            empty_paragraphs = 1,
         )
         parser.parse_rtf()
         ans = open('out.xml').read()
@@ -289,6 +289,10 @@ class RTFInput(InputFormatPlugin):
         with open(html, 'wb') as f:
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            if not getattr(self.options, 'remove_paragraph_spacing', False):
+                res = re.sub('\s*<body>', '<body>', res)
+                res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
             if self.options.preprocess_html:
                 preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
                 res = preprocessor(res)
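Note on the RTF change: rtf2xml is now told to keep empty paragraphs (empty_paragraphs = 1), and the extra newlines it emits are converted into real blank paragraphs so the original spacing survives conversion, unless the user asked to remove paragraph spacing. A quick check of the substitution itself, isolated from the RTF pipeline (sketch):

    # -*- coding: utf-8 -*-
    import re

    # Two consecutive empty source lines (three \n in a row) become one blank
    # paragraph; single line breaks are left alone.
    res = 'First paragraph.\n\n\nSecond paragraph.\n'
    res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
    print repr(res)   # u'First paragraph.\n<p>\xa0</p>\nSecond paragraph.\n'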


@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
             self.emit(SIGNAL('retranslate()'))
             self.init_languages()
             try:
-                if prefs['language'].lower().startswith('zh'):
-                    from calibre.customize.ui import enable_plugin
-                    for name in ('Douban Books', 'Douban.com covers'):
-                        enable_plugin(name)
+                lang = prefs['language'].lower()[:2]
+                metadata_plugins = {
+                        'zh' : ('Douban Books', 'Douban.com covers'),
+                        'fr' : ('Nicebooks', 'Nicebooks covers'),
+                        }.get(lang, [])
+                from calibre.customize.ui import enable_plugin
+                for name in metadata_plugins:
+                    enable_plugin(name)
             except:
                 pass
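Note on the wizard change: the locale-specific metadata sources are now keyed on the first two letters of the UI language, so wiring in another region is a single dict entry. A sketch of the lookup pattern; the 'ja' entry below is invented purely to show the extension point and is not part of this commit:

    lang = 'fr'
    metadata_plugins = {
        'zh': ('Douban Books', 'Douban.com covers'),
        'fr': ('Nicebooks', 'Nicebooks covers'),
        'ja': ('Some hypothetical plugin',),   # illustrative only
    }.get(lang, [])
    for name in metadata_plugins:
        print name   # each name would be passed to enable_plugin(name)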


@@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         except:
             # Can happen if path has not yet been set
             return False
-        return os.access(path, os.R_OK)
+        return os.path.exists(path)

     def remove_cover(self, id, notify=True):
         path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg')