diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe
index be5b98ffe6..c69dd693b2 100644
--- a/resources/recipes/brand_eins.recipe
+++ b/resources/recipes/brand_eins.recipe
@@ -1,18 +1,22 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 mode: python -*-
+
+# Find the newest version of this recipe here:
+# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
__license__ = 'GPL v3'
-__copyright__ = '2010, Constantin Hofstetter '
-__version__ = '0.95'
+__copyright__ = '2010, Constantin Hofstetter , Steffen Siebert '
+__version__ = '0.96'
''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe
+
class BrandEins(BasicNewsRecipe):
- title = u'Brand Eins'
+ title = u'brand eins'
__author__ = 'Constantin Hofstetter'
description = u'Wirtschaftsmagazin'
publisher ='brandeins.de'
@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
no_stylesheets = True
encoding = 'utf-8'
language = 'de'
+ publication_type = 'magazine'
+ needs_subscription = True
# 2 is the last full magazine (default)
# 1 is the newest (but not full)
# 3 is one before 2 etc.
- which_ausgabe = 2
+ # This value can be set via the username field.
+ default_issue = 2
keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):
return soup
+ def get_cover(self, soup):
+ cover_url = None
+ cover_item = soup.find('div', attrs = {'class': 'cover_image'})
+ if cover_item:
+ cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
+ return cover_url
+
def parse_index(self):
feeds = []
archive = "http://www.brandeins.de/archiv.html"
+ issue = self.default_issue
+ if self.username:
+ try:
+ issue = int(self.username)
+ except:
+ pass
+
soup = self.index_to_soup(archive)
latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
- pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+ pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
url = pre_latest_issue.get('href', False)
# Get the title for the magazin - build it out of the title of the cover - take the issue and year;
- self.title = "Brand Eins "+ re.search(r"(?P\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+ self.title = "brand eins "+ re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
url = 'http://brandeins.de/'+url
# url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):
def brand_eins_parse_latest_issue(self, url):
soup = self.index_to_soup(url)
+ self.cover_url = self.get_cover(soup)
article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
titles_and_articles = []
@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
titles_and_articles.append([chapter_title, current_articles])
return titles_and_articles
+
diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe
new file mode 100644
index 0000000000..95b0017339
--- /dev/null
+++ b/resources/recipes/nikkei_sub.recipe
@@ -0,0 +1,125 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NikkeiNet_subscription(BasicNewsRecipe):
+ title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
+ __author__ = 'Hiroshi Miura'
+ description = 'News and current market affairs from Japan'
+ needs_subscription = True
+ oldest_article = 2
+ max_articles_per_feed = 20
+ language = 'ja'
+ remove_javascript = False
+ temp_files = []
+
+ remove_tags_before = {'class':"cmn-section cmn-indent"}
+ remove_tags = [
+ {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+ {'class':"cmn-article_keyword cmn-clearfix"},
+ {'class':"cmn-print_headline cmn-clearfix"},
+ ]
+ remove_tags_after = {'class':"cmn-pr_list"}
+
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser()
+
+ cj = mechanize.LWPCookieJar()
+ br.set_cookiejar(cj)
+
+ #br.set_debug_http(True)
+ #br.set_debug_redirects(True)
+ #br.set_debug_responses(True)
+
+ if self.username is not None and self.password is not None:
+ #print "----------------------------get login form--------------------------------------------"
+ # open login form
+ br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+ response = br.response()
+ #print "----------------------------get login form---------------------------------------------"
+ #print "----------------------------set login form---------------------------------------------"
+ # remove disabled input which brings error on mechanize
+ response.set_data(response.get_data().replace(" ", " -->"))
+ br.set_response(response)
+ br.select_form(name='LA0010Form01')
+ br['LA0010Form01:LA0010Email'] = self.username
+ br['LA0010Form01:LA0010Password'] = self.password
+ br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+ br.submit()
+ br.response()
+ #print "----------------------------send login form---------------------------------------------"
+ #print "----------------------------open news main page-----------------------------------------"
+ # open news site
+ br.open('http://www.nikkei.com/')
+ br.response()
+ #print "----------------------------www.nikkei.com BODY --------------------------------------"
+ #print response2.get_data()
+ #print "-------------------------^^-got auto redirect form----^^--------------------------------"
+ # forced redirect in default
+ br.select_form(nr=0)
+ br.submit()
+ response3 = br.response()
+ # return some cookie which should be set by Javascript
+ #print response3.geturl()
+ raw = response3.get_data()
+ #print "---------------------------response to form --------------------------------------------"
+ # grab cookie from JS and set it
+ redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+ br.select_form(nr=0)
+
+ self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+ self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+
+ self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+ self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+ self.temp_files[-1].close()
+ cj.load(self.temp_files[-1].name)
+
+ br.submit()
+
+ #br.set_debug_http(False)
+ #br.set_debug_redirects(False)
+ #br.set_debug_responses(False)
+ return br
+
+
+
+ feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+ (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+ (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+ (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+ (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+ (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+ (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+ (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+ (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+ (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+ (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+ (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+ (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+ (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+ (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+ (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+ (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+ (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+ (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+ (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+ (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+ (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+ (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+ (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+ (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+ (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+ ]
+
+
+
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 4815375563..1947870d95 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -483,8 +483,8 @@ from calibre.devices.kobo.driver import KOBO
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
LibraryThing, Fictionwise
from calibre.ebooks.metadata.douban import DoubanBooks
-from calibre.ebooks.metadata.nicebooks import NiceBooks
from calibre.ebooks.metadata.fictionwise import Fictionwise
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
LibraryThingCovers, DoubanCovers
from calibre.ebooks.metadata.nicebooks import NiceBooksCovers
@@ -493,8 +493,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
- LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
- Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers]
+ LibraryThing, DoubanBooks, NiceBooks, Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+ Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+ NiceBooksCovers]
plugins += [
ComicInput,
EPUBInput,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index e963a17df9..c8d0a7bf6b 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -120,9 +120,7 @@ def enable_plugin(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
- 'Douban Books', 'Douban.com covers',
- 'NiceBooks', 'NiceBooksCovers',
- 'Fictionwise'
+ 'Douban Books', 'Douban.com covers', 'Fictionwise', 'Nicebooks', 'Nicebooks covers'
])
def is_disabled(plugin):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 64dcd93f38..3ff816b3bf 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -220,13 +220,13 @@ class Dehyphenator(object):
self.html = html
self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P( )?\s*([iubp]>\s*){1,2}(?P<(p|div)[^>]*>\s*(]*>\s*
\s*)?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P[\w\d]+)' % length)
elif format == 'pdf':
- intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
+ intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P|[iub]>\s*
\s*<[iub]>)\s*(?P[\w\d]+)'% length)
elif format == 'individual_words':
- intextmatch = re.compile(u'>[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P
\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
+ intextmatch = re.compile(u'(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P \s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 41f276294a..11979b933c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -22,12 +22,12 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("found " + unicode(self.html_preprocess_sections) +
+ self.log("marked " + unicode(self.html_preprocess_sections) +
" chapters. - " + unicode(chap))
return ''+chap+' \n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("found " + unicode(self.html_preprocess_sections) +
+ self.log("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
return ''+chap+' \n'+title+' \n'
@@ -83,12 +83,30 @@ class PreProcessor(object):
if min_lns > tot_htm_ends:
return True
+ def dump(self, raw, where):
+ import os
+ dp = getattr(self.extra_opts, 'debug_pipeline', None)
+ if dp and os.path.exists(dp):
+ odir = os.path.join(dp, 'preprocess')
+ if not os.path.exists(odir):
+ os.makedirs(odir)
+ if os.path.exists(odir):
+ odir = os.path.join(odir, where)
+ if not os.path.exists(odir):
+ os.makedirs(odir)
+ name, i = None, 0
+ while not name or os.path.exists(os.path.join(odir, name)):
+ i += 1
+ name = '%04d.html'%i
+ with open(os.path.join(odir, name), 'wb') as f:
+ f.write(raw.encode('utf-8'))
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
# Arrange line feeds and
tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*", "\n", html)
- html = re.sub(r"\s*\s*", "\n
", html)
+ html = re.sub(r"\s*
[^>]*)>\s*", "\n
"+">", html)
###### Check Markup ######
#
@@ -150,52 +168,61 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
- lookahead = "(?=<(p|div))"
+ init_lookahead = "(?=<(p|div))"
chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P"
+ title_header_open = r"(?P"
chapter_header_close = ")\s*"
- chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)\s[^>]*>)?\s*(?P=outer)>\s*"
+ title_header_close = ")"
+ chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
+ title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)>)?\s*(?P=outer2)>"
+
if blanks_between_paragraphs:
blank_lines = "(\s*]*>\s*
){0,2}\s*"
else:
blank_lines = ""
opt_title_open = "("
- title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
- title_header_open = "(?P"
- title_header_close = ")\s*"
- title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)\s[^>]*>)?\s*(?P=outer2)>"
opt_title_close = ")?"
+ n_lookahead_open = "\s+(?!"
+ n_lookahead_close = ")"
- default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
- typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
- numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+ default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
- chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
+ min_chapters = 10
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
- #
- # Start with most typical chapter headings, get more aggressive until one works
- if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
- html = chapdetect.sub(self.chapter_head, html)
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
- chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
- html = chapdetect2.sub(self.chapter_head, html)
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
- chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
- html = chapdetect2.sub(self.chapter_head, html)
+ chapter_types = [
+ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+ [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+ [r"]*>\s*(]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)( )?\s* ", True, "Searching for emphasized lines"], # Emphasized lines
+ [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+ [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+ ]
+
+ # Start with most typical chapter headings, get more aggressive until one works
+ for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+ if self.html_preprocess_sections >= min_chapters:
+ break
+ full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+ self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+ if lookahead_ignorecase:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ else:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+
+ html = chapdetect.sub(self.chapter_head, html)
+
+
###### Unwrap lines ######
#
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
@@ -232,6 +259,7 @@ class PreProcessor(object):
html = dehyphenator(html,'html', length)
self.log("Done dehyphenating")
# Unwrap lines using punctation and line length
+ #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match
@@ -248,10 +276,10 @@ class PreProcessor(object):
html = re.sub(u'\xad\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
- if self.html_preprocess_sections < 10:
+ if self.html_preprocess_sections < 5:
self.log("Looking for more split points based on punctuation,"
" currently have " + unicode(self.html_preprocess_sections))
- chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*( )?([ibu]>){0,2}\s*( )?\s*([ibu]>){0,2}\s*( )?\s*(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*( )?([ibu]>){0,2}\s*( )?\s*([ibu]>){0,2}\s*( )?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
@@ -262,4 +290,7 @@ class PreProcessor(object):
# put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
+ # Center separator lines
+ html = re.sub(u'\s*(?P([*#•]+\s*)+)\s*
', '' + '\g' + '
', html)
+
return html
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
index 51858e4b77..4d19e9611b 100644
--- a/src/calibre/ebooks/metadata/nicebooks.py
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -8,7 +8,6 @@ from urllib import urlencode
from math import ceil
from copy import deepcopy
-from lxml import html
from lxml.html import soupparser
from calibre.utils.date import parse_date, utcnow
@@ -107,7 +106,7 @@ class Query(object):
assert (max_results < 21)
self.max_results = int(max_results)
-
+
if isbn is not None:
q = isbn
else:
@@ -121,7 +120,7 @@ class Query(object):
def __call__(self, browser, verbose, timeout = 5.):
if verbose:
print 'Query:', self.BASE_URL+self.urldata
-
+
try:
raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
except Exception, e:
@@ -138,14 +137,14 @@ class Query(object):
feed = soupparser.fromstring(raw)
except:
return
-
+
#nb of page to call
try:
nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
except:
#direct hit
return [feed]
-
+
nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
pages =[feed]
if nbpagetoquery > 1:
@@ -164,7 +163,7 @@ class Query(object):
except:
continue
pages.append(feed)
-
+
results = []
for x in pages:
results.extend([i.find_class('title')[0].get('href') \
@@ -172,9 +171,9 @@ class Query(object):
return results[:self.max_results]
class ResultList(list):
-
+
BASE_URL = 'http://fr.nicebooks.com'
-
+
def __init__(self):
self.repub = re.compile(u'\s*.diteur\s*', re.I)
self.reauteur = re.compile(u'\s*auteur.*', re.I)
@@ -208,8 +207,8 @@ class ResultList(list):
except:
report(verbose)
return None
-
- def get_book_info(self, entry, mi):
+
+ def get_book_info(self, entry, mi, verbose):
entry = entry.find("dl[@title='Informations sur le livre']")
for x in entry.getiterator('dt'):
if x.text == 'ISBN':
@@ -240,7 +239,7 @@ class ResultList(list):
# mi.pubdate = self.get_date(entry, verbose)
# mi.isbn = self.get_ISBN(entry)
# mi.language = self.get_language(entry)
- return self.get_book_info(entry, mi)
+ return self.get_book_info(entry, mi, verbose)
def get_individual_metadata(self, browser, linkdata, verbose):
try:
@@ -343,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
br = browser()
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results)(br, verbose)
-
+
if entries is None or len(entries) == 0:
return
@@ -390,6 +389,7 @@ def option_parser():
return parser
def main(args=sys.argv):
+ import os
parser = option_parser()
opts, args = parser.parse_args(args)
try:
@@ -421,4 +421,4 @@ def main(args=sys.argv):
print
if __name__ == '__main__':
- sys.exit(main())
\ No newline at end of file
+ sys.exit(main())
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index dfe5b653dd..6850c48b16 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,6 +9,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin):
@@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd())
return opf
+
+ def preprocess_html(self, options, html):
+ self.options = options
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+ return preprocessor(html)
\ No newline at end of file
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 32de91c011..75c839eb83 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -114,7 +114,7 @@ class RTFInput(InputFormatPlugin):
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
- empty_paragraphs = 0,
+ empty_paragraphs = 1,
)
parser.parse_rtf()
ans = open('out.xml').read()
@@ -289,6 +289,10 @@ class RTFInput(InputFormatPlugin):
with open(html, 'wb') as f:
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+ # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+ if not getattr(self.options, 'remove_paragraph_spacing', False):
+ res = re.sub('\s*', '', res)
+ res = re.sub('(?<=\n)\n{2}', u'\u00a0
\n', res)
if self.options.preprocess_html:
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res)
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index e2f463b80b..4f418d34d5 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
self.emit(SIGNAL('retranslate()'))
self.init_languages()
try:
- if prefs['language'].lower().startswith('zh'):
- from calibre.customize.ui import enable_plugin
- for name in ('Douban Books', 'Douban.com covers'):
- enable_plugin(name)
+ lang = prefs['language'].lower()[:2]
+ metadata_plugins = {
+ 'zh' : ('Douban Books', 'Douban.com covers'),
+ 'fr' : ('Nicebooks', 'Nicebooks covers'),
+ }.get(lang, [])
+ from calibre.customize.ui import enable_plugin
+ for name in metadata_plugins:
+ enable_plugin(name)
except:
pass
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 6d18a2d663..8e7002097a 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
except:
# Can happen if path has not yet been set
return False
- return os.access(path, os.R_OK)
+ return os.path.exists(path)
def remove_cover(self, id, notify=True):
path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg')