Merge from trunk

Sengian 2010-11-21 19:11:31 +01:00
commit 26a8583887
11 changed files with 261 additions and 69 deletions


@@ -1,18 +1,22 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 mode: python -*-
+# Find the newest version of this recipe here:
+# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
 __license__ = 'GPL v3'
-__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
-__version__ = '0.95'
+__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
+__version__ = '0.96'
 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
 import string
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class BrandEins(BasicNewsRecipe):
-    title = u'Brand Eins'
+    title = u'brand eins'
     __author__ = 'Constantin Hofstetter'
     description = u'Wirtschaftsmagazin'
     publisher ='brandeins.de'
@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'utf-8'
     language = 'de'
+    publication_type = 'magazine'
+    needs_subscription = True

     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
     # 3 is one before 2 etc.
-    which_ausgabe = 2
+    # This value can be set via the username field.
+    default_issue = 2

     keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):
         return soup

+    def get_cover(self, soup):
+        cover_url = None
+        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
+        if cover_item:
+            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
+        return cover_url
+
     def parse_index(self):
         feeds = []
         archive = "http://www.brandeins.de/archiv.html"

+        issue = self.default_issue
+        if self.username:
+            try:
+                issue = int(self.username)
+            except:
+                pass
+
         soup = self.index_to_soup(archive)
         latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
         url = pre_latest_issue.get('href', False)
         # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
-        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+        self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url

         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):
     def brand_eins_parse_latest_issue(self, url):
         soup = self.index_to_soup(url)
+        self.cover_url = self.get_cover(soup)
         article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

         titles_and_articles = []
@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
                 current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
             titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
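Note on the brand eins change: needs_subscription is used here only to carry an issue selector. Whatever is typed into the username field is parsed as an integer (1 = newest but incomplete issue, 2 = last complete issue, 3 = the one before that), falling back to default_issue on anything else. A minimal sketch of that selection logic, plus an assumed command line (ebook-convert accepts --username for recipes that declare needs_subscription; treat the exact invocation as illustrative):

    # Sketch of the issue-selection logic introduced above (not the recipe itself).
    def pick_issue(username, default_issue=2):
        try:
            return int(username) if username else default_issue
        except ValueError:
            return default_issue

    print pick_issue(None)      # 2 -> last complete magazine
    print pick_issue('3')       # 3 -> the issue before that
    print pick_issue('consti')  # 2 -> non-numeric input falls back to the default

    # Assumed command line for fetching a specific issue with this recipe:
    #   ebook-convert brandeins.recipe brandeins.epub --username 3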


@@ -0,0 +1,125 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+class NikkeiNet_subscription(BasicNewsRecipe):
+    title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
+    __author__ = 'Hiroshi Miura'
+    description = 'News and current market affairs from Japan'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language = 'ja'
+    remove_javascript = False
+    temp_files = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+        {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+        {'class':"cmn-article_keyword cmn-clearfix"},
+        {'class':"cmn-print_headline cmn-clearfix"},
+    ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            #print "----------------------------get login form--------------------------------------------"
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            #print "----------------------------get login form---------------------------------------------"
+            #print "----------------------------set login form---------------------------------------------"
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("<input id=\"j_id48\"", "<!-- "))
+            response.set_data(response.get_data().replace("gm_home_on.gif\" />", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email'] = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            #print "----------------------------send login form---------------------------------------------"
+            #print "----------------------------open news main page-----------------------------------------"
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            #print "----------------------------www.nikkei.com BODY --------------------------------------"
+            #print response2.get_data()
+            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            #print response3.geturl()
+            raw = response3.get_data()
+            #print "---------------------------response to form --------------------------------------------"
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
+
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+            ]
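Note on the Nikkei login above: the paywall sets its redirectFlag cookie from JavaScript, which mechanize never executes, so the recipe scrapes the value out of the page and injects it by writing an LWP "Set-Cookie3" file and loading it into the cookie jar. A self-contained sketch of just that trick (example.com and the cookie value are placeholders, not the real site's names):

    import mechanize
    from calibre.ptempfile import PersistentTemporaryFile

    # Build a cookie jar, then hand-craft a cookie the site would normally set
    # via JavaScript and load it from an LWP-format cookies file.
    cj = mechanize.LWPCookieJar()
    f = PersistentTemporaryFile('_cookies.txt')
    f.write("#LWP-Cookies-2.0\n")
    f.write('Set-Cookie3: redirectFlag=1; domain=".example.com"; path="/"; '
            'path_spec; secure; expires="2029-12-21 05:07:59Z"; version=0\n')
    f.close()
    cj.load(f.name)

    br = mechanize.Browser()
    br.set_cookiejar(cj)   # subsequent requests now send the injected cookie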


@@ -483,8 +483,8 @@ from calibre.devices.kobo.driver import KOBO
 from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing, Fictionwise
 from calibre.ebooks.metadata.douban import DoubanBooks
-from calibre.ebooks.metadata.nicebooks import NiceBooks
 from calibre.ebooks.metadata.fictionwise import Fictionwise
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
     LibraryThingCovers, DoubanCovers
-from calibre.ebooks.metadata.nicebooks import NiceBooksCovers
@@ -493,8 +493,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck

 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
-           LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-           Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers]
+           LibraryThing, DoubanBooks, NiceBooks, Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+           Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+           NiceBooksCovers]
 plugins += [
     ComicInput,
     EPUBInput,


@@ -120,9 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep

 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers',
-    'NiceBooks', 'NiceBooksCovers',
-    'Fictionwise'
+    'Douban Books', 'Douban.com covers', 'Fictionwise', 'Nicebooks', 'Nicebooks covers'
 ])

 def is_disabled(plugin):


@@ -220,13 +220,13 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
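Note on the Dehyphenator change: the literal hyphen becomes optional, `(-|)`, so the same patterns also rejoin words that were split across tags without a visible hyphen (for example after soft-hyphen removal), and the html variant now tolerates up to three blank paragraphs between the two halves. A tiny, self-contained sketch of the idea on simplified markup (not calibre's Dehyphenator class, and a much simpler pattern):

    # -*- coding: utf-8 -*-
    # Toy illustration: rejoin "exam-</p> <p>ple" style breaks across plain <p> tags.
    import re

    sample = '<p>This is an exam-</p> <p>ple of a wrapped word.</p>'
    pattern = re.compile(r'(?P<firstpart>\w+)(-|)\s*</p>\s*<p>\s*(?P<secondpart>\w+)')

    def dehyphenate(match):
        # A real implementation would check the joined word against a word list
        # before deciding whether to keep or drop the hyphen.
        return match.group('firstpart') + match.group('secondpart')

    print pattern.sub(dehyphenate, sample)
    # -> <p>This is an example of a wrapped word.</p>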


@@ -22,12 +22,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
@@ -83,12 +83,30 @@ class PreProcessor(object):
         if min_lns > tot_htm_ends:
             return True

+    def dump(self, raw, where):
+        import os
+        dp = getattr(self.extra_opts, 'debug_pipeline', None)
+        if dp and os.path.exists(dp):
+            odir = os.path.join(dp, 'preprocess')
+            if not os.path.exists(odir):
+                os.makedirs(odir)
+            if os.path.exists(odir):
+                odir = os.path.join(odir, where)
+                if not os.path.exists(odir):
+                    os.makedirs(odir)
+                name, i = None, 0
+                while not name or os.path.exists(os.path.join(odir, name)):
+                    i += 1
+                    name = '%04d.html'%i
+                with open(os.path.join(odir, name), 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)

         ###### Check Markup ######
         #
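Note on the <p> normalization above: the old pattern only matched bare <p> tags, so paragraphs carrying a class or style attribute never received the line feed the later heuristics rely on; the new pattern captures the attribute string and writes it back. A quick standalone check of the difference (not pipeline code):

    import re

    html = '<body> <p class="calibre1" style="margin:0"> Some text</p> </body>'

    # Old behaviour: the bare "<p>" pattern leaves the attributed tag untouched.
    old = re.sub(r"\s*<p>\s*", "\n<p>", html)
    # New behaviour: attributes are captured and re-emitted after the line feed.
    new = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p" + "\g<style>" + ">", html)

    print repr(old)  # unchanged: no bare <p> present
    print repr(new)  # '<body>\n<p class="calibre1" style="margin:0">Some text</p> </body>'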
@@ -150,52 +168,61 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
-        lookahead = "(?=<(p|div))"
+        init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"

         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
             blank_lines = ""
         opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        title_header_open = "(?P<title>"
-        title_header_close = ")\s*"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"

-        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
-        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
-        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"

-        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
+        min_chapters = 10
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        #
-        # Start with most typical chapter headings, get more aggressive until one works
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect2.sub(self.chapter_head, html)
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+        ]

         # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+            html = chapdetect.sub(self.chapter_head, html)

         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
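Note on the chapter-detection rework above: the three hard-coded passes become a table, chapter_types, where each entry is [pattern, use-IGNORECASE, log message]; the loop walks from the most specific pattern to the most aggressive one and stops once min_chapters headings have been marked. n_lookahead reuses the whole chapter expression inside a negative lookahead (the re.sub over "ou|in|cha" just renames the named groups so they do not collide), which keeps two consecutive heading-like lines from being swallowed by a single match. A stripped-down sketch of the same control flow, with invented patterns and threshold:

    import re

    # Invented patterns and threshold, purely to illustrate the loop structure.
    def mark_chapters(html, min_chapters=2):
        found = [0]
        def to_heading(match):
            found[0] += 1
            return '<h2>' + match.group('chap') + '</h2>\n'

        chapter_types = [
            [r"(?P<chap>Chapter\s+\d+)", True,  "searching for common chapter headings"],
            [r"(?P<chap>[IVXLC]+\.)",    False, "searching for roman numeral headings"],
        ]
        for [pattern, ignorecase, log_message] in chapter_types:
            if found[0] >= min_chapters:
                break  # enough structure found, no need to get more aggressive
            flags = re.IGNORECASE if ignorecase else re.UNICODE
            html = re.compile(pattern, flags).sub(to_heading, html)
        return html

    print mark_chapters('chapter 1 Call me Ishmael ... Chapter 2 More text')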
@@ -232,6 +259,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html', length)
             self.log("Done dehyphenating")
         # Unwrap lines using punctation and line length
+        #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
         unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         html = unwrap.sub(' ', html)
         #check any remaining hyphens, but only unwrap if there is a match
@@ -248,10 +276,10 @@ class PreProcessor(object):
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 10:
+        if self.html_preprocess_sections < 5:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
@@ -262,4 +290,7 @@ class PreProcessor(object):
         # put back non-breaking spaces in empty paragraphs to preserve original formatting
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

+        # Center separator lines
+        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+
         return html
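Note on the last addition: scene-break lines made only of *, # or • are wrapped in a centered paragraph, and the new (?!([*#•]+\s*)+) lookaheads elsewhere keep those same lines from being mistaken for chapter headings. A quick standalone check of the centering substitution (sketch, simplified <p> tags only):

    # -*- coding: utf-8 -*-
    import re

    html = u'<p>end of scene.</p><p> * * * </p><p>Next scene begins.</p>'
    html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>',
                  '<p style="text-align:center">' + '\g<break>' + '</p>', html)
    print html
    # -> <p>end of scene.</p><p style="text-align:center">* * * </p><p>Next scene begins.</p>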


@@ -8,7 +8,6 @@ from urllib import urlencode
 from math import ceil
 from copy import deepcopy

-from lxml import html
 from lxml.html import soupparser

 from calibre.utils.date import parse_date, utcnow
@@ -107,7 +106,7 @@ class Query(object):
         assert (max_results < 21)

         self.max_results = int(max_results)

         if isbn is not None:
             q = isbn
         else:
@@ -121,7 +120,7 @@ class Query(object):
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
             print 'Query:', self.BASE_URL+self.urldata

         try:
             raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
         except Exception, e:
@@ -138,14 +137,14 @@ class Query(object):
             feed = soupparser.fromstring(raw)
         except:
             return

         #nb of page to call
         try:
             nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
         except:
             #direct hit
             return [feed]

         nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
         pages =[feed]
         if nbpagetoquery > 1:
@@ -164,7 +163,7 @@ class Query(object):
             except:
                 continue
             pages.append(feed)

         results = []
         for x in pages:
             results.extend([i.find_class('title')[0].get('href') \
@@ -172,9 +171,9 @@ class Query(object):
         return results[:self.max_results]

 class ResultList(list):

     BASE_URL = 'http://fr.nicebooks.com'

     def __init__(self):
         self.repub = re.compile(u'\s*.diteur\s*', re.I)
         self.reauteur = re.compile(u'\s*auteur.*', re.I)
@@ -208,8 +207,8 @@ class ResultList(list):
         except:
             report(verbose)
             return None

-    def get_book_info(self, entry, mi):
+    def get_book_info(self, entry, mi, verbose):
         entry = entry.find("dl[@title='Informations sur le livre']")
         for x in entry.getiterator('dt'):
             if x.text == 'ISBN':
@@ -240,7 +239,7 @@ class ResultList(list):
         # mi.pubdate = self.get_date(entry, verbose)
         # mi.isbn = self.get_ISBN(entry)
         # mi.language = self.get_language(entry)
-        return self.get_book_info(entry, mi)
+        return self.get_book_info(entry, mi, verbose)

     def get_individual_metadata(self, browser, linkdata, verbose):
         try:
@@ -343,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
     br = browser()
     entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
         keywords=keywords, max_results=max_results)(br, verbose)

     if entries is None or len(entries) == 0:
         return
@@ -390,6 +389,7 @@ def option_parser():
     return parser

 def main(args=sys.argv):
+    import os
     parser = option_parser()
     opts, args = parser.parse_args(args)
     try:
@@ -421,4 +421,4 @@ def main(args=sys.argv):
         print

 if __name__ == '__main__':
     sys.exit(main())


@@ -9,6 +9,7 @@ import os
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+from calibre.ebooks.conversion.utils import PreProcessor

 class PDBInput(InputFormatPlugin):
@@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())

         return opf
+
+    def preprocess_html(self, options, html):
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)


@@ -114,7 +114,7 @@ class RTFInput(InputFormatPlugin):
             group_borders = 1,
             # Write or do not write paragraphs. Default is 0.
-            empty_paragraphs = 0,
+            empty_paragraphs = 1,
         )
         parser.parse_rtf()
         ans = open('out.xml').read()
@@ -289,6 +289,10 @@ class RTFInput(InputFormatPlugin):
         with open(html, 'wb') as f:
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            if not getattr(self.options, 'remove_paragraph_spacing', False):
+                res = re.sub('\s*<body>', '<body>', res)
+                res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
             if self.options.preprocess_html:
                 preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
                 res = preprocessor(res)
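Note on the RTF change: rtf2xml is now told to keep empty paragraphs (empty_paragraphs = 1), and the extra newlines it emits are converted into real blank paragraphs so the original spacing survives conversion, unless the user asked to remove paragraph spacing. A quick check of the substitution itself, isolated from the RTF pipeline (sketch):

    # -*- coding: utf-8 -*-
    import re

    # Two consecutive empty source lines (three \n in a row) become one blank
    # paragraph; single line breaks are left alone.
    res = 'First paragraph.\n\n\nSecond paragraph.\n'
    res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
    print repr(res)   # u'First paragraph.\n<p>\xa0</p>\nSecond paragraph.\n'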


@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
             self.emit(SIGNAL('retranslate()'))
             self.init_languages()
             try:
-                if prefs['language'].lower().startswith('zh'):
-                    from calibre.customize.ui import enable_plugin
-                    for name in ('Douban Books', 'Douban.com covers'):
-                        enable_plugin(name)
+                lang = prefs['language'].lower()[:2]
+                metadata_plugins = {
+                        'zh' : ('Douban Books', 'Douban.com covers'),
+                        'fr' : ('Nicebooks', 'Nicebooks covers'),
+                        }.get(lang, [])
+                from calibre.customize.ui import enable_plugin
+                for name in metadata_plugins:
+                    enable_plugin(name)
             except:
                 pass
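Note on the wizard change: the locale-specific metadata sources are now keyed on the first two letters of the UI language, so wiring in another region is a single dict entry. A sketch of the lookup pattern; the 'ja' entry below is invented purely to show the extension point and is not part of this commit:

    lang = 'fr'
    metadata_plugins = {
        'zh': ('Douban Books', 'Douban.com covers'),
        'fr': ('Nicebooks', 'Nicebooks covers'),
        'ja': ('Some hypothetical plugin',),   # illustrative only
    }.get(lang, [])
    for name in metadata_plugins:
        print name   # each name would be passed to enable_plugin(name)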


@@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         except:
             # Can happen if path has not yet been set
             return False
-        return os.access(path, os.R_OK)
+        return os.path.exists(path)

     def remove_cover(self, id, notify=True):
         path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg')