diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe
index be5b98ffe6..c69dd693b2 100644
--- a/resources/recipes/brand_eins.recipe
+++ b/resources/recipes/brand_eins.recipe
@@ -1,18 +1,22 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 mode: python -*-
+
+# Find the newest version of this recipe here:
+# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>'
-__version__ = '0.95'
+__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
+__version__ = '0.96'
 
 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
 import string
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
+
 class BrandEins(BasicNewsRecipe):
 
-    title = u'Brand Eins'
+    title = u'brand eins'
     __author__ = 'Constantin Hofstetter'
     description = u'Wirtschaftsmagazin'
     publisher ='brandeins.de'
@@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe):
     no_stylesheets = True
     encoding = 'utf-8'
     language = 'de'
+    publication_type = 'magazine'
+    needs_subscription = True
 
     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
     # 3 is one before 2 etc.
-    which_ausgabe = 2
+    # This value can be set via the username field.
+    default_issue = 2
 
     keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]
 
@@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe):
 
         return soup
 
+    def get_cover(self, soup):
+        cover_url = None
+        cover_item = soup.find('div', attrs = {'class': 'cover_image'})
+        if cover_item:
+            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
+        return cover_url
+
     def parse_index(self):
         feeds = []
 
         archive = "http://www.brandeins.de/archiv.html"
 
+        issue = self.default_issue
+        if self.username:
+            try:
+                issue = int(self.username)
+            except:
+                pass
+
         soup = self.index_to_soup(archive)
         latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe]
+        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
         url = pre_latest_issue.get('href', False)
         # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
-        self.title = "Brand Eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date')
+        self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url
 
         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
@@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe):
 
     def brand_eins_parse_latest_issue(self, url):
         soup = self.index_to_soup(url)
+        self.cover_url = self.get_cover(soup)
         article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
 
         titles_and_articles = []
@@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe):
                 current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
             titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
+
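Note on brand_eins.recipe (not part of the patch): there is no real login here. Setting needs_subscription = True only surfaces calibre's username field, which the recipe reuses as an issue selector, falling back to default_issue when the field is empty or not a number. A minimal sketch of that convention; the helper name is illustrative:

    def issue_from_username(username, default_issue=2):
        """Interpret the recipe's username field as an archive-issue index."""
        try:
            return int(username)
        except (TypeError, ValueError):
            return default_issue

    assert issue_from_username('3') == 3    # explicit issue
    assert issue_from_username('') == 2     # empty field -> default_issue
    assert issue_from_username(None) == 2   # no username configured
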
diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe
new file mode 100644
index 0000000000..95b0017339
--- /dev/null
+++ b/resources/recipes/nikkei_sub.recipe
@@ -0,0 +1,125 @@
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import mechanize
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NikkeiNet_subscription(BasicNewsRecipe):
+    title          = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248'
+    __author__     = 'Hiroshi Miura'
+    description    = 'News and current market affairs from Japan'
+    needs_subscription = True
+    oldest_article = 2
+    max_articles_per_feed = 20
+    language       = 'ja'
+    remove_javascript = False
+    temp_files     = []
+
+    remove_tags_before = {'class':"cmn-section cmn-indent"}
+    remove_tags = [
+                      {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"},
+                      {'class':"cmn-article_keyword cmn-clearfix"},
+                      {'class':"cmn-print_headline cmn-clearfix"},
+                  ]
+    remove_tags_after = {'class':"cmn-pr_list"}
+
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+
+        cj = mechanize.LWPCookieJar()
+        br.set_cookiejar(cj)
+
+        #br.set_debug_http(True)
+        #br.set_debug_redirects(True)
+        #br.set_debug_responses(True)
+
+        if self.username is not None and self.password is not None:
+            #print "----------------------------get login form--------------------------------------------"
+            # open login form
+            br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam')
+            response = br.response()
+            #print "----------------------------get login form---------------------------------------------"
+            #print "----------------------------set login form---------------------------------------------"
+            # remove disabled input which brings error on mechanize
+            response.set_data(response.get_data().replace("", " -->"))
+            br.set_response(response)
+            br.select_form(name='LA0010Form01')
+            br['LA0010Form01:LA0010Email']    = self.username
+            br['LA0010Form01:LA0010Password'] = self.password
+            br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True
+            br.submit()
+            br.response()
+            #print "----------------------------send login form---------------------------------------------"
+            #print "----------------------------open news main page-----------------------------------------"
+            # open news site
+            br.open('http://www.nikkei.com/')
+            br.response()
+            #print "----------------------------www.nikkei.com BODY --------------------------------------"
+            #print response2.get_data()
+            #print "-------------------------^^-got auto redirect form----^^--------------------------------"
+            # forced redirect in default
+            br.select_form(nr=0)
+            br.submit()
+            response3 = br.response()
+            # return some cookie which should be set by Javascript
+            #print response3.geturl()
+            raw = response3.get_data()
+            #print "---------------------------response to form --------------------------------------------"
+            # grab cookie from JS and set it
+            redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1)
+            br.select_form(nr=0)
+
+            self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+            self.temp_files[-1].write("#LWP-Cookies-2.0\n")
+
+            self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n")
+            self.temp_files[-1].close()
+            cj.load(self.temp_files[-1].name)
+
+            br.submit()
+
+        #br.set_debug_http(False)
+        #br.set_debug_redirects(False)
+        #br.set_debug_responses(False)
+        return br
+
+
+
+    feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'),
+              (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'),
+              (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'),
+              (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'),
+              (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'),
+              (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'),
+              (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'),
+              (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'),
+              (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'),
+              (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'),
+              (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'),
+              (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'),
+              (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'),
+              (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'),
+              (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'),
+              (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'),
+              (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'),
+              (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'),
+              (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'),
+              (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'),
+              (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'),
+              (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'),
+              (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'),
+              (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'),
+              (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'),
+              (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research')
+            ]
+
+
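Note on nikkei_sub.recipe (not part of the patch): mechanize cannot execute the JavaScript that sets nikkei.com's redirectFlag cookie, so get_browser() scrapes the value out of the page source, writes an LWP "Set-Cookie3" file by hand, and loads it into the cookie jar. A self-contained sketch of that trick with dummy values; requires the third-party mechanize package:

    import mechanize
    from tempfile import NamedTemporaryFile

    cj = mechanize.LWPCookieJar()
    with NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write("#LWP-Cookies-2.0\n")
        f.write('Set-Cookie3: redirectFlag=1; domain=".nikkei.com"; path="/"; '
                'path_spec; secure; expires="2029-12-21 05:07:59Z"; version=0\n')
    cj.load(f.name)              # the forged cookie is now in the jar
    print([c.name for c in cj])  # -> ['redirectFlag']
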
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 4815375563..1947870d95 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -483,8 +483,8 @@ from calibre.devices.kobo.driver import KOBO
 from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing, Fictionwise
 from calibre.ebooks.metadata.douban import DoubanBooks
-from calibre.ebooks.metadata.nicebooks import NiceBooks
 from calibre.ebooks.metadata.fictionwise import Fictionwise
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
     LibraryThingCovers, DoubanCovers
-from calibre.ebooks.metadata.nicebooks import NiceBooksCovers
@@ -493,8 +493,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
-        LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers]
+        LibraryThing, DoubanBooks, NiceBooks, Fictionwise, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+        NiceBooksCovers]
 plugins += [
     ComicInput,
     EPUBInput,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index e963a17df9..c8d0a7bf6b 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -120,9 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers',
-    'NiceBooks', 'NiceBooksCovers',
-    'Fictionwise'
+    'Douban Books', 'Douban.com covers', 'Fictionwise', 'Nicebooks', 'Nicebooks covers'
 ])
 
 def is_disabled(plugin):
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 64dcd93f38..3ff816b3bf 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -220,13 +220,13 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart>\w+)\b[^<]*<')
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart>\w+)\b[^<]*<')
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html
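Note on the Dehyphenator hunk (not part of the patch): every pattern captures the word fragment before the hyphen (firstpart), the intervening markup (wraptags), and the fragment after it (secondpart), so the dehyphenate callback can rejoin the word and push the markup behind it. A toy version with a deliberately simplified pattern; the real callback also consults a dictionary before joining:

    import re

    toy = re.compile(r'(?P<firstpart>\w+)-\s*(?P<wraptags></p>\s*<p[^>]*>)\s*(?P<secondpart>\w+)')

    def dehyphenate(match):
        # rejoin the split word, then re-emit the markup after it
        return match.group('firstpart') + match.group('secondpart') + match.group('wraptags')

    print(toy.sub(dehyphenate, '<p>under-</p> <p>standing</p>'))
    # -> <p>understanding</p> <p></p>
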
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 41f276294a..11979b933c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -22,12 +22,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
@@ -83,12 +83,30 @@ class PreProcessor(object):
         if min_lns > tot_htm_ends:
             return True
 
+    def dump(self, raw, where):
+        import os
+        dp = getattr(self.extra_opts, 'debug_pipeline', None)
+        if dp and os.path.exists(dp):
+            odir = os.path.join(dp, 'preprocess')
+            if not os.path.exists(odir):
+                os.makedirs(odir)
+            if os.path.exists(odir):
+                odir = os.path.join(odir, where)
+                if not os.path.exists(odir):
+                    os.makedirs(odir)
+                name, i = None, 0
+                while not name or os.path.exists(os.path.join(odir, name)):
+                    i += 1
+                    name = '%04d.html'%i
+                with open(os.path.join(odir, name), 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
 
         ###### Check Markup ######
         #
@@ -150,52 +168,61 @@ class PreProcessor(object):
             #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
        #
         # Build the Regular Expressions in pieces
-        lookahead = "(?=<(p|div))"
+        init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
             blank_lines = ""
         opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        title_header_open = "(?P<title>"
-        title_header_close = ")\s*"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
+        n_lookahead_open = "\s+(?!"
+        n_lookahead_close = ")"
 
-        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
-        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
-        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
 
-        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
+        min_chapters = 10
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        #
-        # Start with most typical chapter headings, get more aggressive until one works
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect2.sub(self.chapter_head, html)
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            ]
+
+        # Start with most typical chapter headings, get more aggressive until one works
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+
+            html = chapdetect.sub(self.chapter_head, html)
+
         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
@@ -232,6 +259,7 @@ class PreProcessor(object):
                 html = dehyphenator(html,'html', length)
                 self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
+            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
             unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
             #check any remaining hyphens, but only unwrap if there is a match
@@ -248,10 +276,10 @@ class PreProcessor(object):
             html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 10:
+        if self.html_preprocess_sections < 5:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
@@ -262,4 +290,7 @@ class PreProcessor(object):
             # put back non-breaking spaces in empty paragraphs to preserve original formatting
             html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
+        # Center separator lines
+        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+
         return html
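Note on the utils.py rewrite (not part of the patch): n_lookahead embeds the chapter pattern a second time inside a negative lookahead, so a detected heading is rejected when another heading-shaped line immediately follows. Python forbids duplicate group names within one pattern, hence the re.sub("(ou|in|cha)", "lookahead_", ...) renaming hack. A toy demonstration of the same idea, using an explicit group-name list instead of substring matching:

    import re

    line = r"<(?P<outer>p|div)[^>]*>\s*(?P<chap>Chapter\s+\d+)\s*</(?P=outer)>"
    # rename every named group so the pattern can be embedded twice
    lookahead = re.sub(r"(outer|chap)", r"lookahead_\1", line)
    print(lookahead)
    # -> <(?P<lookahead_outer>p|div)[^>]*>\s*(?P<lookahead_chap>Chapter\s+\d+)\s*</(?P=lookahead_outer)>

    # both copies now compile into a single pattern without a name clash
    marker = re.compile(line + r"\s*(?!" + lookahead + r")")
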
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
index 51858e4b77..4d19e9611b 100644
--- a/src/calibre/ebooks/metadata/nicebooks.py
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -8,7 +8,6 @@ from urllib import urlencode
 from math import ceil
 from copy import deepcopy
 
-from lxml import html
 from lxml.html import soupparser
 
 from calibre.utils.date import parse_date, utcnow
@@ -107,7 +106,7 @@ class Query(object):
             assert (max_results < 21)
 
         self.max_results = int(max_results)
-        
+
         if isbn is not None:
             q = isbn
         else:
@@ -121,7 +120,7 @@ class Query(object):
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
             print 'Query:', self.BASE_URL+self.urldata
-        
+
         try:
             raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
         except Exception, e:
@@ -138,14 +137,14 @@ class Query(object):
             feed = soupparser.fromstring(raw)
         except:
             return
-        
+
         #nb of page to call
         try:
             nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
         except:
             #direct hit
             return [feed]
-        
+
         nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
         pages =[feed]
         if nbpagetoquery > 1:
@@ -164,7 +163,7 @@ class Query(object):
                 except:
                     continue
                 pages.append(feed)
-        
+
         results = []
         for x in pages:
             results.extend([i.find_class('title')[0].get('href') \
@@ -172,9 +171,9 @@ class Query(object):
         return results[:self.max_results]
 
 class ResultList(list):
-    
+
     BASE_URL = 'http://fr.nicebooks.com'
-    
+
     def __init__(self):
         self.repub = re.compile(u'\s*.diteur\s*', re.I)
         self.reauteur = re.compile(u'\s*auteur.*', re.I)
@@ -208,8 +207,8 @@ class ResultList(list):
         except:
             report(verbose)
             return None
-    
-    def get_book_info(self, entry, mi):
+
+    def get_book_info(self, entry, mi, verbose):
         entry = entry.find("dl[@title='Informations sur le livre']")
         for x in entry.getiterator('dt'):
             if x.text == 'ISBN':
@@ -240,7 +239,7 @@ class ResultList(list):
 #            mi.pubdate = self.get_date(entry, verbose)
 #            mi.isbn = self.get_ISBN(entry)
 #            mi.language = self.get_language(entry)
-        return self.get_book_info(entry, mi)
+        return self.get_book_info(entry, mi, verbose)
 
     def get_individual_metadata(self, browser, linkdata, verbose):
         try:
@@ -343,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
     br = browser()
     entries = Query(title=title, author=author, isbn=isbn,
         publisher=publisher, keywords=keywords, max_results=max_results)(br, verbose)
-    
+
     if entries is None or len(entries) == 0:
         return
 
@@ -390,6 +389,7 @@ def option_parser():
     return parser
 
 def main(args=sys.argv):
+    import os
     parser = option_parser()
     opts, args = parser.parse_args(args)
     try:
@@ -421,4 +421,4 @@ def main(args=sys.argv):
             print
 
 if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index dfe5b653dd..6850c48b16 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,6 +9,7 @@
 import os
 
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
@@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())
 
         return opf
+
+    def preprocess_html(self, options, html):
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)
\ No newline at end of file
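Note on the pdb/input.py hunk (not part of the patch): any input plugin can opt into the shared PreProcessor simply by defining a preprocess_html() hook; the conversion pipeline calls it with the options object and the raw HTML. A schematic, standalone version of the wiring; the stand-in class below is not calibre's PreProcessor:

    class FakePreProcessor(object):
        """Stand-in for calibre.ebooks.conversion.utils.PreProcessor."""
        def __init__(self, extra_opts=None, log=None):
            self.extra_opts, self.log = extra_opts, log
        def __call__(self, html):
            return html.replace('\r\n', '\n')  # placeholder for the real cleanups

    class MyInput(object):
        # same shape as the PDBInput.preprocess_html added above
        def preprocess_html(self, options, html):
            self.options = options
            preprocessor = FakePreProcessor(self.options, log=getattr(self, 'log', None))
            return preprocessor(html)

    print(repr(MyInput().preprocess_html(None, 'a\r\nb')))  # -> 'a\nb'
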
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 32de91c011..75c839eb83 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -114,7 +114,7 @@ class RTFInput(InputFormatPlugin):
             group_borders = 1,
 
             # Write or do not write paragraphs. Default is 0.
-            empty_paragraphs = 0,
+            empty_paragraphs = 1,
         )
         parser.parse_rtf()
         ans = open('out.xml').read()
@@ -289,6 +289,10 @@ class RTFInput(InputFormatPlugin):
         with open(html, 'wb') as f:
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            if not getattr(self.options, 'remove_paragraph_spacing', False):
+                res = re.sub('\s*<body>', '<body>', res)
+                res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
             if self.options.preprocess_html:
                 preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
                 res = preprocessor(res)
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index e2f463b80b..4f418d34d5 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
         self.emit(SIGNAL('retranslate()'))
         self.init_languages()
         try:
-            if prefs['language'].lower().startswith('zh'):
-                from calibre.customize.ui import enable_plugin
-                for name in ('Douban Books', 'Douban.com covers'):
-                    enable_plugin(name)
+            lang = prefs['language'].lower()[:2]
+            metadata_plugins = {
+                    'zh' : ('Douban Books', 'Douban.com covers'),
+                    'fr' : ('Nicebooks', 'Nicebooks covers'),
+                    }.get(lang, [])
+            from calibre.customize.ui import enable_plugin
+            for name in metadata_plugins:
+                enable_plugin(name)
         except:
             pass
 
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 6d18a2d663..8e7002097a 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
         except: # Can happen if path has not yet been set
             return False
-        return os.access(path, os.R_OK)
+        return os.path.exists(path)
 
     def remove_cover(self, id, notify=True):
         path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg')
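Note on the rtf/input.py hunk above (not part of the patch): with empty_paragraphs = 1, rtf2xml keeps empty paragraphs, which surface as runs of blank lines in the serialized XML; the two re.sub calls then turn each blank-line run into an explicit empty paragraph holding a non-breaking space, so the original vertical spacing survives conversion. A toy reproduction of the two substitutions:

    import re

    res = u'  <body>\n\n\nFirst paragraph\n\n\nSecond paragraph'
    res = re.sub(r'\s*<body>', '<body>', res)               # strip whitespace before <body>
    res = re.sub(u'(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)  # blank-line runs -> empty <p>
    print(repr(res))
    # -> u'<body>\n<p>\xa0</p>\nFirst paragraph\n<p>\xa0</p>\nSecond paragraph'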