From 441b4e20cc9661579e09b6bfa6aa00b19c54eb3d Mon Sep 17 00:00:00 2001 From: Lee Date: Sun, 24 Apr 2011 22:43:25 +0800 Subject: [PATCH 1/5] re-factored the query logic for overdrive to handle titles including punctuation --- src/calibre/ebooks/metadata/sources/overdrive.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 759da45610..62a3ca2091 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -198,12 +198,16 @@ class OverDrive(Source): title_tokens = list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)) - if len(title_tokens) >= len(author_tokens): + xref_q = '' + if len(author_tokens) <= 1: initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) else: initial_q = ' '.join(author_tokens) - xref_q = '+'.join(title_tokens) + for token in title_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('Initial query is %s'%initial_q) #log.error('Cross reference query is %s'%xref_q) @@ -234,10 +238,12 @@ class OverDrive(Source): if xref_q.find('+') != -1: xref_tokens = xref_q.split('+') xref_q = xref_tokens[0] - #log.error('xref_q is '+xref_q) + for token in xref_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('rewrote xref_q, new query is '+xref_q) else: xref_q = '' - xref_q = '' q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q elif int(m.group('totalrecords')) == 0: return '' @@ -264,6 +270,7 @@ class OverDrive(Source): else: if creators: creators = creators.split(', ') + # if an exact match in a preferred format occurs if ((author and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage: return self.format_results(reserveid, od_title, subtitle, series, publisher, From 
88a54e805405cbc72f7eb72ba469a711a3285777 Mon Sep 17 00:00:00 2001 From: Lee Date: Mon, 25 Apr 2011 10:51:20 +0800 Subject: [PATCH 2/5] ... --- src/calibre/ebooks/metadata/sources/overdrive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 62a3ca2091..67eac7e337 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -337,9 +337,9 @@ class OverDrive(Source): def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None): q = base_url if ovrdrv_id is None: - return self.overdrive_search(br, log, q, title, author) + return self.overdrive_search(br, log, q, title, author) else: - return self.overdrive_get_record(br, log, q, ovrdrv_id) + return self.overdrive_get_record(br, log, q, ovrdrv_id) From a1bbba3198f0945ade6d4d70f51a87f22c1d284a Mon Sep 17 00:00:00 2001 From: Lee Date: Sun, 7 Aug 2011 22:18:47 +0800 Subject: [PATCH 3/5] fix a number of issues with scene break formatting in heuristics --- src/calibre/ebooks/conversion/utils.py | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 7488df4609..9962335da3 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -451,27 +451,33 @@ class HeuristicProcessor(object): return html def detect_whitespace(self, html): - blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_scene_breaks = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P

]*>.*?

)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) blanks_n_nopunct = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](\s*)*

(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE|re.DOTALL) def merge_header_whitespace(match): initblanks = match.group('initparas') - endblanks = match.group('initparas') - heading = match.group('heading') + endblanks = match.group('endparas') + content = match.group('content') top_margin = '' bottom_margin = '' if initblanks is not None: + print "initial blanks are:\n"+initblanks top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;' if endblanks is not None: - bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;' + print "endblanks blanks are:\n"+endblanks + bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;' if initblanks == None and endblanks == None: - return heading + return content + elif content.find('scenebreak') != -1: + return content else: - heading = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', heading) - return heading + content = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', content) + return content html = blanks_around_headings.sub(merge_header_whitespace, html) + html = blanks_around_scene_breaks.sub(merge_header_whitespace, html) def markup_whitespaces(match): blanks = match.group(0) @@ -506,6 +512,12 @@ class HeuristicProcessor(object): html = self.blankreg.sub('\n

', html) return html + def detect_scene_breaks(self, html): + scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + html = scene_breaks.sub(self.scene_break_open+'\g'+'

', html) + return html + def markup_user_break(self, replacement_break): ''' Takes string a user supplies and wraps it in markup that will be centered with @@ -765,25 +777,25 @@ class HeuristicProcessor(object): # If non-blank scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): html = re.sub('(?i)]*>\s*\s*', '

', html) + html = self.detect_scene_breaks(html) html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close - scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + detected_scene_break = re.compile(r'

]*>.*?

') + scene_break_count = len(detected_scene_break.findall(html)) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use # Otherwise separator lines are centered, use a bit larger margin in this case replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None) if replacement_break: replacement_break = self.markup_user_break(replacement_break) - if len(scene_break.findall(html)) >= 1: - html = scene_break.sub(replacement_break, html) + if scene_break_count >= 1: + html = detected_scene_break.sub(replacement_break, html) + html = re.sub(']*>\s*

', replacement_break, html) else: html = re.sub(']*>\s*

', replacement_break, html) - else: - html = scene_break.sub(self.scene_break_open+'\g'+'

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly From d07b4556e97c15b373fbf6b40c4fcc29b3872c10 Mon Sep 17 00:00:00 2001 From: Lee Date: Mon, 8 Aug 2011 03:51:56 +0800 Subject: [PATCH 4/5] fix issue where overdrive can get in an infinite loop --- .../ebooks/metadata/sources/overdrive.py | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 1faacaa3ef..0af41538b0 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -208,8 +208,8 @@ class OverDrive(Source): if len(xref_q) < len(token): xref_q = token - #log.error('Initial query is %s'%initial_q) - #log.error('Cross reference query is %s'%xref_q) + log.error('Initial query is %s'%initial_q) + log.error('Cross reference query is %s'%xref_q) q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q query = '{"szKeyword":"'+initial_q+'"}' @@ -224,29 +224,30 @@ class OverDrive(Source): # get the search results object results = False + iterations = 0 while results == False: + iterations += 1 xreq = mechanize.Request(q_xref) xreq.add_header('X-Requested-With', 'XMLHttpRequest') xreq.add_header('Referer', q_init_search) xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): - if int(m.group('displayrecords')) >= 1: - results = True - elif int(m.group('totalrecords')) >= 1: - if int(m.group('totalrecords')) >= 100: - if xref_q.find('+') != -1: - xref_tokens = xref_q.split('+') - xref_q = xref_tokens[0] - for token in xref_tokens: - if len(xref_q) < len(token): - xref_q = token - #log.error('rewrote xref_q, new query is '+xref_q) - else: - xref_q = '' - q_xref = 
q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q - elif int(m.group('totalrecords')) == 0: + if int(m.group('totalrecords')) == 0: return '' + elif int(m.group('displayrecords')) >= 1: + results = True + elif int(m.group('totalrecords')) >= 1 and iterations < 3: + if xref_q.find('+') != -1: + xref_tokens = xref_q.split('+') + xref_q = xref_tokens[0] + for token in xref_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('rewrote xref_q, new query is '+xref_q) + else: + xref_q = '' + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens) @@ -461,10 +462,10 @@ if __name__ == '__main__': [ ( - {'title':'Foundation and Earth', - 'authors':['Asimov']}, - [title_test('Foundation and Earth', exact=True), - authors_test(['Isaac Asimov'])] + {'title':'The Sea Kings Daughter', + 'authors':['Elizabeth Peters']}, + [title_test('The Sea Kings Daughter', exact=False), + authors_test(['Elizabeth Peters'])] ), ( From a6efef3d3159348665323cda2f9e8c9bffd2d990 Mon Sep 17 00:00:00 2001 From: Lee Date: Fri, 20 Apr 2012 21:52:57 +0800 Subject: [PATCH 5/5] removed dash unwrap regression from bug #822744 --- src/calibre/ebooks/conversion/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 26b800f55b..e2a02702df 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -322,7 +322,6 @@ class HeuristicProcessor(object): lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*()?" 
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*" @@ -331,23 +330,19 @@ class HeuristicProcessor(object): unwrap_regex = lookahead+line_ending+blanklines+line_opening em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening - dash_unwrap_regex = dash+line_ending+blanklines+line_opening if format == 'txt': unwrap_regex = lookahead+txt_line_wrap em_en_unwrap_regex = em_en_lookahead+txt_line_wrap shy_unwrap_regex = soft_hyphen+txt_line_wrap - dash_unwrap_regex = dash+txt_line_wrap unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) - dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE) content = unwrap.sub(' ', content) content = em_en_unwrap.sub('', content) content = shy_unwrap.sub('', content) - content = dash_unwrap.sub('', content) return content def txt_process(self, match):