diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f154764515..e2a02702df 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,7 +322,6 @@ class HeuristicProcessor(object):
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*((span|[iubp]|div)>)?"
blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -331,23 +330,19 @@ class HeuristicProcessor(object):
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
- dash_unwrap_regex = dash+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
- dash_unwrap_regex = dash+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
- dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
- content = dash_unwrap.sub('', content)
return content
def txt_process(self, match):
@@ -460,27 +455,31 @@ class HeuristicProcessor(object):
return html
def detect_whitespace(self, html):
- blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+ blanks_around_headings = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
+ blanks_around_scene_breaks = re.compile(r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
- endblanks = match.group('initparas')
- heading = match.group('heading')
+ endblanks = match.group('endparas')
+ content = match.group('content')
top_margin = ''
bottom_margin = ''
if initblanks is not None:
top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
if endblanks is not None:
- bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+ bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
if initblanks == None and endblanks == None:
- return heading
+ return content
+ elif content.find('scenebreak') != -1:
+ return content
else:
- heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n'+'<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
- return heading
+ content = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n'+'<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
+ return content
html = blanks_around_headings.sub(merge_header_whitespace, html)
+ html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
def markup_whitespaces(match):
blanks = match.group(0)
@@ -515,6 +514,12 @@ class HeuristicProcessor(object):
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
return html
+ def detect_scene_breaks(self, html):
+ scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+ scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+ html = scene_breaks.sub(self.scene_break_open+'\g<break>'+'</p>', html)
+ return html
+
def markup_user_break(self, replacement_break):
'''
Takes string a user supplies and wraps it in markup that will be centered with
@@ -781,25 +786,25 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\s*<br(\s?\/)?>\s*</div>', '<p></p>', html)
+ html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
- scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
- scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+ detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
+ scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, use a bit larger margin in this case
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
- if len(scene_break.findall(html)) >= 1:
- html = scene_break.sub(replacement_break, html)
+ if scene_break_count >= 1:
+ html = detected_scene_break.sub(replacement_break, html)
+ html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
else:
html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
- else:
- html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
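
Note (illustration, not part of the patch): the heuristics change above moves scene-break detection into its own detect_scene_breaks() method, which tags a line made only of repeated punctuation as a <p class="scenebreak"> paragraph; detect_whitespace() then folds the blank paragraphs around it into margins, and the format_scene_breaks block either substitutes the user-supplied replacement break or leaves the tagged separator in place. The sketch below shows that flow with simplified stand-ins for self.line_open, self.line_close and self.scene_break_open, and it omits the common_in_text_beginnings/endings guard.

# Illustration only -- a minimal sketch of the new scene-break flow, with
# simplified stand-ins for the HeuristicProcessor attributes.
import re

line_open = r'<p[^>]*>\s*'
line_close = r'\s*</p>'
scene_break_open = '<p class="scenebreak" style="text-align:center">'

def detect_scene_breaks(html):
    # Tag a line consisting only of repeated punctuation (e.g. "* * *")
    # as a scenebreak paragraph, mirroring the regex added by the patch.
    pattern = line_open + r'(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*' + line_close
    return re.sub(pattern, scene_break_open + r'\g<break>' + '</p>', html,
                  flags=re.IGNORECASE | re.UNICODE)

sample = '<p>He left.</p>\n<p>* * *</p>\n<p>Years later...</p>'
marked = detect_scene_breaks(sample)

# The later format_scene_breaks block counts these tagged paragraphs and,
# if the user supplied replace_scene_breaks, substitutes them wholesale.
detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
print(len(detected_scene_break.findall(marked)))  # -> 1
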
diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
index bb1bbb9d42..6d6ebd3990 100755
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -197,14 +197,18 @@ class OverDrive(Source):
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
- if len(title_tokens) >= len(author_tokens):
+ xref_q = ''
+ if len(author_tokens) <= 1:
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
- xref_q = '+'.join(title_tokens)
- #log.error('Initial query is %s'%initial_q)
- #log.error('Cross reference query is %s'%xref_q)
+ for token in title_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+
+ log.error('Initial query is %s'%initial_q)
+ log.error('Cross reference query is %s'%xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
@@ -219,27 +223,30 @@ class OverDrive(Source):
# get the search results object
results = False
+ iterations = 0
while results == False:
+ iterations += 1
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P\d+).*?"iTotalRecords":(?P\d+)', raw):
- if int(m.group('displayrecords')) >= 1:
- results = True
- elif int(m.group('totalrecords')) >= 1:
- if int(m.group('totalrecords')) >= 100:
- if xref_q.find('+') != -1:
- xref_tokens = xref_q.split('+')
- xref_q = xref_tokens[0]
- #log.error('xref_q is '+xref_q)
- else:
- xref_q = ''
- xref_q = ''
- q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
- elif int(m.group('totalrecords')) == 0:
+ if int(m.group('totalrecords')) == 0:
return ''
+ elif int(m.group('displayrecords')) >= 1:
+ results = True
+ elif int(m.group('totalrecords')) >= 1 and iterations < 3:
+ if xref_q.find('+') != -1:
+ xref_tokens = xref_q.split('+')
+ xref_q = xref_tokens[0]
+ for token in xref_tokens:
+ if len(xref_q) < len(token):
+ xref_q = token
+ #log.error('rewrote xref_q, new query is '+xref_q)
+ else:
+ xref_q = ''
+ q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
@@ -263,6 +270,7 @@ class OverDrive(Source):
else:
if creators:
creators = creators.split(', ')
+
# if an exact match in a preferred format occurs
if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
@@ -330,9 +338,9 @@ class OverDrive(Source):
def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
- return self.overdrive_search(br, log, q, title, author)
+ return self.overdrive_search(br, log, q, title, author)
else:
- return self.overdrive_get_record(br, log, q, ovrdrv_id)
+ return self.overdrive_get_record(br, log, q, ovrdrv_id)
@@ -461,10 +469,10 @@ if __name__ == '__main__':
[
(
- {'title':'Foundation and Earth',
- 'authors':['Asimov']},
- [title_test('Foundation and Earth', exact=True),
- authors_test(['Isaac Asimov'])]
+ {'title':'The Sea Kings Daughter',
+ 'authors':['Elizabeth Peters']},
+ [title_test('The Sea Kings Daughter', exact=False),
+ authors_test(['Elizabeth Peters'])]
),
(
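
Note (illustration, not part of the patch): the OverDrive change above reworks how the search and cross-reference queries are chosen — a single-token (or missing) author makes the title the primary query, otherwise the author is the primary query and the longest title token becomes the cross-reference; when results are too broad, the retry loop (capped at three iterations) narrows a multi-token cross-reference to its longest token or drops it. The helper names below (build_queries, narrow_xref) are hypothetical and only mirror that selection logic; in calibre the tokens come from get_title_tokens()/get_author_tokens().

# Illustration only -- a sketch of the revised query selection.
def build_queries(title_tokens, author_tokens):
    xref_q = ''
    if len(author_tokens) <= 1:
        # Single-word (or missing) author: search by title, cross-check author.
        initial_q = ' '.join(title_tokens)
        xref_q = '+'.join(author_tokens)
    else:
        # Multi-word author: search by author, cross-check with the longest
        # title token, which is the most selective single word.
        initial_q = ' '.join(author_tokens)
        for token in title_tokens:
            if len(xref_q) < len(token):
                xref_q = token
    return initial_q, xref_q

def narrow_xref(xref_q):
    # Mirrors the retry branch: when the result set is too broad, fall back
    # to the longest individual token of a multi-token cross-reference,
    # or drop the cross-reference entirely.
    if '+' in xref_q:
        return max(xref_q.split('+'), key=len)
    return ''

print(build_queries(['Sea', 'Kings', 'Daughter'], ['Elizabeth', 'Peters']))
# -> ('Elizabeth Peters', 'Daughter')
print(narrow_xref('Elizabeth+Peters'))  # -> 'Elizabeth'
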