diff --git a/resources/recipes/new_london_day.recipe b/resources/recipes/new_london_day.recipe
new file mode 100644
index 0000000000..bc8c44e40e
--- /dev/null
+++ b/resources/recipes/new_london_day.recipe
@@ -0,0 +1,74 @@
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294342201(BasicNewsRecipe):
+ title = u'New London Day'
+ __author__ = 'Being'
+ description = 'State, local and business news from New London, CT'
+    language = 'en'
+ oldest_article = 1
+ max_articles_per_feed = 200
+
+ use_embedded_content = False
+ no_stylesheets = True
+ remove_javascript = True
+    remove_tags_before = dict(id='article')
+    remove_tags_after = [dict(id='article'), {'class':['photo_article',]}]
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+        dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+        dict(name=['script', 'noscript', 'style']),
+        {'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
+        {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
+        dict(name='font',attrs={'id':["cr-other-headlines"]})]
+ extra_css = '''
+ h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
+ p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
+ .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+ body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
+ '''
+
+ feeds = [
+ (u'All News', u'http://www.theday.com/section/rss'),
+ (u'Breaking News', u'http://www.theday.com/section/rss01'),
+ (u'Police and Courts', u'http://www.theday.com/section/rss02'),
+ (u'State News', u'http://www.theday.com/section/rss03'),
+ (u'Local Business', u'http://www.theday.com/section/rss04'),
+ (u'Entertainment', u'http://www.theday.com/section/rss05'),
+ (u'Opinion', u'http://www.theday.com/section/rss06'),
+ (u'Casinos', u'http://www.theday.com/section/rss12'),
+ (u'Defense and Military', u'http://www.theday.com/section/rss14'),
+ (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
+ (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
+ (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
+ (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]
+
+ def print_version(self, url):
+ return url.replace('/index.html', '/print.html')
+
+ def get_article_url(self, article):
+ return article.get('feedburner_origlink', article.get('guid', article.get('link')))
+
+
+ def postprocess_html(self, soup, first_fetch):
+ for t in soup.findAll(['table', 'tr', 'td']):
+ t.name = 'div'
+
+        for tag in soup.findAll('form', attrs={'name':'comments_form'}):
+            tag.extract()
+        for tag in soup.findAll('font', attrs={'id':'cr-other-headlines'}):
+            tag.extract()
+
+ return soup
+
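The get_article_url/print_version pair above is the usual idiom for FeedBurner-wrapped feeds: prefer the publisher's original link, fall back to the RSS guid, then the plain link, and rewrite the article page to its printer-friendly twin. A minimal sketch of the same lookup, using a made-up article dict rather than calibre's real feed object:

    def pick_url(article):
        # Prefer the publisher's original link, then the RSS guid, then the plain link.
        return article.get('feedburner_origlink',
                           article.get('guid', article.get('link')))

    sample = {'guid': 'http://www.theday.com/article/20110108/NWS01/1234567/index.html'}
    print(pick_url(sample))
    # print_version then maps the article page to its print view:
    print(pick_url(sample).replace('/index.html', '/print.html'))
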
diff --git a/resources/recipes/njp.recipe b/resources/recipes/njp.recipe
index f2a427072b..996aef2fdf 100644
--- a/resources/recipes/njp.recipe
+++ b/resources/recipes/njp.recipe
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
-__copyright__ = 'Chema Cort閟 - 2011-01-05'
+__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
__version__ = 'v0.01'
__date__ = '2011-01-05'
'''
@@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class NewJournalOfPhysics(BasicNewsRecipe):
title = u'New Journal of Physics'
- __author__ = u'Chema Cort閟'
+ __author__ = u'Chema Cort\xe9s'
description = u'The open-access journal for physics'
publisher = u'IOP (Institute of Physics)'
category = 'physics, journal, science'
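The njp.recipe hunk is a pure encoding repair: the UTF-8 bytes of 'Cortés' were at some point decoded with a CJK codec, which is how mojibake like 'Cort閟' arises. A minimal round trip showing that failure mode and its recovery (GBK is an assumption about the codec involved; the commit itself only records the corrected text):

    name = u'Cort\xe9s'
    garbled = name.encode('utf-8').decode('gbk')  # UTF-8 bytes misread as GBK
    restored = garbled.encode('gbk').decode('utf-8')
    assert restored == name
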
diff --git a/resources/recipes/walla.recipe b/resources/recipes/walla.recipe
new file mode 100644
index 0000000000..5fbfed7a03
--- /dev/null
+++ b/resources/recipes/walla.recipe
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1283848012(BasicNewsRecipe):
+    description = 'News from Walla (walla.co.il).'
+ cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
+ title = u'Walla'
+ language = 'he'
+ __author__ = 'marbs'
+    extra_css = 'img {max-width:100%;} body{direction: rtl;} title{direction: rtl;} .article_description{direction: rtl;} a.article{direction: rtl;} .calibre_feed_description{direction: rtl;}'
+ simultaneous_downloads = 5
+# remove_javascript = True
+ timefmt = '[%a, %d %b, %Y]'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ # remove_attributes = ['width']
+    keep_only_tags = [dict(name='div', attrs={'class':'wp-0-b w3'})]
+    remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
+#    preprocess_regexps = [
+#        (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: '')
+#    ]
+
+
+    feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
+             (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
+             (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
+             (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
+             (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
+             (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
+             (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
+             (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
+             (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
+             (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
+             (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
+             (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
+             (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
+             (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]
+
+ def print_version(self, url):
+ print_url = url + '/@@/item/printer'
+ return print_url
+
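Walla's printer view hangs off a fixed path suffix, so print_version is plain concatenation; a quick check with a hypothetical item URL:

    def walla_print_version(url):
        # The printer-friendly rendering lives under <article-url>/@@/item/printer.
        return url + '/@@/item/printer'

    print(walla_print_version('http://news.walla.co.il/?w=/1/1234567'))
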
diff --git a/resources/viewer/bookmarks.js b/resources/viewer/bookmarks.js
index d36e7c579a..253524326f 100644
--- a/resources/viewer/bookmarks.js
+++ b/resources/viewer/bookmarks.js
@@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
$.scrollTo($(bm[0]), 1000,
{
over:ratio,
+ axis: 'y', // Do not scroll in the x direction
onAfter:function(){window.py_bridge.animated_scroll_done()}
}
);
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3ff816b3bf..29006ffd9b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -51,16 +51,16 @@ def chap_head(match):
chap = match.group('chap')
title = match.group('title')
if not title:
-        return '<h1>'+chap+'</h1><br/>\n'
+        return '<h1>'+chap+'</h1><br/>\n'
     else:
-        return '<h1>'+chap+'</h1><br/>\n'+title+'<br/>\n'
+        return '<h1>'+chap+'</h1><br/>\n'+title+'<br/>\n'
def wrap_lines(match):
ital = match.group('ital')
if not ital:
- return ' '
+ return ' '
else:
- return ital+' '
+ return ital+' '
class DocAnalysis(object):
'''
@@ -191,7 +191,7 @@ class Dehyphenator(object):
dehyphenated = unicode(firsthalf) + unicode(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
- lookupword = self.removeprefix.sub('', lookupword)
+ lookupword = self.removeprefix.sub('', lookupword)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
@@ -353,7 +353,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
                  # Center separator lines
-                 (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+                 (re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
                  # Remove page links
                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -363,13 +363,11 @@ class HTMLPreProcessor(object):
              # Remove gray background
              (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
-             # Detect Chapters to match default XPATH in GUI
-             (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
-             # Cover the case where every letter in a chapter title is separated by a space
-             (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
+             # Convert line breaks to paragraphs
+             (re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
+             (re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
+             (re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
-             # Have paragraphs show better
-             (re.compile(r'<br.*?>'), lambda match : '<p>'),
              # Clean up spaces
              (re.compile(u'(?<=[\.,;\?!…"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
@@ -455,9 +453,9 @@ class HTMLPreProcessor(object):
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
-            end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
-            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
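These reworked soft-hyphen rules assume the earlier "convert line breaks to paragraphs" pass has already run, so a word hyphenated at a line break now spans a </p>/<p> boundary. A sketch of the first rule in isolation (simplified lookahead; the committed pattern also accepts a literal '[' and digits after the break):

    import re

    sample = u'con\xad</p>\n<p>tinued on the next page</p>'
    rule = re.compile(u'[\xad](</p>\s*<p>\s*)+\s*(?=[a-z])')
    print(rule.sub(u'', sample))  # -> continued on the next page</p>
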
@@ -475,7 +473,7 @@ class HTMLPreProcessor(object):
            end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
-                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -508,7 +506,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(html,'pdf', length)
+ html = dehyphenator(html,'html', length)
+
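+        # For pdftohtml output, additionally run the shared PreProcessor chapter
+        # heuristics; the 7000-word floor skips article-length conversions.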
+ if is_pdftohtml:
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pdf_markup = PreProcessor(self.extra_opts, None)
+ totalwords = 0
+ totalwords = pdf_markup.get_word_count(html)
+ if totalwords > 7000:
+ html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
@@ -554,5 +560,9 @@ class HTMLPreProcessor(object):
html = smartyPants(html)
html = html.replace(start, '')
+ # convert ellipsis to entities to prevent wrapping
+        html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+ # convert double dashes to em-dash
+ html = re.sub('\s--\s', u'\u2014', html)
return substitute_entites(html)
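The two substitutions added after the smartyPants pass are easy to sanity-check in isolation; the patterns below are copied from the hunk, the sample text is mine:

    import re

    text = u'Wait for it . . . and then -- the reveal.'
    text = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', text)
    text = re.sub('\s--\s', u'\u2014', text)
    print(text)  # Wait for it&hellip; and then—the reveal.
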
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 11979b933c..1bb232c911 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
+from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object):
@@ -17,6 +19,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts
+ def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -64,7 +69,7 @@ class PreProcessor(object):
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
-        htm_end_ere = re.compile('</p>', re.DOTALL)
+        htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
@@ -101,36 +106,125 @@ class PreProcessor(object):
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
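+    # get_word_count strips the <head> block and all remaining tags, then counts the
+    # words that are left; preprocess.py reuses it to gate the PDF chapter heuristics.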
+ def get_word_count(self, html):
+        word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+ word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+ wordcount = get_wordcount_obj(word_count_text)
+ return wordcount.words
+
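+    # markup_chapters tries progressively looser heading patterns until the estimated
+    # minimum number of sections has been marked up via chapter_head.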
+ def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+ # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+ # minimum of chapters to search for
+ self.min_chapters = 1
+ if wordcount > 7000:
+ self.min_chapters = int(ceil(wordcount / 7000.))
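+            # e.g. a 70,000-word book yields min_chapters = ceil(70000 / 7000.) = 10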
+ #print "minimum chapters required are: "+str(self.min_chapters)
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+
+ # Build the Regular Expressions in pieces
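+        # A candidate heading is a whole <p>/<div> line, optionally wrapped in up to
+        # three nested inline tags; naming the opens (outer, inner1..inner3) lets the
+        # close patterns match them with (?P=...) backreferences.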
+ init_lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
+        chapter_header_close = ")\s*"
+        title_header_close = ")"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
+ is_pdftohtml = self.is_pdftohtml(html)
+ if is_pdftohtml:
+            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+            title_line_open = "<(?P<outer2>p)[^>]*>\s*"
+            title_line_close = "\s*</(?P=outer2)>"
+
+
+ if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+ else:
+ blank_lines = ""
+ opt_title_open = "("
+ opt_title_close = ")?"
+ n_lookahead_open = "\s+(?!"
+ n_lookahead_close = ")"
+
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+
+ chapter_types = [
+ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+ [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+ [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
+ [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+ [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
+ [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+ ]
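+        # Each entry: [heading pattern, match case-insensitively?, log message]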
+
+ # Start with most typical chapter headings, get more aggressive until one works
+ for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+ if self.html_preprocess_sections >= self.min_chapters:
+ break
+ full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
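+            # The chapter pattern is re-embedded in a negative lookahead below; re.sub
+            # mangles its group names (outer -> lookahead_ter, ...) so they cannot
+            # collide with the groups already used in full_chapter_line.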
+ n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+ self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+ if lookahead_ignorecase:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ else:
+ chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+ chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+ html = chapdetect.sub(self.chapter_head, html)
+
+ words_per_chptr = wordcount
+ if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+ words_per_chptr = wordcount / self.html_preprocess_sections
+ self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+ return html
+
+
+
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
+ # Count the words in the document to estimate how many chapters to look for and whether
+ # other types of processing are attempted
+ totalwords = 0
+ totalwords = self.get_word_count(html)
+
+ if totalwords < 20:
+ self.log("not enough text, not preprocessing")
+ return html
+
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P