mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge from trunk
This commit is contained in:
commit
15133ee8f1
74
resources/recipes/new_london_day.recipe
Normal file
74
resources/recipes/new_london_day.recipe
Normal file
@ -0,0 +1,74 @@
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1294342201(BasicNewsRecipe):
    '''
    Recipe for The Day (New London, CT), built from the site's section
    RSS feeds. Articles are fetched in their print layout and stripped of
    navigation, tool bars, related-article rails and comment forms.
    '''

    title = u'New London Day'
    __author__ = 'Being'
    description = 'State, local and business news from New London, CT'
    # The paper is published in Connecticut, so use plain 'en' instead of
    # the British 'en_GB' locale the recipe was originally tagged with.
    language = 'en'
    oldest_article = 1
    max_articles_per_feed = 200

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_tags_before = dict(id='article')

    # NOTE: the original recipe assigned remove_tags_after and remove_tags
    # twice. In a class body the later assignment wins, so only the
    # effective (second) definitions are kept here; the shadowed ones were
    # dead code and have been dropped without changing what gets removed.
    remove_tags_after = [{'class': ['photo_article']}]
    remove_tags = [
        {'id': ["moduleArticleTools", "content-bottom", "rail",
                "articleRelates module", "toolSet", "relatedrailcontent",
                "div-wrapper", "beta", "atp-comments", "footer"]},
        {'class': ["clearfix", "relatedTitle", "articleRelates module",
                   "asset-footer", "tools", "comments", "featurePromo",
                   "featurePromo fp-topjobs brownBackground",
                   "clearfix fullSpan brownBackground", "curvedContent"]},
        dict(name='font', attrs={'id': ["cr-other-headlines"]}),
    ]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    feeds = [
        (u'All News', u'http://www.theday.com/section/rss'),
        (u'Breaking News', u'http://www.theday.com/section/rss01'),
        (u'Police and Courts', u'http://www.theday.com/section/rss02'),
        (u'State News', u'http://www.theday.com/section/rss03'),
        (u'Local Business', u'http://www.theday.com/section/rss04'),
        (u'Entertainment', u'http://www.theday.com/section/rss05'),
        (u'Opinion', u'http://www.theday.com/section/rss06'),
        (u'Casinos', u'http://www.theday.com/section/rss12'),
        (u'Defense and Military', u'http://www.theday.com/section/rss14'),
        (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
        (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
        (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
        (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),
    ]

    def print_version(self, url):
        '''Return the print-layout URL by swapping index.html for print.html.'''
        return url.replace('/index.html', '/print.html')

    def get_article_url(self, article):
        '''
        Pick the article URL from a feed entry, preferring the
        'feedburner_origlink' key, then 'guid', then 'link'.
        '''
        fallback = article.get('guid', article.get('link'))
        return article.get('feedburner_origlink', fallback)

    def postprocess_html(self, soup, first_fetch):
        '''
        Flatten table markup into divs and drop the comment form and the
        "other headlines" font block. first_fetch is not used here.
        Returns the modified soup.
        '''
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'

        # The attribute filter must be passed as the ``attrs`` keyword.
        # The original passed dict(attrs={...}) positionally, which made
        # findAll look for an attribute literally named "attrs" and so
        # never matched anything.
        for tag in soup.findAll('form', attrs={'name': ["comments_form"]}):
            tag.extract()
        for tag in soup.findAll('font', attrs={'id': ["cr-other-headlines"]}):
            tag.extract()

        return soup
|
||||
|
@ -2,7 +2,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = 'Chema Cortés - 2011-01-05'
|
||||
__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
|
||||
__version__ = 'v0.01'
|
||||
__date__ = '2011-01-05'
|
||||
'''
|
||||
@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NewJournalOfPhysics(BasicNewsRecipe):
|
||||
title = u'New Journal of Physics'
|
||||
__author__ = u'Chema Cortés'
|
||||
__author__ = u'Chema Cort\xe9s'
|
||||
description = u'The open-access journal for physics'
|
||||
publisher = u'IOP (Institute of Physics)'
|
||||
category = 'physics, journal, science'
|
||||
|
44
resources/recipes/walla.recipe
Normal file
44
resources/recipes/walla.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    '''
    Recipe for the Hebrew-language Walla news site, built from its
    per-section RSS feeds and using the printer-friendly article pages.
    '''

    description = 'The WallaNews.'
    cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
    title = u'Walla'
    language = 'he'
    __author__ = 'marbs'

    # Right-to-left layout for the Hebrew text. The original string put
    # commas between the rules ("}," / " ,"), which makes every rule after
    # the first two invalid CSS, and wrote two class names without the
    # leading dot.
    # NOTE(review): article_description / calibre_feed_description are
    # presumably classes on calibre's generated index pages - confirm.
    extra_css = (
        'img {max-width:100%;} '
        'body{direction: rtl;} '
        'title{direction: rtl;} '
        '.article_description{direction: rtl;} '
        'a.article{direction: rtl;} '
        '.calibre_feed_description{direction: rtl;}'
    )

    simultaneous_downloads = 5
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    # Was assigned twice with the same value in the original recipe.
    max_articles_per_feed = 100

    # A list of tag specs, matching the shape of remove_tags below.
    keep_only_tags = [dict(name='div', attrs={'class': 'wp-0-b w3'})]
    remove_tags = [dict(name='div', attrs={'class': 'tagsContainer'})]

    feeds = [
        (u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
        (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
        (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
        (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
        (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
        (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
        (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
        (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
        (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
        (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
        (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
        (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
        (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
        (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss'),
    ]

    def print_version(self, url):
        '''Return the printer-friendly URL for an article URL.'''
        return url + '/@@/item/printer'
|
||||
|
@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
|
||||
$.scrollTo($(bm[0]), 1000,
|
||||
{
|
||||
over:ratio,
|
||||
axis: 'y', // Do not scroll in the x direction
|
||||
onAfter:function(){window.py_bridge.animated_scroll_done()}
|
||||
}
|
||||
);
|
||||
|
@ -353,7 +353,7 @@ class HTMLPreProcessor(object):
|
||||
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||
|
||||
# Center separator lines
|
||||
(re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||
(re.compile(u'<br>\s*(?P<break>([*#•✦]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
|
||||
|
||||
# Remove page links
|
||||
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
|
||||
@ -363,13 +363,11 @@ class HTMLPreProcessor(object):
|
||||
# Remove gray background
|
||||
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
|
||||
|
||||
# Detect Chapters to match default XPATH in GUI
|
||||
(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
|
||||
# Cover the case where every letter in a chapter title is separated by a space
|
||||
(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
|
||||
# Convert line breaks to paragraphs
|
||||
(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
|
||||
(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
|
||||
(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
|
||||
|
||||
# Have paragraphs show better
|
||||
(re.compile(r'<br.*?>'), lambda match : '<p>'),
|
||||
# Clean up spaces
|
||||
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
|
||||
# Add space before and after italics
|
||||
@ -455,9 +453,9 @@ class HTMLPreProcessor(object):
|
||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
if is_pdftohtml:
|
||||
# unwrap/delete soft hyphens
|
||||
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(u'[](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
# unwrap/delete soft hyphens with formatting
|
||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||
|
||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
||||
# reduce false positives and move after header/footer removal
|
||||
@ -475,7 +473,7 @@ class HTMLPreProcessor(object):
|
||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
|
||||
for rule in self.PREPROCESS + start_rules:
|
||||
@ -508,7 +506,15 @@ class HTMLPreProcessor(object):
|
||||
if is_pdftohtml and length > -1:
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator()
|
||||
html = dehyphenator(html,'pdf', length)
|
||||
html = dehyphenator(html,'html', length)
|
||||
|
||||
if is_pdftohtml:
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
pdf_markup = PreProcessor(self.extra_opts, None)
|
||||
totalwords = 0
|
||||
totalwords = pdf_markup.get_word_count(html)
|
||||
if totalwords > 7000:
|
||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||
|
||||
#dump(html, 'post-preprocess')
|
||||
|
||||
@ -554,5 +560,9 @@ class HTMLPreProcessor(object):
|
||||
html = smartyPants(html)
|
||||
html = html.replace(start, '<!--')
|
||||
html = html.replace(stop, '-->')
|
||||
# convert ellipsis to entities to prevent wrapping
|
||||
html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
|
||||
# convert double dashes to em-dash
|
||||
html = re.sub('\s--\s', u'\u2014', html)
|
||||
return substitute_entites(html)
|
||||
|
||||
|
@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from math import ceil
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
|
||||
class PreProcessor(object):
|
||||
|
||||
@ -17,6 +19,9 @@ class PreProcessor(object):
|
||||
self.found_indents = 0
|
||||
self.extra_opts = extra_opts
|
||||
|
||||
def is_pdftohtml(self, src):
    '''
    Report whether src carries the marker comment written by calibre's
    pdftohtml. Only the first 1000 characters of src are inspected.
    '''
    marker = '<!-- created by calibre\'s pdftohtml -->'
    document_start = src[:1000]
    return document_start.find(marker) != -1
|
||||
|
||||
def chapter_head(self, match):
|
||||
chap = match.group('chap')
|
||||
title = match.group('title')
|
||||
@ -64,7 +69,7 @@ class PreProcessor(object):
|
||||
inspect. Percent is the minimum percent of line endings which should
|
||||
be marked up to return true.
|
||||
'''
|
||||
htm_end_ere = re.compile('</p>', re.DOTALL)
|
||||
htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
|
||||
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
|
||||
htm_end = htm_end_ere.findall(raw)
|
||||
line_end = line_end_ere.findall(raw)
|
||||
@ -101,12 +106,101 @@ class PreProcessor(object):
|
||||
with open(os.path.join(odir, name), 'wb') as f:
|
||||
f.write(raw.encode('utf-8'))
|
||||
|
||||
def get_word_count(self, html):
    '''
    Return the number of words in the visible text of html.

    The <head> section is removed first, then every remaining tag is
    stripped, and the leftover text is handed to get_wordcount_obj.
    '''
    without_head = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    plain_text = re.sub(r'<[^>]*>', '', without_head)
    return get_wordcount_obj(plain_text).words
|
||||
|
||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
    '''
    Search html for lines that look like chapter headings and rewrite
    them through self.chapter_head.

    A list of progressively more aggressive heading patterns is tried in
    order; the search stops as soon as self.html_preprocess_sections
    reaches self.min_chapters.

    :param html: the markup to scan
    :param wordcount: word count of the document, used to derive
        self.min_chapters and the words-per-section figure that is logged
    :param blanks_between_paragraphs: when true, up to two empty <p>
        elements are tolerated between a chapter line and what follows
    :return: html with the substitutions applied
    '''
    # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
    # minimum of chapters to search for
    self.min_chapters = 1
    if wordcount > 7000:
        self.min_chapters = int(ceil(wordcount / 7000.))
        #print "minimum chapters required are: "+str(self.min_chapters)
    # Seed the section counter with the <h1>-<h3> tags already in the markup
    heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
    self.html_preprocess_sections = len(heading.findall(html))
    self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

    # Build the Regular Expressions in pieces
    init_lookahead = "(?=<(p|div))"
    # Opening of a block with up to three nested inline wrappers; the named
    # groups are back-referenced by the matching *_line_close pattern
    chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
    title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
    chapter_header_open = r"(?P<chap>"
    title_header_open = r"(?P<title>"
    chapter_header_close = ")\s*"
    title_header_close = ")"
    chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
    title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"

    # For pdftohtml output replace the line patterns with simpler ones that
    # only accept <p> blocks (plus one optional i/b/u tag on chapter lines)
    is_pdftohtml = self.is_pdftohtml(html)
    if is_pdftohtml:
        chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
        chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
        title_line_open = "<(?P<outer2>p)[^>]*>\s*"
        title_line_close = "\s*</(?P=outer2)>"


    if blanks_between_paragraphs:
        blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
    else:
        blank_lines = ""
    # Wrappers that make the title line optional and that open/close the
    # negative lookahead built from n_lookahead below
    opt_title_open = "("
    opt_title_close = ")?"
    n_lookahead_open = "\s+(?!"
    n_lookahead_close = ")"

    default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"

    # Each entry is [pattern, lookahead_ignorecase, log message]
    chapter_types = [
        [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
        [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
        [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
        [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
        [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
        [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
        [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
        ]

    # Start with most typical chapter headings, get more aggressive until one works
    for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
        # presumably self.chapter_head bumps html_preprocess_sections for
        # every heading it marks; verify against chapter_head
        if self.html_preprocess_sections >= self.min_chapters:
            break
        full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
        # Copy of the chapter line with "ou", "in" and "cha" replaced, which
        # renames the groups (outer, inner1-3, chap) so the copy can be
        # embedded in the same pattern as a negative lookahead.
        # NOTE(review): the replacement is purely textual, so it would also
        # rewrite those letter sequences anywhere else in the pattern.
        n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
        self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
        if lookahead_ignorecase:
            # Lookahead placed before the optional title line; case-insensitive
            chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
        else:
            # Lookahead placed after the optional title line; case-sensitive
            chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
            chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
        html = chapdetect.sub(self.chapter_head, html)

    # Average words per section, computed only for the log line below
    words_per_chptr = wordcount
    if words_per_chptr > 0 and self.html_preprocess_sections > 0:
        words_per_chptr = wordcount / self.html_preprocess_sections
    self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
    return html
|
||||
|
||||
|
||||
|
||||
def __call__(self, html):
|
||||
self.log("********* Preprocessing HTML *********")
|
||||
|
||||
# Count the words in the document to estimate how many chapters to look for and whether
|
||||
# other types of processing are attempted
|
||||
totalwords = 0
|
||||
totalwords = self.get_word_count(html)
|
||||
|
||||
if totalwords < 20:
|
||||
self.log("not enough text, not preprocessing")
|
||||
return html
|
||||
|
||||
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
|
||||
html = re.sub(r"\s*</p>", "</p>\n", html)
|
||||
html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
|
||||
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
|
||||
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
|
||||
|
||||
###### Check Markup ######
|
||||
#
|
||||
@ -141,12 +235,17 @@ class PreProcessor(object):
|
||||
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
|
||||
# remove remaining non-breaking spaces
|
||||
html = re.sub(ur'\u00a0', ' ', html)
|
||||
# Get rid of various common microsoft specific tags which can cause issues later
|
||||
# Get rid of empty <o:p> tags to simplify other processing
|
||||
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
|
||||
# Delete microsoft 'smart' tags
|
||||
html = re.sub('(?i)</?st1:\w+>', '', html)
|
||||
# Get rid of empty span, bold, & italics tags
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
|
||||
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
|
||||
# ADE doesn't render <br />, change to empty paragraphs
|
||||
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
|
||||
|
||||
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
|
||||
# paragraph spacing then delete blank lines to clean up spacing
|
||||
@ -168,59 +267,12 @@ class PreProcessor(object):
|
||||
#print "blanks between paragraphs is marked True"
|
||||
else:
|
||||
blanks_between_paragraphs = False
|
||||
|
||||
#self.dump(html, 'before_chapter_markup')
|
||||
# detect chapters/sections to match xpath or splitting logic
|
||||
#
|
||||
# Build the Regular Expressions in pieces
|
||||
init_lookahead = "(?=<(p|div))"
|
||||
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
|
||||
title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
|
||||
chapter_header_open = r"(?P<chap>"
|
||||
title_header_open = r"(?P<title>"
|
||||
chapter_header_close = ")\s*"
|
||||
title_header_close = ")"
|
||||
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
|
||||
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
|
||||
|
||||
if blanks_between_paragraphs:
|
||||
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
|
||||
else:
|
||||
blank_lines = ""
|
||||
opt_title_open = "("
|
||||
opt_title_close = ")?"
|
||||
n_lookahead_open = "\s+(?!"
|
||||
n_lookahead_close = ")"
|
||||
|
||||
default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
|
||||
|
||||
min_chapters = 10
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||
|
||||
chapter_types = [
|
||||
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
|
||||
[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
|
||||
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
|
||||
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
|
||||
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
|
||||
]
|
||||
|
||||
# Start with most typical chapter headings, get more aggressive until one works
|
||||
for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
|
||||
if self.html_preprocess_sections >= min_chapters:
|
||||
break
|
||||
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
|
||||
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
|
||||
self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
|
||||
if lookahead_ignorecase:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
|
||||
else:
|
||||
chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
|
||||
chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
|
||||
|
||||
html = chapdetect.sub(self.chapter_head, html)
|
||||
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
|
||||
|
||||
|
||||
###### Unwrap lines ######
|
||||
@ -247,7 +299,7 @@ class PreProcessor(object):
|
||||
# Calculate Length
|
||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||
length = docanalysis.line_length(unwrap_factor)
|
||||
self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
|
||||
self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
|
||||
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
||||
if hardbreaks or unwrap_factor < 0.4:
|
||||
self.log("Unwrapping required, unwrapping Lines")
|
||||
@ -260,7 +312,7 @@ class PreProcessor(object):
|
||||
self.log("Done dehyphenating")
|
||||
# Unwrap lines using punctation and line length
|
||||
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
|
||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
html = unwrap.sub(' ', html)
|
||||
#check any remaining hyphens, but only unwrap if there is a match
|
||||
dehyphenator = Dehyphenator()
|
||||
@ -276,7 +328,7 @@ class PreProcessor(object):
|
||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
|
||||
# If still no sections after unwrapping mark split points on lines with no punctuation
|
||||
if self.html_preprocess_sections < 5:
|
||||
if self.html_preprocess_sections < self.min_chapters:
|
||||
self.log("Looking for more split points based on punctuation,"
|
||||
" currently have " + unicode(self.html_preprocess_sections))
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
|
@ -16,6 +16,7 @@ import uuid
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre import prepare_string_for_xml
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
|
||||
@ -161,6 +162,23 @@ class FB2MLizer(object):
|
||||
text.append('<section>')
|
||||
self.section_level += 1
|
||||
|
||||
# Insert the title page / cover into the spine if it is not already referenced.
|
||||
title_name = u''
|
||||
if 'titlepage' in self.oeb_book.guide:
|
||||
title_name = 'titlepage'
|
||||
elif 'cover' in self.oeb_book.guide:
|
||||
title_name = 'cover'
|
||||
if title_name:
|
||||
title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
|
||||
if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
|
||||
self.oeb_book.spine.insert(0, title_item, True)
|
||||
# Create xhtml page to reference cover image so it can be used.
|
||||
if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
|
||||
id = unicode(self.oeb_book.metadata.cover[0])
|
||||
cover_item = self.oeb_book.manifest.ids[id]
|
||||
if cover_item.media_type in OEB_RASTER_IMAGES:
|
||||
self.insert_image_cover(cover_item.href)
|
||||
|
||||
for item in self.oeb_book.spine:
|
||||
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
|
||||
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||
@ -185,6 +203,17 @@ class FB2MLizer(object):
|
||||
|
||||
return ''.join(text) + '</body>'
|
||||
|
||||
def insert_image_cover(self, image_href):
    '''
    Add a generated XHTML page that shows the image at image_href to the
    manifest and insert it at position 0 of the spine.

    :param image_href: href of the cover image, used as the <img> src
    '''
    from calibre.ebooks.oeb.base import RECOVER_PARSER
    # Escape the href before interpolating it into markup so characters
    # such as & < > or quotes in a file name cannot break the attribute.
    safe_href = prepare_string_for_xml(image_href, attribute=True)
    try:
        root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, safe_href), parser=RECOVER_PARSER)
    except Exception:
        # Best effort, as in the original: fall back to parsing an empty
        # document. Narrowed from a bare except so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        root = etree.fromstring(u'', parser=RECOVER_PARSER)

    # item_id instead of id, to avoid shadowing the builtin
    item_id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
    item = self.oeb_book.manifest.add(item_id, href, guess_type(href)[0], data=root)
    self.oeb_book.spine.insert(0, item, True)
|
||||
|
||||
def fb2mlize_images(self):
|
||||
'''
|
||||
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.
|
||||
|
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
from calibre.ebooks.conversion.utils import PreProcessor
|
||||
@ -18,19 +18,6 @@ class PDBInput(InputFormatPlugin):
|
||||
description = 'Convert PDB to HTML'
|
||||
file_types = set(['pdb'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line represents '
|
||||
'a paragraph instead.')),
|
||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line starting with '
|
||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
||||
'Paragraphs end when the next line that starts with an indent '
|
||||
'is reached.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
header = PdbHeaderReader(stream)
|
||||
|
@ -22,7 +22,7 @@ class PDBOutput(OutputFormatPlugin):
|
||||
short_switch='f', choices=FORMAT_WRITERS.keys(),
|
||||
help=(_('Format to use inside the pdb container. Choices are:')+\
|
||||
' %s' % FORMAT_WRITERS.keys())),
|
||||
OptionRecommendation(name='output_encoding', recommended_value='cp1252',
|
||||
OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. ' \
|
||||
'The default is cp1252. Note: This option is not honored by all ' \
|
||||
|
@ -8,12 +8,11 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import struct
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
|
||||
class HeaderRecord(object):
|
||||
'''
|
||||
@ -33,9 +32,7 @@ class Reader(FormatReader):
|
||||
def __init__(self, header, stream, log, options):
|
||||
self.stream = stream
|
||||
self.log = log
|
||||
self.encoding = options.input_encoding
|
||||
self.single_line_paras = options.single_line_paras
|
||||
self.print_formatted_paras = options.print_formatted_paras
|
||||
self.options = options
|
||||
|
||||
self.sections = []
|
||||
for i in range(header.num_sections):
|
||||
@ -48,34 +45,29 @@ class Reader(FormatReader):
|
||||
|
||||
def decompress_text(self, number):
|
||||
if self.header_record.compression == 1:
|
||||
return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding)
|
||||
return self.section_data(number)
|
||||
if self.header_record.compression == 2 or self.header_record.compression == 258:
|
||||
from calibre.ebooks.compression.palmdoc import decompress_doc
|
||||
return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
|
||||
return decompress_doc(self.section_data(number))
|
||||
return ''
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
txt = ''
|
||||
raw_txt = ''
|
||||
|
||||
self.log.info('Decompressing text...')
|
||||
for i in range(1, self.header_record.num_records + 1):
|
||||
self.log.debug('\tDecompressing text section %i' % i)
|
||||
txt += self.decompress_text(i)
|
||||
raw_txt += self.decompress_text(i)
|
||||
|
||||
self.log.info('Converting text to OEB...')
|
||||
if self.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if self.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
stream = StringIO(raw_txt)
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(self.stream, 'pdb')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
return os.path.join(output_dir, 'metadata.opf')
|
||||
txt_plugin = plugin_for_input_format('txt')
|
||||
for option in txt_plugin.options:
|
||||
if not hasattr(self.options, option.option.name):
|
||||
setattr(self.options, option.name, option.recommended_value)
|
||||
|
||||
stream.seek(0)
|
||||
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
|
||||
|
@ -50,7 +50,8 @@ class Writer(FormatWriter):
|
||||
txt = writer.extract_content(oeb_book, self.opts)
|
||||
|
||||
self.log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline,
|
||||
txt).encode(self.opts.pdb_output_encoding, 'replace')
|
||||
|
||||
txt_length = len(txt)
|
||||
|
||||
|
@ -19,9 +19,6 @@ class Reader(FormatReader):
|
||||
self.stream = stream
|
||||
self.log = log
|
||||
self.options = options
|
||||
setattr(self.options, 'new_pdf_engine', False)
|
||||
setattr(self.options, 'no_images', False)
|
||||
setattr(self.options, 'unwrap_factor', 0.45)
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
self.log.info('Extracting PDF...')
|
||||
@ -32,6 +29,11 @@ class Reader(FormatReader):
|
||||
pdf.write(self.header.section_data(x))
|
||||
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
pdf_plugin = plugin_for_input_format('pdf')
|
||||
for option in pdf_plugin.options:
|
||||
if not hasattr(self.options, option.option.name):
|
||||
setattr(self.options, option.name, option.recommended_value)
|
||||
|
||||
pdf.seek(0)
|
||||
return plugin_for_input_format('pdf').convert(pdf, self.options,
|
||||
'pdf', self.log, [])
|
||||
return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})
|
||||
|
@ -8,12 +8,13 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, struct, zlib
|
||||
import struct
|
||||
import zlib
|
||||
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.ebooks.pdb.formatreader import FormatReader
|
||||
from calibre.ebooks.pdb.ztxt import zTXTError
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
|
||||
SUPPORTED_VERSION = (1, 40)
|
||||
|
||||
@ -38,9 +39,7 @@ class Reader(FormatReader):
|
||||
def __init__(self, header, stream, log, options):
|
||||
self.stream = stream
|
||||
self.log = log
|
||||
self.encoding = options.input_encoding
|
||||
self.single_line_paras = options.single_line_paras
|
||||
self.print_formatted_paras = options.print_formatted_paras
|
||||
self.options = options
|
||||
|
||||
self.sections = []
|
||||
for i in range(header.num_sections):
|
||||
@ -68,30 +67,25 @@ class Reader(FormatReader):
|
||||
def decompress_text(self, number):
|
||||
if number == 1:
|
||||
self.uncompressor = zlib.decompressobj()
|
||||
return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
|
||||
return self.uncompressor.decompress(self.section_data(number))
|
||||
|
||||
def extract_content(self, output_dir):
|
||||
txt = ''
|
||||
raw_txt = ''
|
||||
|
||||
self.log.info('Decompressing text...')
|
||||
for i in range(1, self.header_record.num_records + 1):
|
||||
self.log.debug('\tDecompressing text section %i' % i)
|
||||
txt += self.decompress_text(i)
|
||||
raw_txt += self.decompress_text(i)
|
||||
|
||||
self.log.info('Converting text to OEB...')
|
||||
if self.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if self.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
stream = StringIO(raw_txt)
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(self.stream, 'pdb')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
return os.path.join(output_dir, 'metadata.opf')
|
||||
txt_plugin = plugin_for_input_format('txt')
|
||||
for option in txt_plugin.options:
|
||||
if not hasattr(self.options, option.option.name):
|
||||
setattr(self.options, option.name, option.recommended_value)
|
||||
|
||||
stream.seek(0)
|
||||
return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
|
||||
|
@ -54,7 +54,8 @@ class Writer(FormatWriter):
|
||||
txt = writer.extract_content(oeb_book, self.opts)
|
||||
|
||||
self.log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
|
||||
txt = specified_newlines(TxtNewlines('windows').newline,
|
||||
txt).encode(self.opts.pdb_output_encoding, 'replace')
|
||||
|
||||
txt_length = len(txt)
|
||||
|
||||
|
@ -28,7 +28,7 @@ class PMLOutput(OutputFormatPlugin):
|
||||
file_type = 'pmlz'
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='output_encoding', recommended_value='cp1252',
|
||||
OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. ' \
|
||||
'The default is cp1252.')),
|
||||
@ -48,7 +48,7 @@ class PMLOutput(OutputFormatPlugin):
|
||||
pmlmlizer = PMLMLizer(log)
|
||||
pml = unicode(pmlmlizer.extract_content(oeb_book, opts))
|
||||
with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
|
||||
out.write(pml.encode(opts.output_encoding, 'replace'))
|
||||
out.write(pml.encode(opts.pml_output_encoding, 'replace'))
|
||||
|
||||
self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts)
|
||||
|
||||
|
@ -4,11 +4,9 @@ __license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from cStringIO import StringIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.compression.tcr import decompress
|
||||
|
||||
class TCRInput(InputFormatPlugin):
|
||||
@ -18,37 +16,20 @@ class TCRInput(InputFormatPlugin):
|
||||
description = 'Convert TCR files to HTML'
|
||||
file_types = set(['tcr'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line represents '
|
||||
'a paragraph instead.')),
|
||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line starting with '
|
||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
||||
'Paragraphs end when the next line that starts with an indent '
|
||||
'is reached.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
log.info('Decompressing text...')
|
||||
ienc = options.input_encoding if options.input_encoding else 'utf-8'
|
||||
txt = decompress(stream).decode(ienc, 'replace')
|
||||
raw_txt = decompress(stream)
|
||||
|
||||
log.info('Converting text to OEB...')
|
||||
if options.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if options.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
html = convert_basic(txt)
|
||||
with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
|
||||
index.write(html.encode('utf-8'))
|
||||
stream = StringIO(raw_txt)
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
mi = get_metadata(stream, 'tcr')
|
||||
manifest = [('index.html', None)]
|
||||
spine = ['index.html']
|
||||
opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
return os.path.join(os.getcwd(), 'metadata.opf')
|
||||
txt_plugin = plugin_for_input_format('txt')
|
||||
for option in txt_plugin.options:
|
||||
if not hasattr(options, option.option.name):
|
||||
setattr(options, option.name, option.recommended_value)
|
||||
|
||||
stream.seek(0)
|
||||
return txt_plugin.convert(stream, options,
|
||||
'txt', log, accelerators)
|
||||
|
@ -18,7 +18,7 @@ class TCROutput(OutputFormatPlugin):
|
||||
file_type = 'tcr'
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='output_encoding', recommended_value='utf-8',
|
||||
OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. ' \
|
||||
'The default is utf-8.')),
|
||||
@ -40,7 +40,7 @@ class TCROutput(OutputFormatPlugin):
|
||||
setattr(opts, 'indent_paras', False)
|
||||
|
||||
writer = TXTMLizer(log)
|
||||
txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
|
||||
txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
|
||||
|
||||
log.info('Compressing text...')
|
||||
txt = compress(txt)
|
||||
|
@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en'
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.ebooks.chardet import detect
|
||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||
preserve_spaces
|
||||
preserve_spaces, detect_paragraph_type, detect_formatting_type
|
||||
from calibre import _ent_pat, xml_entity_to_unicode
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
@ -20,45 +21,57 @@ class TXTInput(InputFormatPlugin):
|
||||
file_types = set(['txt'])
|
||||
|
||||
options = set([
|
||||
OptionRecommendation(name='single_line_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line represents '
|
||||
'a paragraph instead.')),
|
||||
OptionRecommendation(name='print_formatted_paras', recommended_value=False,
|
||||
help=_('Normally calibre treats blank lines as paragraph markers. '
|
||||
'With this option it will assume that every line starting with '
|
||||
'an indent (either a tab or 2+ spaces) represents a paragraph. '
|
||||
'Paragraphs end when the next line that starts with an indent '
|
||||
'is reached.')),
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=['auto', 'block', 'single', 'print'],
|
||||
help=_('Paragraph structure.\n'
|
||||
'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
|
||||
'* auto: Try to auto detect paragraph type.\n'
|
||||
'* block: Treat a blank line as a paragraph break.\n'
|
||||
'* single: Assume every line is a paragraph.\n'
|
||||
'* print: Assume every line starting with 2+ spaces or a tab '
|
||||
'starts a paragraph.')),
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=['auto', 'none', 'markdown'],
|
||||
help=_('Formatting used within the document.'
|
||||
'* auto: Try to auto detect the document formatting.\n'
|
||||
'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
|
||||
'* markdown: Run the input though the markdown pre-processor. '
|
||||
'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed.')),
|
||||
OptionRecommendation(name='markdown', recommended_value=False,
|
||||
help=_('Run the text input through the markdown pre-processor. To '
|
||||
'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
|
||||
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
|
||||
help=_('Do not insert a Table of Contents into the output text.')),
|
||||
])
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
ienc = stream.encoding if stream.encoding else 'utf-8'
|
||||
log.debug('Reading text from file...')
|
||||
|
||||
txt = stream.read()
|
||||
# Get the encoding of the document.
|
||||
if options.input_encoding:
|
||||
ienc = options.input_encoding
|
||||
log.debug('Reading text from file...')
|
||||
txt = stream.read().decode(ienc, 'replace')
|
||||
log.debug('Using user specified input encoding of %s' % ienc)
|
||||
else:
|
||||
det_encoding = detect(txt)
|
||||
ienc = det_encoding['encoding']
|
||||
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
|
||||
if not ienc:
|
||||
ienc = 'utf-8'
|
||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
# Adjust paragraph formatting as requested
|
||||
if options.single_line_paras:
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
if options.print_formatted_paras:
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
|
||||
if options.markdown:
|
||||
if options.formatting_type == 'markdown':
|
||||
log.debug('Running text though markdown conversion...')
|
||||
try:
|
||||
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
|
||||
@ -66,6 +79,22 @@ class TXTInput(InputFormatPlugin):
|
||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
|
||||
else:
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
if options.paragraph_type == 'single':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
@ -85,11 +114,10 @@ class TXTInput(InputFormatPlugin):
|
||||
htmlfile = open(fname, 'wb')
|
||||
with htmlfile:
|
||||
htmlfile.write(html.encode('utf-8'))
|
||||
cwd = os.getcwdu()
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log,
|
||||
{}, cwd)
|
||||
oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
|
||||
{})
|
||||
options.debug_pipeline = odi
|
||||
os.remove(htmlfile.name)
|
||||
return oeb
|
||||
|
@ -26,7 +26,7 @@ class TXTOutput(OutputFormatPlugin):
|
||||
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
|
||||
'For Mac OS X use \'unix\'. \'system\' will default to the newline '
|
||||
'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())),
|
||||
OptionRecommendation(name='output_encoding', recommended_value='utf-8',
|
||||
OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. ' \
|
||||
'The default is utf-8.')),
|
||||
@ -81,7 +81,7 @@ class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(txt.encode(opts.output_encoding, 'replace'))
|
||||
out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
|
@ -48,7 +48,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
|
||||
if isbytestring(txt):
|
||||
txt = txt.decode('utf-8')
|
||||
|
||||
|
||||
lines = []
|
||||
# Split into paragraphs based on having a blank line between text.
|
||||
for line in txt.split('\n\n'):
|
||||
@ -93,3 +92,54 @@ def split_string_separator(txt, size) :
|
||||
xrange(0, len(txt), size)])
|
||||
return txt
|
||||
|
||||
def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph structure of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with 2+ spaces or a tab
           and ends when a new paragraph is reached.

    The checks run in the order print, block. A check succeeds when the
    number of matching lines is at least 25% of the number of lines that
    contain text. If neither succeeds, single is assumed.

    Text without any non-blank line is reported as block instead of
    failing with a division by zero.

    returns block, single, print
    '''
    # Normalize Windows and old Mac line endings so the multi-line
    # regular expressions below only have to deal with '\n'.
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Nothing to measure against; the ratios below would divide by zero.
    if not txt_line_count:
        return 'block'

    # Check for print
    tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
    if tab_line_count / float(txt_line_count) >= .25:
        return 'print'

    # Check for block
    empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
    if empty_line_count / float(txt_line_count) >= .25:
        return 'block'

    # Nothing else matched, so assume single.
    return 'single'
|
||||
|
||||
def detect_formatting_type(txt):
    '''
    Guesses whether the document uses markdown formatting.

    The text is reported as markdown when any single kind of markup
    below occurs at least 5 times, or when any one backslash escaped
    character occurs more than 10 times.

    returns markdown, none
    '''
    markup_patterns = (
        # Headings
        r'(?mu)^#+',
        r'(?mu)^=+$',
        r'(?mu)^-+$',
        # Images
        r'(?u)!\[.*?\]\(.+?\)',
        # Links
        r'(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)',
    )
    for pattern in markup_patterns:
        if len(re.findall(pattern, txt)) >= 5:
            return 'markdown'

    # Escaped characters: each character below is looked for with a
    # backslash in front of it.
    escapable = '\\`*_{}[]()#+-.!'
    if any(txt.count('\\' + char) > 10 for char in escapable):
        return 'markdown'

    return 'none'
|
||||
|
@ -256,8 +256,10 @@ class BookInfo(QWebView):
|
||||
% (left_pane, right_pane)))
|
||||
|
||||
def mouseDoubleClickEvent(self, ev):
|
||||
if self.width() - ev.x() < 25 or \
|
||||
self.height() - ev.y() < 25:
|
||||
swidth = self.page().mainFrame().scrollBarGeometry(Qt.Vertical).width()
|
||||
sheight = self.page().mainFrame().scrollBarGeometry(Qt.Horizontal).height()
|
||||
if self.width() - ev.x() < swidth or \
|
||||
self.height() - ev.y() < sheight:
|
||||
# Filter out double clicks on the scroll bar
|
||||
ev.accept()
|
||||
else:
|
||||
|
@ -6,7 +6,7 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import textwrap
|
||||
import textwrap, codecs
|
||||
from functools import partial
|
||||
|
||||
from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, QLineEdit, QTextEdit, \
|
||||
@ -128,6 +128,7 @@ class Widget(QWidget):
|
||||
def get_value(self, g):
|
||||
from calibre.gui2.convert.xpath_wizard import XPathEdit
|
||||
from calibre.gui2.convert.regex_builder import RegexEdit
|
||||
from calibre.gui2.widgets import EncodingComboBox
|
||||
ret = self.get_value_handler(g)
|
||||
if ret != 'this is a dummy return value, xcswx1avcx4x':
|
||||
return ret
|
||||
@ -139,6 +140,13 @@ class Widget(QWidget):
|
||||
if not ans:
|
||||
ans = None
|
||||
return ans
|
||||
elif isinstance(g, EncodingComboBox):
|
||||
ans = unicode(g.currentText()).strip()
|
||||
try:
|
||||
codecs.lookup(ans)
|
||||
except:
|
||||
ans = ''
|
||||
return ans
|
||||
elif isinstance(g, QComboBox):
|
||||
return unicode(g.currentText())
|
||||
elif isinstance(g, QCheckBox):
|
||||
@ -192,6 +200,11 @@ class Widget(QWidget):
|
||||
if not val: val = ''
|
||||
getattr(g, 'setPlainText', g.setText)(val)
|
||||
getattr(g, 'setCursorPosition', lambda x: x)(0)
|
||||
elif isinstance(g, EncodingComboBox):
|
||||
if val:
|
||||
g.setEditText(val)
|
||||
else:
|
||||
g.setCurrentIndex(0)
|
||||
elif isinstance(g, QComboBox) and val:
|
||||
idx = g.findText(val, Qt.MatchFixedString)
|
||||
if idx < 0:
|
||||
@ -202,8 +215,6 @@ class Widget(QWidget):
|
||||
g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked)
|
||||
elif isinstance(g, (XPathEdit, RegexEdit)):
|
||||
g.edit.setText(val if val else '')
|
||||
elif isinstance(g, EncodingComboBox):
|
||||
g.setEditText(val if val else '')
|
||||
else:
|
||||
raise Exception('Can\'t set value %s in %s'%(repr(val),
|
||||
unicode(g.objectName())))
|
||||
|
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.gui2.convert.pdb_input_ui import Ui_Form
|
||||
from calibre.gui2.convert import Widget
|
||||
|
||||
class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
TITLE = _('PDB Input')
|
||||
HELP = _('Options specific to')+' PDB '+_('input')
|
||||
COMMIT_NAME = 'pdb_input'
|
||||
ICON = I('mimetypes/unknown.png')
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['single_line_paras', 'print_formatted_paras'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
@ -1,48 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ui version="4.0">
|
||||
<class>Form</class>
|
||||
<widget class="QWidget" name="Form">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>400</width>
|
||||
<height>300</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="2" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
</property>
|
||||
<property name="sizeHint" stdset="0">
|
||||
<size>
|
||||
<width>20</width>
|
||||
<height>213</height>
|
||||
</size>
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="0" column="0">
|
||||
<widget class="QCheckBox" name="opt_single_line_paras">
|
||||
<property name="text">
|
||||
<string>Treat each &line as a paragraph</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QCheckBox" name="opt_print_formatted_paras">
|
||||
<property name="text">
|
||||
<string>Assume print formatting</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
<connections/>
|
||||
</ui>
|
@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
|
||||
ICON = I('mimetypes/unknown.png')
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent, ['format', 'inline_toc', 'output_encoding'])
|
||||
Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
||||
|
@ -55,10 +55,21 @@
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="1">
|
||||
<widget class="QLineEdit" name="opt_output_encoding"/>
|
||||
<widget class="EncodingComboBox" name="opt_pdb_output_encoding">
|
||||
<property name="editable">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
<customwidget>
|
||||
<class>EncodingComboBox</class>
|
||||
<extends>QComboBox</extends>
|
||||
<header>widgets.h</header>
|
||||
</customwidget>
|
||||
</customwidgets>
|
||||
<resources/>
|
||||
<connections/>
|
||||
</ui>
|
||||
|
@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent, ['inline_toc', 'full_image_depth',
|
||||
'output_encoding'])
|
||||
'pml_output_encoding'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -14,7 +14,7 @@
|
||||
<string>Form</string>
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="3" column="0">
|
||||
<item row="4" column="0">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -27,32 +27,47 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<item row="2" column="0">
|
||||
<widget class="QCheckBox" name="opt_inline_toc">
|
||||
<property name="text">
|
||||
<string>&Inline TOC</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<item row="3" column="0">
|
||||
<widget class="QCheckBox" name="opt_full_image_depth">
|
||||
<property name="text">
|
||||
<string>Do not reduce image size and depth</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="0">
|
||||
<item row="1" column="0">
|
||||
<layout class="QHBoxLayout" name="horizontalLayout">
|
||||
<item>
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string>Output Encoding:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QLineEdit" name="opt_output_encoding"/>
|
||||
<item>
|
||||
<widget class="EncodingComboBox" name="opt_pml_output_encoding">
|
||||
<property name="editable">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
<customwidget>
|
||||
<class>EncodingComboBox</class>
|
||||
<extends>QComboBox</extends>
|
||||
<header>widgets.h</header>
|
||||
</customwidget>
|
||||
</customwidgets>
|
||||
<resources/>
|
||||
<connections/>
|
||||
</ui>
|
||||
|
@ -16,7 +16,10 @@ class PluginWidget(Widget, Ui_Form):
|
||||
|
||||
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
|
||||
Widget.__init__(self, parent,
|
||||
['single_line_paras', 'print_formatted_paras', 'markdown',
|
||||
'markdown_disable_toc', 'preserve_spaces'])
|
||||
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
|
||||
self.db, self.book_id = db, book_id
|
||||
for x in get_option('paragraph_type').option.choices:
|
||||
self.opt_paragraph_type.addItem(x)
|
||||
for x in get_option('formatting_type').option.choices:
|
||||
self.opt_formatting_type.addItem(x)
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
@ -6,7 +6,7 @@
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>470</width>
|
||||
<width>518</width>
|
||||
<height>300</height>
|
||||
</rect>
|
||||
</property>
|
||||
@ -15,47 +15,23 @@
|
||||
</property>
|
||||
<layout class="QGridLayout" name="gridLayout">
|
||||
<item row="0" column="0">
|
||||
<widget class="QCheckBox" name="opt_single_line_paras">
|
||||
<widget class="QLabel" name="label_2">
|
||||
<property name="text">
|
||||
<string>Treat each &line as a paragraph</string>
|
||||
<string>Paragraph style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QCheckBox" name="opt_print_formatted_paras">
|
||||
<item row="0" column="1">
|
||||
<widget class="QComboBox" name="opt_paragraph_type"/>
|
||||
</item>
|
||||
<item row="5" column="0" colspan="2">
|
||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||
<property name="text">
|
||||
<string>Assume print formatting</string>
|
||||
<string>Preserve &spaces</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0">
|
||||
<widget class="QCheckBox" name="opt_markdown">
|
||||
<property name="text">
|
||||
<string>Process using markdown</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="openExternalLinks">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="4" column="0">
|
||||
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
||||
<property name="text">
|
||||
<string>Do not insert Table of Contents into output text when using markdown</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="6" column="0">
|
||||
<item row="6" column="0" colspan="2">
|
||||
<spacer name="verticalSpacer">
|
||||
<property name="orientation">
|
||||
<enum>Qt::Vertical</enum>
|
||||
@ -68,32 +44,47 @@
|
||||
</property>
|
||||
</spacer>
|
||||
</item>
|
||||
<item row="5" column="0">
|
||||
<widget class="QCheckBox" name="opt_preserve_spaces">
|
||||
<item row="1" column="1">
|
||||
<widget class="QComboBox" name="opt_formatting_type"/>
|
||||
</item>
|
||||
<item row="1" column="0">
|
||||
<widget class="QLabel" name="label_3">
|
||||
<property name="text">
|
||||
<string>Preserve &spaces</string>
|
||||
<string>Formatting style:</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="0" rowspan="2" colspan="2">
|
||||
<widget class="QGroupBox" name="groupBox">
|
||||
<property name="title">
|
||||
<string>Markdown Options</string>
|
||||
</property>
|
||||
<layout class="QVBoxLayout" name="verticalLayout">
|
||||
<item>
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="text">
|
||||
<string><p>Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit <a href="http://daringfireball.net/projects/markdown">markdown</a>.</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="openExternalLinks">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="opt_markdown_disable_toc">
|
||||
<property name="text">
|
||||
<string>Do not insert Table of Contents into output text when using markdown</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<resources/>
|
||||
<connections>
|
||||
<connection>
|
||||
<sender>opt_markdown</sender>
|
||||
<signal>toggled(bool)</signal>
|
||||
<receiver>opt_markdown_disable_toc</receiver>
|
||||
<slot>setEnabled(bool)</slot>
|
||||
<hints>
|
||||
<hint type="sourcelabel">
|
||||
<x>76</x>
|
||||
<y>80</y>
|
||||
</hint>
|
||||
<hint type="destinationlabel">
|
||||
<x>418</x>
|
||||
<y>105</y>
|
||||
</hint>
|
||||
</hints>
|
||||
</connection>
|
||||
</connections>
|
||||
<connections/>
|
||||
</ui>
|
||||
|
@ -22,7 +22,7 @@ class PluginWidget(Widget, Ui_Form):
|
||||
Widget.__init__(self, parent,
|
||||
['newline', 'max_line_length', 'force_max_line_length',
|
||||
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
|
||||
'output_encoding'])
|
||||
'txt_output_encoding'])
|
||||
self.db, self.book_id = db, book_id
|
||||
self.initialize_options(get_option, get_help, db, book_id)
|
||||
|
||||
|
@ -96,10 +96,21 @@
|
||||
</widget>
|
||||
</item>
|
||||
<item row="2" column="1">
|
||||
<widget class="QLineEdit" name="opt_output_encoding"/>
|
||||
<widget class="EncodingComboBox" name="opt_txt_output_encoding">
|
||||
<property name="editable">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
<customwidgets>
|
||||
<customwidget>
|
||||
<class>EncodingComboBox</class>
|
||||
<extends>QComboBox</extends>
|
||||
<header>widgets.h</header>
|
||||
</customwidget>
|
||||
</customwidgets>
|
||||
<resources/>
|
||||
<connections/>
|
||||
</ui>
|
||||
|
@ -449,7 +449,7 @@ class Document(QWebPage): # {{{
|
||||
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
|
||||
|
||||
def set_bottom_padding(self, amount):
|
||||
s = QSize(-1, -1) if amount == 0 else QSize(self.width,
|
||||
s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(),
|
||||
self.height+amount)
|
||||
self.setPreferredContentsSize(s)
|
||||
|
||||
@ -820,6 +820,7 @@ class DocumentView(QWebView): # {{{
|
||||
self.flipper.initialize(self.current_page_image())
|
||||
self.manager.next_document()
|
||||
return
|
||||
#oheight = self.document.height
|
||||
lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
|
||||
max_y = self.document.height - window_height # The maximum possible top y co-ord
|
||||
if max_y < lower_limit:
|
||||
@ -835,6 +836,7 @@ class DocumentView(QWebView): # {{{
|
||||
if epf:
|
||||
self.flipper.initialize(self.current_page_image())
|
||||
#print 'Document height:', self.document.height
|
||||
#print 'Height change:', (self.document.height - oheight)
|
||||
max_y = self.document.height - window_height
|
||||
lower_limit = min(max_y, lower_limit)
|
||||
#print 'Scroll to:', lower_limit
|
||||
@ -842,6 +844,7 @@ class DocumentView(QWebView): # {{{
|
||||
self.document.scroll_to(self.document.xpos, lower_limit)
|
||||
actually_scrolled = self.document.ypos - opos
|
||||
#print 'After scroll pos:', self.document.ypos
|
||||
#print 'Scrolled by:', self.document.ypos - opos
|
||||
self.find_next_blank_line(window_height - actually_scrolled)
|
||||
#print 'After blank line pos:', self.document.ypos
|
||||
if epf:
|
||||
|
85
src/calibre/utils/wordcount.py
Normal file
85
src/calibre/utils/wordcount.py
Normal file
@ -0,0 +1,85 @@
|
||||
#!/usr/bin/python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
"""
|
||||
Get word, character, and Asian character counts
|
||||
|
||||
1. Get a word count as a dictionary:
|
||||
wc = get_wordcount(text)
|
||||
words = wc['words'] # etc.
|
||||
|
||||
2. Get a word count as an object
|
||||
wc = get_wordcount_obj(text)
|
||||
words = wc.words # etc.
|
||||
|
||||
properties counted:
|
||||
* characters
|
||||
* chars_no_spaces
|
||||
* asian_chars
|
||||
* non_asian_words
|
||||
* words
|
||||
|
||||
Sourced from:
|
||||
http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
|
||||
http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
|
||||
"""
|
||||
__version__ = 0.1
|
||||
__author__ = "Ryan Ginstrom"
|
||||
|
||||
IDEOGRAPHIC_SPACE = 0x3000


def is_asian(char):
    """Tell whether a single character counts as Asian.

    @param char: A one-character string
    @return: True when the character's code point is strictly greater
             than U+3000, False otherwise
    """
    code_point = ord(char)
    # U+3000 is the ideographic (double-byte) space; this module treats
    # every code point beyond it as an Asian character.
    return IDEOGRAPHIC_SPACE < code_point
|
||||
|
||||
def filter_jchars(c):
    """Map an Asian character to a single space.

    @param c: A one-character string
    @return: ' ' when is_asian(c) is true, otherwise c unchanged
    """
    return ' ' if is_asian(c) else c
|
||||
|
||||
def nonj_len(word):
    u"""Count the non-Asian words contained in {word}

    - 日本語AアジアンB -> 2
    - hello -> 1

    @param word: A word, possibly containing Asian characters
    @return: The number of whitespace-separated runs left once every
             Asian character has been replaced by a space
    """
    # Worked example:
    #   本spam日eggs
    #   -> ' spam eggs'       (each Asian character becomes a space)
    #   -> ['spam', 'eggs']   (split on whitespace)
    #   -> 2                  (number of pieces)
    blanked = u''.join(filter_jchars(c) for c in word)
    pieces = blanked.split()
    return len(pieces)
|
||||
|
||||
def get_wordcount(text):
    """Compute word and character statistics for a piece of text

    @param text: The text of the segment
    @return: A dictionary with the keys 'characters', 'chars_no_spaces',
             'asian_chars', 'non_asian_words' and 'words'
    """
    asian_chars = sum(1 for ch in text if is_asian(ch))
    non_asian_words = nonj_len(text)

    return {
        'characters': len(text),
        'chars_no_spaces': sum(1 for ch in text if not ch.isspace()),
        'asian_chars': asian_chars,
        'non_asian_words': non_asian_words,
        # Each Asian character is counted as one word of its own.
        'words': non_asian_words + asian_chars,
    }
|
||||
|
||||
def dict2obj(dictionary):
    """Wrap a dictionary in an object whose attributes are its items

    @param dictionary: Mapping of attribute names to values
    @return: A new Obj instance exposing each key as an attribute
    """
    class Obj(object):
        def __init__(self, attributes):
            # Copy the items straight into the instance namespace.
            vars(self).update(attributes)

    wrapped = Obj(dictionary)
    return wrapped
|
||||
|
||||
def get_wordcount_obj(text):
    """Return the counts from get_wordcount as an object

    @param text: The text of the segment
    @return: The result of dict2obj applied to get_wordcount(text), so
             each count is available as an attribute (e.g. wc.words)
    """
    counts = get_wordcount(text)
    return dict2obj(counts)
|
Loading…
x
Reference in New Issue
Block a user