Merge from trunk

Sengian 2011-01-07 08:26:32 +01:00
commit 15133ee8f1
34 changed files with 700 additions and 391 deletions

View File

@@ -0,0 +1,74 @@
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294342201(BasicNewsRecipe):
    title = u'New London Day'
    __author__ = 'Being'
    description = 'State, local and business news from New London, CT'
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 200
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = dict(id='article')
    remove_tags_after = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                   dict(name=['script', 'noscript', 'style'])]
    remove_tags_after = [ {'class':['photo_article',]} ]
    remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
                   {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
                   dict(name='font',attrs={'id':["cr-other-headlines"]})]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
        '''

    feeds = [
        (u'All News', u'http://www.theday.com/section/rss'),
        (u'Breaking News', u'http://www.theday.com/section/rss01'),
        (u'Police and Courts', u'http://www.theday.com/section/rss02'),
        (u'State News', u'http://www.theday.com/section/rss03'),
        (u'Local Business', u'http://www.theday.com/section/rss04'),
        (u'Entertainment', u'http://www.theday.com/section/rss05'),
        (u'Opinion', u'http://www.theday.com/section/rss06'),
        (u'Casinos', u'http://www.theday.com/section/rss12'),
        (u'Defense and Military', u'http://www.theday.com/section/rss14'),
        (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
        (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
        (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
        (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]

    def print_version(self, url):
        return url.replace('/index.html', '/print.html')

    def get_article_url(self, article):
        return article.get('feedburner_origlink', article.get('guid', article.get('link')))

    def postprocess_html(self, soup, first_fetch):
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'
        for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
            tag.extract()
        for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
            tag.extract()
        return soup

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
-__copyright__ = 'Chema Cortés - 2011-01-05'
+__copyright__ = u'Chema Cort\xe9s - 2011-01-05'
__version__ = 'v0.01'
__date__ = '2011-01-05'
'''
@@ -13,7 +13,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class NewJournalOfPhysics(BasicNewsRecipe):
title = u'New Journal of Physics'
-__author__ = u'Chema Cortés'
+__author__ = u'Chema Cort\xe9s'
description = u'The open-access journal for physics'
publisher = u'IOP (Institute of Physics)'
category = 'physics, journal, science'

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    description = 'The WallaNews.'
    cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/rsPhoto/sz_5/rsz_220_220_logo_walla.gif'
    title = u'Walla'
    language = 'he'
    __author__ = 'marbs'
    extra_css='img {max-width:100%;} body{direction: rtl;},title{direction: rtl; } ,article_description{direction: rtl; }, a.article{direction: rtl; } ,calibre_feed_description{direction: rtl; }'
    simultaneous_downloads = 5
    # remove_javascript = True
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    # remove_attributes = ['width']
    keep_only_tags =dict(name='div', attrs={'class':'wp-0-b w3'})
    remove_tags = [dict(name='div', attrs={'class':'tagsContainer'})]
    max_articles_per_feed = 100
    # preprocess_regexps = [
    #     (re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: '')
    # ]

    feeds = [(u'חדשות', u'http://rss.walla.co.il/?w=/1/0/1/@rss'),
             (u'עסקים', u'http://rss.walla.co.il/?w=/2/3/1/@rss'),
             (u'תרבות', u'http://rss.walla.co.il/?w=/4/249/1/@rss'),
             (u'בריאות', u'http://rss.walla.co.il/?w=/5/18/1/@rss'),
             (u'TECH', u'http://rss.walla.co.il/?w=/6/4/1/@rss'),
             (u'אסטרולוגיה', u'http://rss.walla.co.il/?w=/8/3307/1/@rss'),
             (u'בעלי חיים', u'http://rss.walla.co.il/?w=/59/5703/1/@rss'),
             (u'רכב', u'http://rss.walla.co.il/?w=/31/4700/1/@rss'),
             (u'סלבס', u'http://rss.walla.co.il/?w=/22/3600/1/@rss'),
             (u'אוכל', u'http://rss.walla.co.il/?w=/9/903/1/@rss'),
             (u'אופנה', u'http://rss.walla.co.il/?w=/24/2120/1/@rss'),
             (u'ברנזה', u'http://rss.walla.co.il/?w=/27/3900/1/@rss'),
             (u'ZONE', u'http://rss.walla.co.il/?w=/18/500/1/@rss'),
             (u'ספורט', u'http://rss.walla.co.il/?w=/3/7/1/@rss')]

    def print_version(self, url):
        print_url = url + '/@@/item/printer'
        return print_url

View File

@@ -41,6 +41,7 @@ function scroll_to_bookmark(bookmark) {
$.scrollTo($(bm[0]), 1000,
{
over:ratio,
+axis: 'y', // Do not scroll in the x direction
onAfter:function(){window.py_bridge.animated_scroll_done()}
}
);

View File

@@ -353,7 +353,7 @@ class HTMLPreProcessor(object):
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
# Remove page links
(re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
@@ -363,13 +363,11 @@ class HTMLPreProcessor(object):
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
-# Detect Chapters to match default XPATH in GUI
-(re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
-# Cover the case where every letter in a chapter title is separated by a space
-(re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
-# Have paragraphs show better
-(re.compile(r'<br.*?>'), lambda match : '<p>'),
+# Convert line breaks to paragraphs
+(re.compile(r'<br[^>]*>\s*'), lambda match : '</p>\n<p>'),
+(re.compile(r'<body[^>]*>\s*'), lambda match : '<body>\n<p>'),
+(re.compile(r'\s*</body>'), lambda match : '</p>\n</body>'),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
@@ -455,9 +453,9 @@ class HTMLPreProcessor(object):
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
-end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
-end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
@@ -475,7 +473,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
-(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -508,7 +506,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
-html = dehyphenator(html,'pdf', length)
+html = dehyphenator(html,'html', length)
+if is_pdftohtml:
+from calibre.ebooks.conversion.utils import PreProcessor
+pdf_markup = PreProcessor(self.extra_opts, None)
+totalwords = 0
+totalwords = pdf_markup.get_word_count(html)
+if totalwords > 7000:
+html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
@@ -554,5 +560,9 @@ class HTMLPreProcessor(object):
html = smartyPants(html)
html = html.replace(start, '<!--')
html = html.replace(stop, '-->')
+# convert ellipsis to entities to prevent wrapping
+html = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', html)
+# convert double dashes to em-dash
+html = re.sub('\s--\s', u'\u2014', html)
return substitute_entites(html)
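The two typographic substitutions added at the end of this hunk can be checked in isolation; a minimal sketch, using the same patterns on an invented sample string:

    # Illustrative only: the ellipsis and em-dash substitutions from the hunk above.
    import re

    s = "Wait for it ... done -- finally."
    s = re.sub('(?u)(?<=\w)\s?(\.\s?){2}\.', '&hellip;', s)
    s = re.sub('\s--\s', u'\u2014', s)
    # s is now u"Wait for it&hellip; done\u2014finally."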

View File

@@ -6,8 +6,10 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
+from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
+from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object):
@@ -17,6 +19,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts
+def is_pdftohtml(self, src):
+return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -64,7 +69,7 @@ class PreProcessor(object):
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
-htm_end_ere = re.compile('</p>', re.DOTALL)
+htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
@@ -101,12 +106,101 @@ class PreProcessor(object):
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
+def get_word_count(self, html):
+word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
+word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
+wordcount = get_wordcount_obj(word_count_text)
+return wordcount.words
+def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
+# minimum of chapters to search for
+self.min_chapters = 1
+if wordcount > 7000:
+self.min_chapters = int(ceil(wordcount / 7000.))
+#print "minimum chapters required are: "+str(self.min_chapters)
+heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
+self.html_preprocess_sections = len(heading.findall(html))
+self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+# Build the Regular Expressions in pieces
+init_lookahead = "(?=<(p|div))"
+chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+chapter_header_open = r"(?P<chap>"
+title_header_open = r"(?P<title>"
+chapter_header_close = ")\s*"
+title_header_close = ")"
+chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+is_pdftohtml = self.is_pdftohtml(html)
+if is_pdftohtml:
+chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+title_line_open = "<(?P<outer2>p)[^>]*>\s*"
+title_line_close = "\s*</(?P=outer2)>"
+if blanks_between_paragraphs:
+blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+else:
+blank_lines = ""
+opt_title_open = "("
+opt_title_close = ")?"
+n_lookahead_open = "\s+(?!"
+n_lookahead_close = ")"
+default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+chapter_types = [
+[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+[r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
+[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
+[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+[r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
+[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+]
+# Start with most typical chapter headings, get more aggressive until one works
+for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+if self.html_preprocess_sections >= self.min_chapters:
+break
+full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+if lookahead_ignorecase:
+chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+else:
+chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+html = chapdetect.sub(self.chapter_head, html)
+words_per_chptr = wordcount
+if words_per_chptr > 0 and self.html_preprocess_sections > 0:
+words_per_chptr = wordcount / self.html_preprocess_sections
+self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+return html
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
+# Count the words in the document to estimate how many chapters to look for and whether
+# other types of processing are attempted
+totalwords = 0
+totalwords = self.get_word_count(html)
+if totalwords < 20:
+self.log("not enough text, not preprocessing")
+return html
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-html = re.sub(r"\s*</p>", "</p>\n", html)
-html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
+html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
###### Check Markup ######
#
@@ -141,12 +235,17 @@ class PreProcessor(object):
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
# remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html)
+# Get rid of various common microsoft specific tags which can cause issues later
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+# Delete microsoft 'smart' tags
+html = re.sub('(?i)</?st1:\w+>', '', html)
# Get rid of empty span, bold, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+# ADE doesn't render <br />, change to empty paragraphs
+#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# If more than 40% of the lines are empty paragraphs and the user has enabled remove
# paragraph spacing then delete blank lines to clean up spacing
@@ -168,59 +267,12 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True"
else:
blanks_between_paragraphs = False
#self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
#
-# Build the Regular Expressions in pieces
-init_lookahead = "(?=<(p|div))"
-chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
-title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-chapter_header_open = r"(?P<chap>"
-title_header_open = r"(?P<title>"
-chapter_header_close = ")\s*"
-title_header_close = ")"
-chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
-title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
-if blanks_between_paragraphs:
-blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
-else:
-blank_lines = ""
-opt_title_open = "("
-opt_title_close = ")?"
-n_lookahead_open = "\s+(?!"
-n_lookahead_close = ")"
-default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
-min_chapters = 10
-heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
-self.html_preprocess_sections = len(heading.findall(html))
-self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-chapter_types = [
-[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-[r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
-]
-# Start with most typical chapter headings, get more aggressive until one works
-for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
-if self.html_preprocess_sections >= min_chapters:
-break
-full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-if lookahead_ignorecase:
-chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-else:
-chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-html = chapdetect.sub(self.chapter_head, html)
+html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
###### Unwrap lines ######
@@ -247,7 +299,7 @@ class PreProcessor(object):
# Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor)
-self.log("*** Median line length is " + unicode(length) + ", calculated with " + format + " format ***")
+self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log("Unwrapping required, unwrapping Lines")
@@ -260,7 +312,7 @@ class PreProcessor(object):
self.log("Done dehyphenating")
# Unwrap lines using punctation and line length
#unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
@@ -276,7 +328,7 @@ class PreProcessor(object):
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
-if self.html_preprocess_sections < 5:
+if self.html_preprocess_sections < self.min_chapters:
self.log("Looking for more split points based on punctuation,"
" currently have " + unicode(self.html_preprocess_sections))
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)

View File

@@ -16,6 +16,7 @@ import uuid
from lxml import etree
+from calibre import guess_type
from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
@@ -161,6 +162,23 @@ class FB2MLizer(object):
text.append('<section>')
self.section_level += 1
+# Insert the title page / cover into the spine if it is not already referenced.
+title_name = u''
+if 'titlepage' in self.oeb_book.guide:
+title_name = 'titlepage'
+elif 'cover' in self.oeb_book.guide:
+title_name = 'cover'
+if title_name:
+title_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[title_name].href]
+if title_item.spine_position is None and title_item.media_type == 'application/xhtml+xml':
+self.oeb_book.spine.insert(0, title_item, True)
+# Create xhtml page to reference cover image so it can be used.
+if not title_name and self.oeb_book.metadata.cover and unicode(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids:
+id = unicode(self.oeb_book.metadata.cover[0])
+cover_item = self.oeb_book.manifest.ids[id]
+if cover_item.media_type in OEB_RASTER_IMAGES:
+self.insert_image_cover(cover_item.href)
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
@@ -185,6 +203,17 @@ class FB2MLizer(object):
return ''.join(text) + '</body>'
+def insert_image_cover(self, image_href):
+from calibre.ebooks.oeb.base import RECOVER_PARSER
+try:
+root = etree.fromstring(u'<html xmlns="%s"><body><img src="%s" /></body></html>' % (XHTML_NS, image_href), parser=RECOVER_PARSER)
+except:
+root = etree.fromstring(u'', parser=RECOVER_PARSER)
+id, href = self.oeb_book.manifest.generate('fb2_cover', 'fb2_cover.xhtml')
+item = self.oeb_book.manifest.add(id, href, guess_type(href)[0], data=root)
+self.oeb_book.spine.insert(0, item, True)
def fb2mlize_images(self):
'''
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function.

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
import os
-from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
@@ -18,19 +18,6 @@ class PDBInput(InputFormatPlugin):
description = 'Convert PDB to HTML'
file_types = set(['pdb'])
-options = set([
-OptionRecommendation(name='single_line_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line represents '
-'a paragraph instead.')),
-OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line starting with '
-'an indent (either a tab or 2+ spaces) represents a paragraph. '
-'Paragraphs end when the next line that starts with an indent '
-'is reached.')),
-])
def convert(self, stream, options, file_ext, log,
accelerators):
header = PdbHeaderReader(stream)

View File

@@ -22,7 +22,7 @@ class PDBOutput(OutputFormatPlugin):
short_switch='f', choices=FORMAT_WRITERS.keys(),
help=(_('Format to use inside the pdb container. Choices are:')+\
' %s' % FORMAT_WRITERS.keys())),
-OptionRecommendation(name='output_encoding', recommended_value='cp1252',
+OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \
'The default is cp1252. Note: This option is not honored by all ' \

View File

@@ -8,12 +8,11 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
-import os
import struct
+from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
-separate_paragraphs_single_line, separate_paragraphs_print_formatted
class HeaderRecord(object):
'''
@@ -33,9 +32,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
-self.encoding = options.input_encoding
-self.single_line_paras = options.single_line_paras
-self.print_formatted_paras = options.print_formatted_paras
+self.options = options
self.sections = []
for i in range(header.num_sections):
@@ -48,34 +45,29 @@ class Reader(FormatReader):
def decompress_text(self, number):
if self.header_record.compression == 1:
-return self.section_data(number).decode('cp1252' if self.encoding is None else self.encoding)
+return self.section_data(number)
if self.header_record.compression == 2 or self.header_record.compression == 258:
from calibre.ebooks.compression.palmdoc import decompress_doc
-return decompress_doc(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
+return decompress_doc(self.section_data(number))
return ''
def extract_content(self, output_dir):
-txt = ''
+raw_txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
-txt += self.decompress_text(i)
+raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...')
-if self.single_line_paras:
-txt = separate_paragraphs_single_line(txt)
-if self.print_formatted_paras:
-txt = separate_paragraphs_print_formatted(txt)
-html = convert_basic(txt)
-with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
-index.write(html.encode('utf-8'))
-from calibre.ebooks.metadata.meta import get_metadata
-mi = get_metadata(self.stream, 'pdb')
-manifest = [('index.html', None)]
-spine = ['index.html']
-opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
-return os.path.join(output_dir, 'metadata.opf')
+stream = StringIO(raw_txt)
+from calibre.customize.ui import plugin_for_input_format
+txt_plugin = plugin_for_input_format('txt')
+for option in txt_plugin.options:
+if not hasattr(self.options, option.option.name):
+setattr(self.options, option.name, option.recommended_value)
+stream.seek(0)
+return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
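The same delegation pattern reappears in the zTXT and TCR readers below: decompress to raw text, then hand the result to calibre's TXT input plugin after filling in any of its recommended option values the caller did not set. A minimal standalone sketch of that idea (the helper name and the shape of the options object are assumptions for illustration, not part of this commit):

    # Illustrative sketch only, assuming a calibre (Python 2) environment.
    from cStringIO import StringIO
    from calibre.customize.ui import plugin_for_input_format

    def hand_off_to_txt_plugin(raw_txt, options, log):
        # Look up the TXT input plugin and copy over default values for any
        # of its options that the caller's options object does not define.
        txt_plugin = plugin_for_input_format('txt')
        for rec in txt_plugin.options:
            if not hasattr(options, rec.option.name):
                setattr(options, rec.option.name, rec.recommended_value)
        stream = StringIO(raw_txt)
        stream.seek(0)
        return txt_plugin.convert(stream, options, 'txt', log, {})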

View File

@@ -50,7 +50,8 @@ class Writer(FormatWriter):
txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...')
-txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
+txt = specified_newlines(TxtNewlines('windows').newline,
+txt).encode(self.opts.pdb_output_encoding, 'replace')
txt_length = len(txt)

View File

@@ -19,9 +19,6 @@ class Reader(FormatReader):
self.stream = stream
self.log = log
self.options = options
-setattr(self.options, 'new_pdf_engine', False)
-setattr(self.options, 'no_images', False)
-setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
@@ -32,6 +29,11 @@ class Reader(FormatReader):
pdf.write(self.header.section_data(x))
from calibre.customize.ui import plugin_for_input_format
+pdf_plugin = plugin_for_input_format('pdf')
+for option in pdf_plugin.options:
+if not hasattr(self.options, option.option.name):
+setattr(self.options, option.name, option.recommended_value)
pdf.seek(0)
-return plugin_for_input_format('pdf').convert(pdf, self.options,
-'pdf', self.log, [])
+return pdf_plugin.convert(pdf, self.options, 'pdf', self.log, {})

View File

@@ -8,12 +8,13 @@ __license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
-import os, struct, zlib
+import struct
+import zlib
+from cStringIO import StringIO
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ztxt import zTXTError
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
-separate_paragraphs_single_line, separate_paragraphs_print_formatted
SUPPORTED_VERSION = (1, 40)
@@ -38,9 +39,7 @@ class Reader(FormatReader):
def __init__(self, header, stream, log, options):
self.stream = stream
self.log = log
-self.encoding = options.input_encoding
-self.single_line_paras = options.single_line_paras
-self.print_formatted_paras = options.print_formatted_paras
+self.options = options
self.sections = []
for i in range(header.num_sections):
@@ -68,30 +67,25 @@ class Reader(FormatReader):
def decompress_text(self, number):
if number == 1:
self.uncompressor = zlib.decompressobj()
-return self.uncompressor.decompress(self.section_data(number)).decode('cp1252' if self.encoding is None else self.encoding, 'replace')
+return self.uncompressor.decompress(self.section_data(number))
def extract_content(self, output_dir):
-txt = ''
+raw_txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
-txt += self.decompress_text(i)
+raw_txt += self.decompress_text(i)
self.log.info('Converting text to OEB...')
-if self.single_line_paras:
-txt = separate_paragraphs_single_line(txt)
-if self.print_formatted_paras:
-txt = separate_paragraphs_print_formatted(txt)
-html = convert_basic(txt)
-with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
-index.write(html.encode('utf-8'))
-from calibre.ebooks.metadata.meta import get_metadata
-mi = get_metadata(self.stream, 'pdb')
-manifest = [('index.html', None)]
-spine = ['index.html']
-opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
-return os.path.join(output_dir, 'metadata.opf')
+stream = StringIO(raw_txt)
+from calibre.customize.ui import plugin_for_input_format
+txt_plugin = plugin_for_input_format('txt')
+for option in txt_plugin.options:
+if not hasattr(self.options, option.option.name):
+setattr(self.options, option.name, option.recommended_value)
+stream.seek(0)
+return txt_plugin.convert(stream, self.options, 'txt', self.log, {})

View File

@@ -54,7 +54,8 @@ class Writer(FormatWriter):
txt = writer.extract_content(oeb_book, self.opts)
self.log.debug('\tReplacing newlines with selected type...')
-txt = specified_newlines(TxtNewlines('windows').newline, txt).encode(self.opts.output_encoding, 'replace')
+txt = specified_newlines(TxtNewlines('windows').newline,
+txt).encode(self.opts.pdb_output_encoding, 'replace')
txt_length = len(txt)

View File

@@ -28,7 +28,7 @@ class PMLOutput(OutputFormatPlugin):
file_type = 'pmlz'
options = set([
-OptionRecommendation(name='output_encoding', recommended_value='cp1252',
+OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \
'The default is cp1252.')),
@@ -48,7 +48,7 @@ class PMLOutput(OutputFormatPlugin):
pmlmlizer = PMLMLizer(log)
pml = unicode(pmlmlizer.extract_content(oeb_book, opts))
with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
-out.write(pml.encode(opts.output_encoding, 'replace'))
+out.write(pml.encode(opts.pml_output_encoding, 'replace'))
self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, tdir, opts)

View File

@@ -4,11 +4,9 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
-import os
-from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
-from calibre.ebooks.txt.processor import convert_basic, opf_writer, \
-separate_paragraphs_single_line, separate_paragraphs_print_formatted
+from cStringIO import StringIO
+from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.compression.tcr import decompress
class TCRInput(InputFormatPlugin):
@@ -18,37 +16,20 @@ class TCRInput(InputFormatPlugin):
description = 'Convert TCR files to HTML'
file_types = set(['tcr'])
-options = set([
-OptionRecommendation(name='single_line_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line represents '
-'a paragraph instead.')),
-OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line starting with '
-'an indent (either a tab or 2+ spaces) represents a paragraph. '
-'Paragraphs end when the next line that starts with an indent '
-'is reached.')),
-])
def convert(self, stream, options, file_ext, log, accelerators):
log.info('Decompressing text...')
-ienc = options.input_encoding if options.input_encoding else 'utf-8'
-txt = decompress(stream).decode(ienc, 'replace')
+raw_txt = decompress(stream)
log.info('Converting text to OEB...')
-if options.single_line_paras:
-txt = separate_paragraphs_single_line(txt)
-if options.print_formatted_paras:
-txt = separate_paragraphs_print_formatted(txt)
-html = convert_basic(txt)
-with open(os.path.join(os.getcwd(), 'index.html'), 'wb') as index:
-index.write(html.encode('utf-8'))
-from calibre.ebooks.metadata.meta import get_metadata
-mi = get_metadata(stream, 'tcr')
-manifest = [('index.html', None)]
-spine = ['index.html']
-opf_writer(os.getcwd(), 'metadata.opf', manifest, spine, mi)
-return os.path.join(os.getcwd(), 'metadata.opf')
+stream = StringIO(raw_txt)
+from calibre.customize.ui import plugin_for_input_format
+txt_plugin = plugin_for_input_format('txt')
+for option in txt_plugin.options:
+if not hasattr(options, option.option.name):
+setattr(options, option.name, option.recommended_value)
+stream.seek(0)
+return txt_plugin.convert(stream, options,
+'txt', log, accelerators)

View File

@@ -18,7 +18,7 @@ class TCROutput(OutputFormatPlugin):
file_type = 'tcr'
options = set([
-OptionRecommendation(name='output_encoding', recommended_value='utf-8',
+OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \
'The default is utf-8.')),
@@ -40,7 +40,7 @@ class TCROutput(OutputFormatPlugin):
setattr(opts, 'indent_paras', False)
writer = TXTMLizer(log)
-txt = writer.extract_content(oeb_book, opts).encode(opts.output_encoding, 'replace')
+txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
log.info('Compressing text...')
txt = compress(txt)

View File

@@ -7,9 +7,10 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
+from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
-preserve_spaces
+preserve_spaces, detect_paragraph_type, detect_formatting_type
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@@ -20,45 +21,57 @@ class TXTInput(InputFormatPlugin):
file_types = set(['txt'])
options = set([
-OptionRecommendation(name='single_line_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line represents '
-'a paragraph instead.')),
-OptionRecommendation(name='print_formatted_paras', recommended_value=False,
-help=_('Normally calibre treats blank lines as paragraph markers. '
-'With this option it will assume that every line starting with '
-'an indent (either a tab or 2+ spaces) represents a paragraph. '
-'Paragraphs end when the next line that starts with an indent '
-'is reached.')),
+OptionRecommendation(name='paragraph_type', recommended_value='auto',
+choices=['auto', 'block', 'single', 'print'],
+help=_('Paragraph structure.\n'
+'choices are [\'auto\', \'block\', \'single\', \'print\', \'markdown\']\n'
+'* auto: Try to auto detect paragraph type.\n'
+'* block: Treat a blank line as a paragraph break.\n'
+'* single: Assume every line is a paragraph.\n'
+'* print: Assume every line starting with 2+ spaces or a tab '
+'starts a paragraph.')),
+OptionRecommendation(name='formatting_type', recommended_value='auto',
+choices=['auto', 'none', 'markdown'],
+help=_('Formatting used within the document.'
+'* auto: Try to auto detect the document formatting.\n'
+'* none: Do not modify the paragraph formatting. Everything is a paragraph.\n'
+'* markdown: Run the input though the markdown pre-processor. '
+'To learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
-OptionRecommendation(name='markdown', recommended_value=False,
-help=_('Run the text input through the markdown pre-processor. To '
-'learn more about markdown see')+' http://daringfireball.net/projects/markdown/'),
OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
help=_('Do not insert a Table of Contents into the output text.')),
])
def convert(self, stream, options, file_ext, log,
accelerators):
-ienc = stream.encoding if stream.encoding else 'utf-8'
+log.debug('Reading text from file...')
+txt = stream.read()
+# Get the encoding of the document.
if options.input_encoding:
ienc = options.input_encoding
-log.debug('Reading text from file...')
-txt = stream.read().decode(ienc, 'replace')
-# Adjust paragraph formatting as requested
-if options.single_line_paras:
-txt = separate_paragraphs_single_line(txt)
-if options.print_formatted_paras:
-txt = separate_paragraphs_print_formatted(txt)
+log.debug('Using user specified input encoding of %s' % ienc)
+else:
+det_encoding = detect(txt)
+ienc = det_encoding['encoding']
+log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, det_encoding['confidence'] * 100))
+if not ienc:
+ienc = 'utf-8'
+log.debug('No input encoding specified and could not auto detect using %s' % ienc)
+txt = txt.decode(ienc, 'replace')
+txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+# Preserve spaces will replace multiple spaces to a space
+# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
-txt = _ent_pat.sub(xml_entity_to_unicode, txt)
-if options.markdown:
+if options.formatting_type == 'auto':
+options.formatting_type = detect_formatting_type(txt)
+if options.formatting_type == 'markdown':
log.debug('Running text though markdown conversion...')
try:
html = convert_markdown(txt, disable_toc=options.markdown_disable_toc)
@@ -66,6 +79,22 @@ class TXTInput(InputFormatPlugin):
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See http://daringfireball.net/projects/markdown/syntax')
else:
+# Determine the paragraph type of the document.
+if options.paragraph_type == 'auto':
+options.paragraph_type = detect_paragraph_type(txt)
+if options.paragraph_type == 'unknown':
+log.debug('Could not reliably determine paragraph type using block')
+options.paragraph_type = 'block'
+else:
+log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+# We don't check for block because the processor assumes block.
+# single and print at transformed to block for processing.
+if options.paragraph_type == 'single':
+txt = separate_paragraphs_single_line(txt)
+elif options.paragraph_type == 'print':
+txt = separate_paragraphs_print_formatted(txt)
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
@@ -85,11 +114,10 @@ class TXTInput(InputFormatPlugin):
htmlfile = open(fname, 'wb')
with htmlfile:
htmlfile.write(html.encode('utf-8'))
-cwd = os.getcwdu()
odi = options.debug_pipeline
options.debug_pipeline = None
-oeb = html_input(open(htmlfile.name, 'rb'), options, 'html', log,
-{}, cwd)
+oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
+{})
options.debug_pipeline = odi
os.remove(htmlfile.name)
return oeb

View File

@@ -26,7 +26,7 @@ class TXTOutput(OutputFormatPlugin):
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
'For Mac OS X use \'unix\'. \'system\' will default to the newline '
'type used by this OS.') % sorted(TxtNewlines.NEWLINE_TYPES.keys())),
-OptionRecommendation(name='output_encoding', recommended_value='utf-8',
+OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. ' \
'The default is utf-8.')),
@@ -81,7 +81,7 @@ class TXTOutput(OutputFormatPlugin):
out_stream.seek(0)
out_stream.truncate()
-out_stream.write(txt.encode(opts.output_encoding, 'replace'))
+out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
if close:
out_stream.close()

View File

@ -48,7 +48,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
if isbytestring(txt): if isbytestring(txt):
txt = txt.decode('utf-8') txt = txt.decode('utf-8')
lines = [] lines = []
# Split into paragraphs based on having a blank line between text. # Split into paragraphs based on having a blank line between text.
for line in txt.split('\n\n'): for line in txt.split('\n\n'):
@ -93,3 +92,54 @@ def split_string_separator(txt, size) :
xrange(0, len(txt), size)]) xrange(0, len(txt), size)])
return txt return txt
def detect_paragraph_type(txt):
'''
Tries to determine the formatting of the document.
block: Paragraphs are separated by a blank line.
single: Each line is a paragraph.
print: Each paragraph starts with a 2+ spaces or a tab
and ends when a new paragraph is reached.
markdown: Markdown formatting is in the document.
returns block, single, print, markdown
'''
txt = txt.replace('\r\n', '\n')
txt = txt.replace('\r', '\n')
txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))
# Check for print
tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
if tab_line_count / float(txt_line_count) >= .25:
return 'print'
# Check for block
empty_line_count = len(re.findall('(?mu)^\s*$', txt))
if empty_line_count / float(txt_line_count) >= .25:
return 'block'
# Nothing else matched to assume single.
return 'single'
def detect_formatting_type(txt):
# Check for markdown
# Headings
if len(re.findall('(?mu)^#+', txt)) >= 5:
return 'markdown'
if len(re.findall('(?mu)^=+$', txt)) >= 5:
return 'markdown'
if len(re.findall('(?mu)^-+$', txt)) >= 5:
return 'markdown'
# Images
if len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) >= 5:
return 'markdown'
# Links
if len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt)) >= 5:
return 'markdown'
# Escaped characters
md_escaped_characters = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '.', '!']
for c in md_escaped_characters:
if txt.count('\\'+c) > 10:
return 'markdown'
return 'none'
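Both detectors are simple line-count heuristics, so they are easy to sanity-check in isolation. A minimal sketch of how they behave (the sample strings below are invented for illustration and are not part of the commit):

sample_print = ('\tChapter One\n'
                '\tIt was a dark and stormy night.\n'
                '\tA second indented paragraph.\n'
                '\tA third, so tabbed lines dominate.\n')
sample_block = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.\n'

print detect_paragraph_type(sample_print)        # 'print'  (>= 25% of lines start with a tab)
print detect_paragraph_type(sample_block)        # 'block'  (>= 25% of the lines are blank)
print detect_formatting_type('# Heading\n' * 5)  # 'markdown' (five or more ATX headings)
print detect_formatting_type(sample_block)       # 'none'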

View File

@ -256,8 +256,10 @@ class BookInfo(QWebView):
% (left_pane, right_pane)))
def mouseDoubleClickEvent(self, ev):
if self.width() - ev.x() < 25 or \
self.height() - ev.y() < 25:
swidth = self.page().mainFrame().scrollBarGeometry(Qt.Vertical).width()
sheight = self.page().mainFrame().scrollBarGeometry(Qt.Horizontal).height()
if self.width() - ev.x() < swidth or \
self.height() - ev.y() < sheight:
# Filter out double clicks on the scroll bar
ev.accept()
else:

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import textwrap
import textwrap, codecs
from functools import partial
from PyQt4.Qt import QWidget, QSpinBox, QDoubleSpinBox, QLineEdit, QTextEdit, \
@ -128,6 +128,7 @@ class Widget(QWidget):
def get_value(self, g):
from calibre.gui2.convert.xpath_wizard import XPathEdit
from calibre.gui2.convert.regex_builder import RegexEdit
from calibre.gui2.widgets import EncodingComboBox
ret = self.get_value_handler(g)
if ret != 'this is a dummy return value, xcswx1avcx4x':
return ret
@ -139,6 +140,13 @@ class Widget(QWidget):
if not ans:
ans = None
return ans
elif isinstance(g, EncodingComboBox):
ans = unicode(g.currentText()).strip()
try:
codecs.lookup(ans)
except:
ans = ''
return ans
elif isinstance(g, QComboBox):
return unicode(g.currentText())
elif isinstance(g, QCheckBox):
@ -192,6 +200,11 @@ class Widget(QWidget):
if not val: val = ''
getattr(g, 'setPlainText', g.setText)(val)
getattr(g, 'setCursorPosition', lambda x: x)(0)
elif isinstance(g, EncodingComboBox):
if val:
g.setEditText(val)
else:
g.setCurrentIndex(0)
elif isinstance(g, QComboBox) and val:
idx = g.findText(val, Qt.MatchFixedString)
if idx < 0:
@ -202,8 +215,6 @@ class Widget(QWidget):
g.setCheckState(Qt.Checked if bool(val) else Qt.Unchecked)
elif isinstance(g, (XPathEdit, RegexEdit)):
g.edit.setText(val if val else '')
elif isinstance(g, EncodingComboBox):
g.setEditText(val if val else '')
else:
raise Exception('Can\'t set value %s in %s'%(repr(val),
unicode(g.objectName())))
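The new EncodingComboBox branch in get_value accepts free-form text, so the entry is validated by attempting a codec lookup and dropping anything Python does not recognise. A standalone sketch of that check (clean_encoding is an illustrative helper, not part of the commit, and it catches LookupError rather than a bare except):

import codecs

def clean_encoding(name):
    # codecs.lookup() raises LookupError for unknown encoding names.
    name = name.strip()
    try:
        codecs.lookup(name)
    except LookupError:
        return ''
    return name

print clean_encoding('utf-8')             # 'utf-8'
print clean_encoding('Latin-1')           # 'Latin-1' (aliases are accepted)
print clean_encoding('not-a-real-codec')  # ''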

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.convert.pdb_input_ui import Ui_Form
from calibre.gui2.convert import Widget
class PluginWidget(Widget, Ui_Form):
TITLE = _('PDB Input')
HELP = _('Options specific to')+' PDB '+_('input')
COMMIT_NAME = 'pdb_input'
ICON = I('mimetypes/unknown.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['single_line_paras', 'print_formatted_paras'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -1,48 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="2" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>213</height>
</size>
</property>
</spacer>
</item>
<item row="0" column="0">
<widget class="QCheckBox" name="opt_single_line_paras">
<property name="text">
<string>Treat each &amp;line as a paragraph</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_print_formatted_paras">
<property name="text">
<string>Assume print formatting</string>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>

View File

@ -19,7 +19,7 @@ class PluginWidget(Widget, Ui_Form):
ICON = I('mimetypes/unknown.png')
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['format', 'inline_toc', 'output_encoding'])
Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -55,10 +55,21 @@
</widget>
</item>
<item row="1" column="1">
<widget class="QLineEdit" name="opt_output_encoding"/>
<widget class="EncodingComboBox" name="opt_pdb_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/>
<connections/>
</ui>

View File

@ -18,6 +18,6 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['inline_toc', 'full_image_depth',
'output_encoding'])
'pml_output_encoding'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -14,7 +14,7 @@
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="3" column="0">
<item row="4" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -27,32 +27,47 @@
</property>
</spacer>
</item>
<item row="1" column="0">
<item row="2" column="0">
<widget class="QCheckBox" name="opt_inline_toc">
<property name="text">
<string>&amp;Inline TOC</string>
</property>
</widget>
</item>
<item row="2" column="0">
<item row="3" column="0">
<widget class="QCheckBox" name="opt_full_image_depth">
<property name="text">
<string>Do not reduce image size and depth</string>
</property>
</widget>
</item>
<item row="0" column="0">
<item row="1" column="0">
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label"> <widget class="QLabel" name="label">
<property name="text"> <property name="text">
<string>Output Encoding:</string> <string>Output Encoding:</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="1"> <item>
<widget class="QLineEdit" name="opt_output_encoding"/> <widget class="EncodingComboBox" name="opt_pml_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</item>
</layout>
</widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/>
<connections/>
</ui>

View File

@ -16,7 +16,10 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['single_line_paras', 'print_formatted_paras', 'markdown',
'markdown_disable_toc', 'preserve_spaces'])
['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
self.db, self.book_id = db, book_id
for x in get_option('paragraph_type').option.choices:
self.opt_paragraph_type.addItem(x)
for x in get_option('formatting_type').option.choices:
self.opt_formatting_type.addItem(x)
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>470</width>
<width>518</width>
<height>300</height>
</rect>
</property>
@ -15,47 +15,23 @@
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QCheckBox" name="opt_single_line_paras">
<widget class="QLabel" name="label_2">
<property name="text">
<string>Treat each &amp;line as a paragraph</string>
<string>Paragraph style:</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="opt_print_formatted_paras">
<item row="0" column="1">
<widget class="QComboBox" name="opt_paragraph_type"/>
</item>
<item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preserve_spaces">
<property name="text"> <property name="text">
<string>Assume print formatting</string> <string>Preserve &amp;spaces</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0"> <item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_markdown">
<property name="text">
<string>Process using markdown</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
<item row="6" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -68,32 +44,47 @@
</property>
</spacer>
</item>
<item row="5" column="0">
<widget class="QCheckBox" name="opt_preserve_spaces">
<item row="1" column="1">
<widget class="QComboBox" name="opt_formatting_type"/>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_3">
<property name="text"> <property name="text">
<string>Preserve &amp;spaces</string> <string>Formatting style:</string>
</property>
</widget>
</item>
<item row="2" column="0" rowspan="2" colspan="2">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Markdown Options</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="label">
<property name="text">
<string>&lt;p&gt;Markdown is a simple markup language for text files, that allows for advanced formatting. To learn more visit &lt;a href=&quot;http://daringfireball.net/projects/markdown&quot;&gt;markdown&lt;/a&gt;.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="opt_markdown_disable_toc">
<property name="text">
<string>Do not insert Table of Contents into output text when using markdown</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections>
<connections/>
<connection>
<sender>opt_markdown</sender>
<signal>toggled(bool)</signal>
<receiver>opt_markdown_disable_toc</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>76</x>
<y>80</y>
</hint>
<hint type="destinationlabel">
<x>418</x>
<y>105</y>
</hint>
</hints>
</connection>
</connections>
</ui>

View File

@ -22,7 +22,7 @@ class PluginWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['newline', 'max_line_length', 'force_max_line_length',
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
'output_encoding'])
'txt_output_encoding'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)

View File

@ -96,10 +96,21 @@
</widget>
</item>
<item row="2" column="1">
<widget class="QLineEdit" name="opt_output_encoding"/>
<widget class="EncodingComboBox" name="opt_txt_output_encoding">
<property name="editable">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
<customwidgets>
<customwidget>
<class>EncodingComboBox</class>
<extends>QComboBox</extends>
<header>widgets.h</header>
</customwidget>
</customwidgets>
<resources/>
<connections/>
</ui>

View File

@ -449,7 +449,7 @@ class Document(QWebPage): # {{{
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
def set_bottom_padding(self, amount):
s = QSize(-1, -1) if amount == 0 else QSize(self.width,
s = QSize(-1, -1) if amount == 0 else QSize(self.viewportSize().width(),
self.height+amount)
self.setPreferredContentsSize(s)
@ -820,6 +820,7 @@ class DocumentView(QWebView): # {{{
self.flipper.initialize(self.current_page_image())
self.manager.next_document()
return
#oheight = self.document.height
lower_limit = opos + delta_y # Max value of top y co-ord after scrolling
max_y = self.document.height - window_height # The maximum possible top y co-ord
if max_y < lower_limit:
@ -835,6 +836,7 @@ class DocumentView(QWebView): # {{{
if epf:
self.flipper.initialize(self.current_page_image())
#print 'Document height:', self.document.height
#print 'Height change:', (self.document.height - oheight)
max_y = self.document.height - window_height
lower_limit = min(max_y, lower_limit)
#print 'Scroll to:', lower_limit
@ -842,6 +844,7 @@ class DocumentView(QWebView): # {{{
self.document.scroll_to(self.document.xpos, lower_limit)
actually_scrolled = self.document.ypos - opos
#print 'After scroll pos:', self.document.ypos
#print 'Scrolled by:', self.document.ypos - opos
self.find_next_blank_line(window_height - actually_scrolled)
#print 'After blank line pos:', self.document.ypos
if epf:
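The clamping visible in the context above is plain arithmetic on the document and window heights; a worked example with invented numbers:

# Illustrative numbers only, not taken from the commit.
document_height, window_height = 10000, 800
opos, delta_y = 9500, 760
max_y = document_height - window_height   # 9200, the largest valid top y co-ord
lower_limit = min(max_y, opos + delta_y)  # min(9200, 10260) -> clamped to 9200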

View File

@ -0,0 +1,85 @@
#!/usr/bin/python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
"""
Get word, character, and Asian character counts
1. Get a word count as a dictionary:
wc = get_wordcount(text)
words = wc['words'] # etc.
2. Get a word count as an object
wc = get_wordcount_obj(text)
words = wc.words # etc.
properties counted:
* characters
* chars_no_spaces
* asian_chars
* non_asian_words
* words
Sourced from:
http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
"""
__version__ = 0.1
__author__ = "Ryan Ginstrom"
IDEOGRAPHIC_SPACE = 0x3000
def is_asian(char):
"""Is the character Asian?"""
# 0x3000 is ideographic space (i.e. double-byte space)
# Anything over is an Asian character
return ord(char) > IDEOGRAPHIC_SPACE
def filter_jchars(c):
"""Filters Asian characters to spaces"""
if is_asian(c):
return ' '
return c
def nonj_len(word):
u"""Returns number of non-Asian words in {word}
- 日本語AアジアンB -> 2
- hello -> 1
@param word: A word, possibly containing Asian characters
"""
# Here are the steps:
# 本spam日eggs
# -> [' ', 's', 'p', 'a', 'm', ' ', 'e', 'g', 'g', 's']
# -> ' spam eggs'
# -> ['spam', 'eggs']
# The length of which is 2!
chars = [filter_jchars(c) for c in word]
return len(u''.join(chars).split())
def get_wordcount(text):
"""Get the word/character count for text
@param text: The text of the segment
"""
characters = len(text)
chars_no_spaces = sum([not x.isspace() for x in text])
asian_chars = sum([is_asian(x) for x in text])
non_asian_words = nonj_len(text)
words = non_asian_words + asian_chars
return dict(characters=characters,
chars_no_spaces=chars_no_spaces,
asian_chars=asian_chars,
non_asian_words=non_asian_words,
words=words)
def dict2obj(dictionary):
"""Transform a dictionary into an object"""
class Obj(object):
def __init__(self, dictionary):
self.__dict__.update(dictionary)
return Obj(dictionary)
def get_wordcount_obj(text):
"""Get the wordcount as an object rather than a dictionary"""
return dict2obj(get_wordcount(text))
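A short worked example of the counting rules defined above (the sample string is invented; the totals follow directly from the functions in this file):

# -*- coding: utf-8 -*-
text = u'hello 日本語 world'
wc = get_wordcount_obj(text)
print wc.characters       # 15 (including the two spaces)
print wc.chars_no_spaces  # 13
print wc.asian_chars      # 3  (each CJK character counts on its own)
print wc.non_asian_words  # 2  ('hello' and 'world')
print wc.words            # 5  (non_asian_words + asian_chars)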