\n', res)
# Unwrap lines using punctuation if the median length of all lines is less than 150
length = line_length('html', res, 0.4)
print "*** Median length is " + str(length) + " ***\n"
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*(
\n'
+
+ def chapter_link(match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return ' '
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return ' \n
'+chap+'
'
+
+
+ def no_markup(raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile('
', re.DOTALL)
+ line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+ htm_end = htm_end_ere.findall(raw)
+ line_end = line_end_ere.findall(raw)
+ tot_htm_ends = len(htm_end)
+ tot_ln_fds = len(line_end)
+ self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ min_lns = tot_ln_fds * percent
+ self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+ if min_lns > tot_htm_ends:
+ return True
+
self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub('
'+'\g'+'
\n', html)
- # Unwrap lines using punctuation if the median length of all lines is less than 150
+ # remove non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty tags to simplify other processing
+ html = re.sub(ur'\s*\s*', ' ', html)
+ # Get rid of empty span tags
+ html = re.sub(r"\s*]*>\s*", " ", html)
+
+ # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ linereg = re.compile('(?<=
)', re.IGNORECASE)
+ blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blanklines = blankreg.findall(html)
+ lines = linereg.findall(html)
+ if len(lines) > 1:
+ self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+ if float(len(blanklines)) / float(len(lines)) > 0.40:
+ self.log("deleting blank lines")
+ html = blankreg.sub('', html)
+ # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
+ html = re.sub(r"\s*
", "
\n", html)
+
+ # some lit files don't have any
tags or equivalent, check and
+ # mark up line endings if required before proceeding
+ if no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n
', html)
+
+ # detect chapters/sections to match xpath or splitting logic
#
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "\n", html)
+ # Mark split points based on embedded links
+ chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P[^\s<]+(\s*[^\s<]+){0,4})?\s*()?\s*((i|b|u)>){0,2}\s*', re.IGNORECASE)
+ html = chaplink.sub(chapter_link, html)
+ # Continue with alternate patterns, start with most typical chapter headings
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(chapter_head, html)
+
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+'
'+'
', html)
+ #
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
length = line_length('html', html, 0.4)
self.log("*** Median length is " + str(length) + " ***")
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
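The gate above relies on line_length(), which picks a "typical" line length at a given percentile of the observed lengths. A minimal standalone sketch of that idea (the <p>-based line extraction and the helper name are mine, not calibre's):

    import re

    def typical_line_length(raw, percent=0.4):
        # Collect the length of the text between <p> and </p>, de-duplicate,
        # drop outliers longer than twice the average, then pick the value
        # `percent` of the way through the sorted list (0.5 would be the median).
        lines = re.findall(r'(?<=<p>).*?(?=</p>)', raw, re.DOTALL)
        lengths = sorted(set(len(line) for line in lines if line))
        if not lengths:
            return 0
        avg = sum(lengths) / float(len(lengths))
        lengths = [l for l in lengths if l <= avg * 2]
        percent = min(max(percent, 0), 1)
        return lengths[max(int(len(lengths) * percent) - 1, 0)]

The unwrap regex is then only applied when this value comes back under 150, i.e. when the source text was hard-wrapped at a short width.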
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import re
from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
include_meta_content_type=False))
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
+
+ def preprocess_html(self, html):
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+'
'+'
', html)
+ return html
+
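The doubleheading substitution repeated in these hunks is hard to read once the tags are mangled; the idea, as a hedged sketch with a simpler pattern of my own:

    import re

    # Demote the second of two consecutive top-level headings to <h3>, so that
    # split-on-heading logic does not separate a chapter heading from the title
    # line that follows it.
    double_heading = re.compile(
        r'(?P<first><h[12][^>]*>.*?</h[12]>\s*)'            # first h1/h2 block
        r'<h[12](?P<attrs>[^>]*)>(?P<body>.*?)</h[12]>',    # h1/h2 right after it
        re.IGNORECASE | re.DOTALL)

    def demote_second_heading(html):
        return double_heading.sub(r'\g<first><h3\g<attrs>>\g<body></h3>', html)

For example, '<h2>Chapter 1</h2>\n<h2>The Title</h2>' becomes '<h2>Chapter 1</h2>\n<h3>The Title</h3>'. The pattern in the patch additionally tolerates non-heading tags between the two headings.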
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 584d631d0b..36848ddb8b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,6 +408,10 @@ class Page(object):
# Fraction of text height that two strings' bottoms can differ by
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
+
+ # Percentage of the page height which should be considered header
+ # or footer to be discarded from reflow considerations
+ HEAD_FOOTER_MARGIN
# Multiplies the average line height when determining row height
# of a particular element to detect columns.
From c9cb61a40e015059716478255ad67aa30716ea6f Mon Sep 17 00:00:00 2001
From: GRiker
Date: Fri, 10 Sep 2010 13:46:01 -0700
Subject: [PATCH 03/43] GwR jacket work
---
src/calibre/ebooks/oeb/transforms/jacket.py | 72 ++++++++++++++-------
1 file changed, 49 insertions(+), 23 deletions(-)
diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py
index fec4d230c3..030067850c 100644
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@@ -6,14 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import textwrap
+import os, textwrap
from xml.sax.saxutils import escape
from itertools import repeat
from lxml import etree
+from calibre import guess_type, strftime
from calibre.ebooks.oeb.base import XPath, XPNSMAP
-from calibre import guess_type
from calibre.library.comments import comments_to_html
class Jacket(object):
'''
@@ -24,22 +24,18 @@ class Jacket(object):
JACKET_TEMPLATE = textwrap.dedent(u'''\
- %(title)s
+ %(title_str)s
+
-
-
-
%(title)s
-
%(jacket)s
-
%(series)s
-
%(rating)s
-
%(tags)s
-
-
- %(comments)s
-
+
+
%(title)s
+
%(series)s
+
%(rating)s
+
%(tags)s
+
%(comments)s
''')
@@ -71,11 +67,18 @@ class Jacket(object):
return ans
id, href = self.oeb.manifest.generate('star', 'star.png')
self.oeb.manifest.add(id, href, 'image/png', data=I('star.png', data=True))
- ans = 'Rating: ' + ''.join(repeat(''%href, num))
+ ans = 'Rating: ' + ''.join(repeat(''%href, num))
return ans
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')
+ jacket_resources = P("jacket")
+
+ if os.path.isdir(jacket_resources):
+ stylesheet = os.path.join(jacket_resources, 'stylesheet.css')
+ with open(stylesheet) as f:
+ css_data = f.read()
+
comments = mi.comments
if not comments:
try:
@@ -87,11 +90,13 @@ class Jacket(object):
orig_comments = comments
if comments:
comments = comments_to_html(comments)
- series = 'Series: ' + escape(mi.series if mi.series else '')
+
+ series = 'Series: %s' % escape(mi.series if mi.series else '')
if mi.series and mi.series_index is not None:
- series += escape(' [%s]'%mi.format_series_index())
+ series += '%s' % escape(' [%s]'%mi.format_series_index())
if not mi.series:
series = ''
+
tags = mi.tags
if not tags:
try:
@@ -99,23 +104,30 @@ class Jacket(object):
except:
tags = []
if tags:
- tags = 'Tags: ' + self.opts.dest.tags_to_string(tags)
+ tags = 'Tags:%s' % self.opts.dest.tags_to_string(tags)
else:
tags = ''
+
try:
- title = mi.title if mi.title else unicode(self.oeb.metadata.title[0])
+ title_str = mi.title if mi.title else unicode(self.oeb.metadata.title[0])
except:
- title = _('Unknown')
+ title_str = _('Unknown')
+ title = '%s (%s)' % (escape(title_str), strftime(u'%Y', mi.pubdate.timetuple()))
+
def generate_html(comments):
return self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
- title=escape(title), comments=comments,
- jacket=escape(_('Book Jacket')), series=series,
- tags=tags, rating=self.get_rating(mi.rating))
+ title=title, comments=comments,
+ series=series,
+ tags=tags, rating=self.get_rating(mi.rating),
+ css=css_data, title_str=title_str)
+
id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml')
from calibre.ebooks.oeb.base import RECOVER_PARSER, XPath
try:
root = etree.fromstring(generate_html(comments), parser=RECOVER_PARSER)
+# print "root: %s" % etree.tostring(root, encoding='utf-8',
+# xml_declaration=True, pretty_print=True)
except:
root = etree.fromstring(generate_html(escape(orig_comments)),
parser=RECOVER_PARSER)
@@ -137,8 +149,22 @@ class Jacket(object):
def __call__(self, oeb, opts, metadata):
+ '''
+ Add metadata in jacket.xhtml if specified in opts
+ If not specified, remove previous jacket instance
+ '''
self.oeb, self.opts, self.log = oeb, opts, oeb.log
if opts.remove_first_image:
self.remove_first_image()
if opts.insert_metadata:
self.insert_metadata(metadata)
+ else:
+ jacket = XPath('//h:meta[@name="calibre-content" and @content="jacket"]')
+ for item in list(self.oeb.spine)[:4]:
+ if jacket(item.data):
+ try:
+ self.log.info("Removing previous jacket instance")
+ self.oeb.manifest.remove(item)
+ break
+ except:
+ continue
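For reference, the spine scan added above keys off a single marker element; a small sketch of the same test outside calibre's OEB objects (the function name and the standalone lxml usage are mine):

    from lxml import etree

    def is_jacket_page(xhtml_bytes):
        # A generated jacket carries <meta name="calibre-content" content="jacket">,
        # which is what the XPath in the patch matches.
        root = etree.fromstring(xhtml_bytes)
        return bool(root.xpath(
            '//*[local-name()="meta"]'
            '[@name="calibre-content"][@content="jacket"]'))

A jacket page sits at the front of the book, which is why the patch only inspects the first four spine items before giving up.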
From 4c7373026b9ee8a618dccf8602740d6a7d578aa2 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 11 Sep 2010 12:10:49 +1000
Subject: [PATCH 04/43] preprocessing changes for lit & pdf, added utils.py,
changed default unwrap_factor
---
src/calibre/ebooks/conversion/preprocess.py | 15 ++++++++---
src/calibre/ebooks/conversion/utils.py | 6 +++++
src/calibre/ebooks/lit/input.py | 29 +++++++++++++--------
src/calibre/ebooks/pdf/input.py | 4 +--
4 files changed, 37 insertions(+), 17 deletions(-)
create mode 100644 src/calibre/ebooks/conversion/utils.py
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 2954fd7c26..452a322d95 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -77,6 +77,7 @@ def line_length(format, raw, percent):
elif format == 'pdf':
linere = re.compile('(?<= ).*?(?= )', re.DOTALL)
lines = linere.findall(raw)
+ print "percent is " + str(percent)
lengths = []
for line in lines:
@@ -165,6 +166,11 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*()*\s*I', re.UNICODE), lambda match: u'Ì'),
(re.compile(u'`\s*()*\s*a', re.UNICODE), lambda match: u'à'),
(re.compile(u'`\s*()*\s*A', re.UNICODE), lambda match: u'À'),
+
+ #(re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'),
+ #(re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'),
+ #(re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'),
+ #(re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'´\s*()*\s*o', re.UNICODE), lambda match: u'ó'),
(re.compile(u'´\s*()*\s*O', re.UNICODE), lambda match: u'Ó'),
@@ -206,13 +212,13 @@ class HTMLPreProcessor(object):
# (re.compile(r' \s* ', re.IGNORECASE), lambda match: '\n
]+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : '
'),
@@ -303,15 +309,16 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>| ]*>)\n?((?=()?\s*\w+(\s+\w+)?()?( ]*>|?p[^>]*>))((?P.*)( ]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>| ]*>)\n?((?=()?\s*\w+(\s+\w+)?()?( ]*>|?p[^>]*>))((?P.*)( ]*>|?p[^>]*>)))?'), chap_head),
)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
+ print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
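The punctuation-based un-wrap rule appended in this hunk is built around the computed length; a hedged sketch of the same kind of rule, with a simplified pattern of my own that only handles <p> boundaries:

    import re

    def build_unwrap_rule(length):
        # Join a paragraph with the next one when it is already `length` characters
        # long and ends in something that usually continues a sentence (a lowercase
        # letter, comma, semicolon, ...). `.{%i}` keeps the lookbehind fixed-width,
        # which Python's re module requires.
        return re.compile(r'(?<=.{%i}[a-z,;:)\-])\s*</p>\s*<p[^>]*>' % length,
                          re.UNICODE)

Substituting a single space for each match leaves the first <p> open and the final </p> in place, so the two halves simply merge into one paragraph.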
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..52be473372
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
\ No newline at end of file
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index f7bb0fbfd9..35dad501be 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -102,7 +102,7 @@ class LITInput(InputFormatPlugin):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+ self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
if min_lns > tot_htm_ends:
return True
@@ -141,24 +141,31 @@ class LITInput(InputFormatPlugin):
html = chaplink.sub(chapter_link, html)
# Continue with alternate patterns, start with most typical chapter headings
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
html = chapdetect.sub(chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(chapter_head, html)
-
+ html = chapdetect2.sub(chapter_head, html)
+ #
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
+ length = line_length('html', html, 0.4)
+ self.log("*** Median line length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ if length < 150:
+ self.log("Unwrapping Lines")
+ html = unwrap.sub(' ', html)
+ # If still no sections after unwrapping lines break on lines with no punctuation
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+ #self.log(html)
+ chapdetect3 = re.compile(r'(
)(?P)?', re.IGNORECASE)
+ html = chapdetect3.sub(chapter_head, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
html = doubleheading.sub('\g'+'
'+'
', html)
- #
- # Unwrap lines using punctuation if the median length of all lines is less than 150
- length = line_length('html', html, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
+
return html
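The control flow in this hunk (chapdetect, then chapdetect2, then chapdetect3) follows one idea: keep escalating to a more aggressive pattern until enough section markers exist. A compact sketch of that loop, with stand-in patterns and a substitution counter instead of the chapter_head side effects:

    import re

    MIN_SECTIONS = 10   # the same threshold the patch checks against

    def mark_chapters(html, patterns, mark):
        # `patterns` is ordered from strict (explicit "Chapter ..." headings) to
        # aggressive (bare lines with no closing punctuation); stop escalating as
        # soon as enough markers have been inserted.
        found = 0
        for pattern in patterns:
            html, n = re.subn(pattern, mark, html)
            found += n
            if found >= MIN_SECTIONS:
                break
        return html, found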
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..113c3d99d8 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
- OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+ OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.5, this is the median line length.')),
+ 'default is 0.45, this is the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
From faf15b2f3d611594352721d4d06407025fea1320 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 11 Sep 2010 13:09:23 +1000
Subject: [PATCH 05/43] preprocess merge gone wrong, fixing
---
src/calibre/ebooks/conversion/preprocess.py | 25 ++++++---------------
1 file changed, 7 insertions(+), 18 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e2364d961f..24a389e65c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -168,7 +168,6 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*()*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*()*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*()*\s*U', re.UNICODE), lambda match: u'Ù'),
-<<<<<<< TREE
(re.compile(u'`\s*()*\s*e', re.UNICODE), lambda match: u'è'),
(re.compile(u'`\s*()*\s*E', re.UNICODE), lambda match: u'È'),
(re.compile(u'`\s*()*\s*i', re.UNICODE), lambda match: u'ì'),
@@ -176,13 +175,6 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*()*\s*a', re.UNICODE), lambda match: u'à'),
(re.compile(u'`\s*()*\s*A', re.UNICODE), lambda match: u'À'),
- #(re.compile(u'a\s*()*\s*`', re.UNICODE), lambda match: u'à'),
- #(re.compile(u'A\s*()*\s*`', re.UNICODE), lambda match: u'À'),
- #(re.compile(u'o\s*()*\s*`', re.UNICODE), lambda match: u'ò'),
- #(re.compile(u'O\s*()*\s*`', re.UNICODE), lambda match: u'Ò'),
-=======
->>>>>>> MERGE-SOURCE
-
# ´
(re.compile(u'´\s*()*\s*a', re.UNICODE), lambda match: u'á'),
(re.compile(u'´\s*()*\s*A', re.UNICODE), lambda match: u'Á'),
@@ -218,14 +210,7 @@ class HTMLPreProcessor(object):
# ¸
(re.compile(u'¸\s*()*\s*c', re.UNICODE), lambda match: u'ç'),
(re.compile(u'¸\s*()*\s*C', re.UNICODE), lambda match: u'Ç'),
-
-<<<<<<< TREE
- # If pdf printed from a browser then the header/footer has a reliable pattern
- (re.compile(r'((?<=)\s*file:////?[A-Z].* |file:////?[A-Z].* (?=\s*))', re.IGNORECASE), lambda match: ''),
-
- # Center separator lines
- (re.compile(u' \s*(?P([*#•]+\s*)+)\s* '), lambda match: '
\n
' + match.group(1) + '
'),
-=======
+
# ˛
(re.compile(u'˛\s*()*\s*a', re.UNICODE), lambda match: u'ą'),
(re.compile(u'˛\s*()*\s*A', re.UNICODE), lambda match: u'Ą'),
@@ -235,8 +220,12 @@ class HTMLPreProcessor(object):
# ˙
(re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
->>>>>>> MERGE-SOURCE
+
+ # If pdf printed from a browser then the header/footer has a reliable pattern
+ (re.compile(r'((?<=)\s*file:////?[A-Z].* |file:////?[A-Z].* (?=\s*))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u' \s*(?P([*#•]+\s*)+)\s* '), lambda match: '
]+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-
+ (re.compile(r' \s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*( \s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?'), chap_head),
+
# Have paragraphs show better
(re.compile(r''), lambda match : '
'),
# Clean up spaces
@@ -322,21 +324,29 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+
+ # unwrap hyphenation - moved here so it's executed after header/footer removal
+ if is_pdftohtml:
+ # unwrap visible dashes and hyphens - don't delete the hyphen, as 50% or more of the
+ # time these hyphens are for compound words, formatting, etc
+ end_rules.append((re.compile(u'(?<=[-–—])\s*
\n'
+
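The hyphen rule appended just above arrives mangled here; the intent described in its comment, as an illustrative sketch (the <br>-based pattern is mine, not the one from the patch):

    import re

    # Join a line that ends in a visible dash or hyphen with the next line, keeping
    # the hyphen itself, since it is usually part of a compound word rather than a
    # soft hyphenation break.
    hyphen_unwrap = re.compile(
        u'(?<=[-\u2013\u2014])\\s*<br[^>]*>\\s*(?=[a-zA-Z\\d])', re.UNICODE)
    # html = hyphen_unwrap.sub('', html)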
+ def chapter_link(self, match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return ' '
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return ' \n
'+chap+'
'
+
+ def no_markup(self, raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile('
', re.DOTALL)
+ line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+ htm_end = htm_end_ere.findall(raw)
+ line_end = line_end_ere.findall(raw)
+ tot_htm_ends = len(htm_end)
+ tot_ln_fds = len(line_end)
+ self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ min_lns = tot_ln_fds * percent
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+ if min_lns > tot_htm_ends:
+ return True
+
+ def __call__(self, html):
+ self.log("********* Preprocessing HTML *********")
+ # remove non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty tags to simplify other processing
+ html = re.sub(ur'\s*\s*', ' ', html)
+ # Get rid of empty span tags
+ html = re.sub(r"\s*]*>\s*", " ", html)
+
+ # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ linereg = re.compile('(?<=
)', re.IGNORECASE)
+ blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blanklines = blankreg.findall(html)
+ lines = linereg.findall(html)
+ if len(lines) > 1:
+ self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+ if float(len(blanklines)) / float(len(lines)) > 0.40:
+ self.log("deleting blank lines")
+ html = blankreg.sub('', html)
+ # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
+ html = re.sub(r"\s*", "\n", html)
+ html = re.sub(r"\s*
\s*", "\n
", html)
+
+ # some lit files don't have any
tags or equivalent, check and
+ # mark up line endings if required before proceeding
+ if self.no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n
', html)
+
+ # detect chapters/sections to match xpath or splitting logic
+ #
+ # Start with most typical chapter headings
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(self.chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+ #
+ # Unwrap lines using punctuation if the median length of all lines is less than 200
+ length = line_length('html', html, 0.4)
+ self.log("*** Median line length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ if length < 200:
+ self.log("Unwrapping Lines")
+ html = unwrap.sub(' ', html)
+ # If still no sections after unwrapping lines break on lines with no punctuation
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+ #self.log(html)
+ chapdetect3 = re.compile(r'(
)(?P)?', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_head, html)
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+'
'+'
', html)
+
+ return html
\ No newline at end of file
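One step of the __call__ pipeline above, the 40% blank-paragraph cleanup, spelled out as a self-contained sketch (regexes simplified relative to blankreg/linereg in the patch):

    import re

    def drop_blank_paragraphs(html, threshold=0.40):
        # If a large share of the paragraphs are empty (possibly wrapping only a
        # bare <b>/<i>/<u>), they are being used for spacing; delete them instead
        # of letting them survive as stray gaps.
        blank = re.compile(r'<p[^>]*>\s*(?:<[ibu]>)?\s*(?:</[ibu]>)?\s*</p>',
                           re.IGNORECASE)
        total = len(re.findall(r'</p>', html, re.IGNORECASE))
        blanks = len(blank.findall(html))
        if total and float(blanks) / total > threshold:
            html = blank.sub('', html)
        return html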
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 35a8a1a9bc..e83216ae1f 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
- if not hasattr(self, 'log'):
- from calibre.utils.logging import default_log
- self.log = default_log
- self.log("********* Preprocessing HTML - HTML Input plugin *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub('
'+'\g'+'
\n', html)
- # Unwrap lines using punctuation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "\n", html)
- length = line_length('html', html, 0.4)
- self.log.debug("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
+ preprocessor = PreProcessor(html)
+ html = preprocessor(html)
return html
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 35dad501be..58e7bc84bf 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,8 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re
-
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class LITInput(InputFormatPlugin):
@@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin):
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = set(['lit'])
- html_preprocess_sections = 0
def convert(self, stream, options, file_ext, log,
accelerators):
@@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
-
- def chapter_head(match):
- chap = match.group('chap')
- title = match.group('title')
- if not title:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
- return '
\n'
-
- def chapter_link(match):
- chap = match.group('sectionlink')
- if not chap:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
- return ' '
- else:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
- return ' \n
'+chap+'
'
-
-
- def no_markup(raw, percent):
- '''
- Detects total marked up line endings in the file. raw is the text to
- inspect. Percent is the minimum percent of line endings which should
- be marked up to return true.
- '''
- htm_end_ere = re.compile('', re.DOTALL)
- line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
- htm_end = htm_end_ere.findall(raw)
- line_end = line_end_ere.findall(raw)
- tot_htm_ends = len(htm_end)
- tot_ln_fds = len(line_end)
- self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
-
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
-
- min_lns = tot_ln_fds * percent
- self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
- if min_lns > tot_htm_ends:
- return True
-
- self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
- html = re.sub(ur'\u00a0', ' ', html)
- # Get rid of empty tags to simplify other processing
- html = re.sub(ur'\s*\s*', ' ', html)
- # Get rid of empty span tags
- html = re.sub(r"\s*]*>\s*", " ", html)
-
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=
)', re.IGNORECASE)
- blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
- blanklines = blankreg.findall(html)
- lines = linereg.findall(html)
- if len(lines) > 1:
- self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
- if float(len(blanklines)) / float(len(lines)) > 0.40:
- self.log("deleting blank lines")
- html = blankreg.sub('', html)
- # Arrange line feeds and tags so the line_length and no_markup functions work correctly
- html = re.sub(r"\s*", "\n", html)
-
- # some lit files don't have any
tags or equivalent, check and
- # mark up line endings if required before proceeding
- if no_markup(html, 0.1):
- self.log("not enough paragraph markers, adding now")
- add_markup = re.compile('(?)(\n)')
- html = add_markup.sub('
\n
', html)
-
- # detect chapters/sections to match xpath or splitting logic
- #
- # Mark split points based on embedded links
- chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P[^\s<]+(\s*[^\s<]+){0,4})?\s*()?\s*((i|b|u)>){0,2}\s*', re.IGNORECASE)
- html = chaplink.sub(chapter_link, html)
- # Continue with alternate patterns, start with most typical chapter headings
- if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
- html = chapdetect.sub(chapter_head, html)
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(chapter_head, html)
- #
- # Unwrap lines using punctuation if the median length of all lines is less than 150
- length = line_length('html', html, 0.4)
- self.log("*** Median line length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
- # If still no sections after unwrapping lines break on lines with no punctuation
- if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
- #self.log(html)
- chapdetect3 = re.compile(r'(
)(?P)?', re.IGNORECASE)
- html = chapdetect3.sub(chapter_head, html)
- # search for places where a first or second level heading is immediately followed by another
- # top level heading. demote the second heading to h3 to prevent splitting between chapter
- # headings and titles, images, etc
- doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
- html = doubleheading.sub('\g'+'
'+'
', html)
-
+ preprocessor = PreProcessor(html)
+ html = preprocessor(html)
return html
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.45, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 11 Sep 2010 22:15:09 +1000
Subject: [PATCH 09/43] replaced messed up rtf file
---
src/calibre/ebooks/rtf/preprocess.py | 624 +++++++++++++--------------
1 file changed, 289 insertions(+), 335 deletions(-)
diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py
index ee45da697f..a3076651fd 100644
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@@ -1,390 +1,344 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal '
+__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'
-import functools, re
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
-from calibre import entity_to_unicode
+At this point this will tokenize an RTF file and then rebuild it from the tokens.
+In the process the UTF8 tokens are altered to be supported by RTF2XML while remaining compliant with the RTF specification.
+"""
-XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
-SVG_NS = 'http://www.w3.org/2000/svg'
-XLINK_NS = 'http://www.w3.org/1999/xlink'
+class tokenDelimitatorStart():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'{'
+ def __repr__(self):
+ return '{'
-convert_entities = functools.partial(entity_to_unicode,
- result_exceptions = {
- u'<' : '<',
- u'>' : '>',
- u"'" : ''',
- u'"' : '"',
- u'&' : '&',
- })
-_span_pat = re.compile('', re.DOTALL|re.IGNORECASE)
+class tokenDelimitatorEnd():
+ def __init__(self):
+ pass
+ def toRTF(self):
+ return b'}'
+ def __repr__(self):
+ return '}'
-LIGATURES = {
-# u'\u00c6': u'AE',
-# u'\u00e6': u'ae',
-# u'\u0152': u'OE',
-# u'\u0153': u'oe',
-# u'\u0132': u'IJ',
-# u'\u0133': u'ij',
-# u'\u1D6B': u'ue',
- u'\uFB00': u'ff',
- u'\uFB01': u'fi',
- u'\uFB02': u'fl',
- u'\uFB03': u'ffi',
- u'\uFB04': u'ffl',
- u'\uFB05': u'ft',
- u'\uFB06': u'st',
- }
+class tokenControlWord():
+ def __init__(self, name, separator = ''):
+ self.name = name
+ self.separator = separator
+ def toRTF(self):
+ return self.name + self.separator
+ def __repr__(self):
+ return self.name + self.separator
-_ligpat = re.compile(u'|'.join(LIGATURES))
+class tokenControlWordWithNumericArgument():
+ def __init__(self, name, argument, separator = ''):
+ self.name = name
+ self.argument = argument
+ self.separator = separator
+ def toRTF(self):
+ return self.name + repr(self.argument) + self.separator
+ def __repr__(self):
+ return self.name + repr(self.argument) + self.separator
-def sanitize_head(match):
- x = match.group(1)
- x = _span_pat.sub('', x)
- return '\n%s\n' % x
+class tokenControlSymbol():
+ def __init__(self, name):
+ self.name = name
+ def toRTF(self):
+ return self.name
+ def __repr__(self):
+ return self.name
-def chap_head(match):
- chap = match.group('chap')
- title = match.group('title')
- if not title:
- return '
'+chap+'
\n'
- else:
- return '
'+chap+'
\n
'+title+'
\n'
+class tokenData():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return self.data
+ def __repr__(self):
+ return self.data
-def wrap_lines(match):
- ital = match.group('ital')
- if not ital:
- return ' '
- else:
- return ital+' '
+class tokenBinN():
+ def __init__(self, data, separator = ''):
+ self.data = data
+ self.separator = separator
+ def toRTF(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+ def __repr__(self):
+ return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+ def __init__(self, data):
+ self.data = data
+ def toRTF(self):
+ return "\\'" + self.data
+ def __repr__(self):
+ return "\\'" + self.data
+
+class tokenUnicode():
+ def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+ self.data = data
+ self.separator = separator
+ self.current_ucn = current_ucn
+ self.eqList = eqList
+ def toRTF(self):
+ result = '\\u' + repr(self.data) + ' '
+ ucn = self.current_ucn
+ if len(self.eqList) < ucn:
+ ucn = len(self.eqList)
+ result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+ i = 0
+ for eq in self.eqList:
+ if i >= ucn:
+ break
+ result = result + eq.toRTF()
+ return result
+ def __repr__(self):
+ return '\\u' + repr(self.data)
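A tiny usage illustration of the token classes defined above (the sample values are mine): each token serialises itself back to RTF, so a document round-trips by concatenating toRTF() over the token list.

    tokens = [tokenDelimitatorStart(),
              tokenControlWordWithNumericArgument('\\rtf', 1),
              tokenData('Hello'),
              tokenDelimitatorEnd()]
    rtf = ''.join(t.toRTF() for t in tokens)   # -> '{\\rtf1Hello}'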
-def line_length(format, raw, percent):
- '''
- raw is the raw text to find the line length to use for wrapping.
- percentage is a decimal number, 0 - 1 which is used to determine
- how far in the list of line lengths to use. The list of line lengths is
- ordered smallest to largest and does not include duplicates. 0.5 is the
- median value.
- '''
- raw = raw.replace(' ', ' ')
- if format == 'html':
- linere = re.compile('(?<=
)', re.DOTALL)
- elif format == 'pdf':
- linere = re.compile('(?<= ).*?(?= )', re.DOTALL)
- lines = linere.findall(raw)
- print "percent is " + str(percent)
+def isAsciiLetter(value):
+ return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+def isDigit(value):
+ return (value >= '0') and (value <= '9')
- if not lengths:
- return 0
+def isChar(value, char):
+ return value == char
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
-
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
-
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
-
- index = int(len(lengths) * percent) - 1
-
- return lengths[index]
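A quick worked example of the selection the removed line_length() docstring describes (the numbers are invented):

    # lengths seen:        [12, 45, 45, 63, 70, 200]
    # de-duplicated:       [12, 45, 63, 70, 200]   -> average = 78
    # drop > 2 * average:  [12, 45, 63, 70]        (200 is discarded)
    # percent = 0.5:       index = int(4 * 0.5) - 1 = 1  -> 45, the "median" length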
+def isString(buffer, string):
+ return buffer == string
-class CSSPreProcessor(object):
+class RtfTokenParser():
+ def __init__(self, tokens):
+ self.tokens = tokens
+ self.process()
+ self.processUnicode()
- PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
+ def process(self):
+ i = 0
+ newTokens = []
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenControlSymbol):
+ if isString(self.tokens[i].name, "\\'"):
+ i = i + 1
+ if not isinstance(self.tokens[i], tokenData):
+ raise Exception('Error: token8bitChar without data.')
+ if len(self.tokens[i].data) < 2:
+ raise Exception('Error: token8bitChar without data.')
+ newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+ if len(self.tokens[i].data) > 2:
+ newTokens.append(tokenData(self.tokens[i].data[2:]))
+ i = i + 1
+ continue
- def __call__(self, data, add_namespace=False):
- from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
- data = self.PAGE_PAT.sub('', data)
- if not add_namespace:
- return data
- ans, namespaced = [], False
- for line in data.splitlines():
- ll = line.lstrip()
- if not (namespaced or ll.startswith('@import') or
- ll.startswith('@charset')):
- ans.append(XHTML_CSS_NAMESPACE.strip())
- namespaced = True
- ans.append(line)
+ newTokens.append(self.tokens[i])
+ i = i + 1
- return u'\n'.join(ans)
+ self.tokens = list(newTokens)
-class HTMLPreProcessor(object):
+ def processUnicode(self):
+ i = 0
+ newTokens = []
+ ucNbStack = [1]
+ while i < len(self.tokens):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ ucNbStack.pop()
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+ if isString(self.tokens[i].name, '\\uc'):
+ ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+ newTokens.append(self.tokens[i])
+ i = i + 1
+ continue
+ if isString(self.tokens[i].name, '\\u'):
+ x = i
+ j = 0
+ i = i + 1
+ replace = []
+ partialData = None
+ ucn = ucNbStack[len(ucNbStack) - 1]
+ while (i < len(self.tokens)) and (j < ucn):
+ if isinstance(self.tokens[i], tokenDelimitatorStart):
+ break
+ if isinstance(self.tokens[i], tokenDelimitatorEnd):
+ break
+ if isinstance(self.tokens[i], tokenData):
+ if len(self.tokens[i].data) >= ucn - j:
+ replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+ if len(self.tokens[i].data) > ucn - j:
+ partialData = tokenData(self.tokens[i].data[ucn - j:])
+ i = i + 1
+ break
+ else:
+ replace.append(self.tokens[i])
+ j = j + len(self.tokens[i].data)
+ i = i + 1
+ continue
+ if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+ replace.append(self.tokens[i])
+ i = i + 1
+ j = j + 1
+ continue
+ raise Exception('Error: incorrect utf replacement.')
- PREPROCESS = [
- # Some idiotic HTML generators (Frontpage I'm looking at you)
- # Put all sorts of crap into
'),
+ if isChar(self.rtfData[i], '{'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorStart())
+ i = i + 1
+ continue
- # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(u'(?<=[-–—])\s* \s*(?=[[a-z\d])'), lambda match: ''),
+ if isChar(self.rtfData[i], '}'):
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
+ self.tokens.append(tokenDelimitatorEnd())
+ i = i + 1
+ continue
- # Remove gray background
- (re.compile(r'
]+>'), lambda match : ''),
+ if isChar(self.rtfData[i], '\\'):
+ if i + 1 >= len(self.rtfData):
+ raise Exception('Error: Control character found at the end of the document.')
- # Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+ if lastDataStart > -1:
+ self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+ lastDataStart = -1
- # Have paragraphs show better
- (re.compile(r''), lambda match : '
'),
- # Clean up spaces
- (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Add space before and after italics
- (re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
-
- ]
+ tokenStart = i
+ i = i + 1
- # Fix Book Designer markup
- BOOK_DESIGNER = [
- # HR
- (re.compile('
)(?P)?', re.IGNORECASE)
- html = chapdetect3.sub(self.chapter_head, html)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?((i|b|u)>){0,2}\s*()?\s*((i|b|u)>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 36848ddb8b..584d631d0b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,10 +408,6 @@ class Page(object):
# Fraction of text height that two strings' bottoms can differ by
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
-
- # Percentage of the page height which should be considered header
- # or footer to be discarded from reflow considerations
- HEAD_FOOTER_MARGIN
# Multiplies the average line height when determining row height
# of a particular element to detect columns.
From cdb696f63bc39b9327abe809fa71e94baa6e0b86 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 13 Sep 2010 00:12:21 +1000
Subject: [PATCH 11/43] enhanced preprocessing class - looking pretty good
---
src/calibre/ebooks/conversion/preprocess.py | 18 ++--
src/calibre/ebooks/conversion/utils.py | 98 +++++++++++++++------
2 files changed, 82 insertions(+), 34 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 46308b2ea0..f6277956c8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,7 +62,6 @@ def wrap_lines(match):
else:
return ital+' '
-
def line_length(format, raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
@@ -76,6 +75,8 @@ def line_length(format, raw, percent):
linere = re.compile('(?<=
)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<= ).*?(?= )', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -223,14 +224,15 @@ class HTMLPreProcessor(object):
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove tags
- (re.compile(r'', re.IGNORECASE), lambda match: ' '),
+ (re.compile(r'', re.IGNORECASE), lambda match: ' '),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
- (re.compile(r' \s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*( \s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?'), chap_head),
+ (re.compile(r' \s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*([ibu]>){0,2})\s*( \s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s* )?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r' \s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*( \s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*( ))?'), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : '
'),
@@ -238,8 +240,7 @@ class HTMLPreProcessor(object):
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
-
+ (re.compile(r'(?=\w)'), lambda match: ' '),
]
# Fix Book Designer markup
@@ -327,10 +328,11 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
- # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>| ]*>)\n?((?=()?\s*\w+(\s+\w+)?()?( ]*>|?p[^>]*>))((?P.*)( ]*>|?p[^>]*>)))?'), chap_head))
+ end_rules.append((re.compile(r'
'+span
+
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
@@ -48,7 +63,7 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
- self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
@@ -56,13 +71,18 @@ class PreProcessor(object):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur'
[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty tags to simplify other processing
html = re.sub(ur'\s*\s*', ' ', html)
@@ -83,41 +103,67 @@ class PreProcessor(object):
html = re.sub(r"\s*
", "
\n", html)
html = re.sub(r"\s*
\s*", "\n
", html)
- # some lit files don't have any
tags or equivalent, check and
- # mark up line endings if required before proceeding
+ # some lit files don't have any
tags or equivalent (generally just plain text between
+ #
tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?)(\n)')
html = add_markup.sub('
\n
', html)
# detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(']*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
- # Start with most typical chapter headings
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
- html = chapdetect.sub(self.chapter_head, html)
+ # Start with most typical chapter headings, get more aggressive until one works
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
- html = chapdetect2.sub(self.chapter_head, html)
- #
- # Unwrap lines using punctation if the median length of all lines is less than 200
- length = line_length('html', html, 0.4)
- self.log("*** Median line length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 200:
- self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
- # If still no sections after unwrapping lines break on lines with no punctuation
+ # Unwrap lines
+ #
+ self.log("Unwrapping Lines")
+ # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+ # span are used for hard line breaks, p for new paragraphs. Determine which is used so
+ # that lines can be wrapped across page boundaries
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+        self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctation if the median length of all lines is less than 200
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+
+ # If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
- self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
+ self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
#self.log(html)
- chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?((i|b|u)>){0,2}\s*()?\s*((i|b|u)>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
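The unwrap step above hinges on one number: the median length of the text inside each paragraph. If most paragraphs are shorter than the threshold, the file was almost certainly hard-wrapped, and adjacent paragraphs whose boundary falls after a lower-case letter or comma are joined. A rough, self-contained sketch of that heuristic (the 200-character threshold mirrors the comment above; the tag handling is simplified and is not calibre's line_length() implementation):

    import re

    def median_text_line_length(html):
        # measure only the text between <p>...</p> pairs, ignoring inline markup
        paras = re.findall(r'<p[^>]*>(.*?)</p>', html, re.DOTALL)
        lengths = sorted(len(re.sub(r'<[^>]*>', '', p).strip()) for p in paras)
        lengths = [l for l in lengths if l]
        return lengths[len(lengths) // 2] if lengths else 0

    def unwrap_short_lines(html, threshold=200):
        # join a paragraph break only where the previous fragment clearly
        # continues (ends in a lower-case letter, comma, semicolon or colon)
        if median_text_line_length(html) < threshold:
            html = re.sub(r'(?<=[a-z,;:])\s*</p>\s*<p[^>]*>\s*', ' ', html)
        return html
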
From 6cc332089a421e6100fa4937c5126309c483e132 Mon Sep 17 00:00:00 2001
From: Starson17
Date: Sun, 12 Sep 2010 11:28:24 -0400
Subject: [PATCH 12/43] Change Merge and Safe Merge warnings re ISBN
---
src/calibre/gui2/actions/edit_metadata.py | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index f0232d9859..878ba77a43 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction):
dest_id, src_books, src_ids = self.books_to_merge(rows)
if safe_merge:
             if not confirm('<p>'+_(
-                    'All book formats and metadata from the selected books '
-                    'will be added to the first selected book.<br><br>'
+                    'Book formats and metadata from the selected books '
+                    'will be added to the first selected book. '
+                    'ISBN will not be merged.<br><br>'
                     'The second and subsequently selected books will not '
                     'be deleted or changed.<br><br>'
                     'Please confirm you want to proceed.')
@@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction):
self.merge_metadata(dest_id, src_ids)
else:
             if not confirm('<p>'+_(
-                    'All book formats and metadata from the selected books will be merged '
-                    'into the first selected book.<br><br>'
+                    'Book formats and metadata from the selected books will be merged '
+                    'into the first selected book. '
+                    'ISBN will not be merged.<br><br>'
                     'After merger the second and '
                     'subsequently selected books will be deleted.<br><br>'
                     'All book formats of the first selected book will be kept '
From 78874a9117941de749f3b09934be8588181dd4b7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Sep 2010 09:32:16 -0600
Subject: [PATCH 13/43] Use the new sorting code in the content server as well.
---
src/calibre/library/caches.py | 153 +-------------------------
src/calibre/library/server/content.py | 38 +++----
2 files changed, 18 insertions(+), 173 deletions(-)
diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index dfd7086076..4f795ab733 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re, itertools, functools
+import re, itertools
from itertools import repeat
from datetime import timedelta
from threading import Thread, RLock
@@ -584,39 +584,7 @@ class ResultCache(SearchQueryParser):
# Sorting functions {{{
- def seriescmp(self, sidx, siidx, x, y, library_order=None):
- try:
- if library_order:
- ans = cmp(title_sort(self._data[x][sidx].lower()),
- title_sort(self._data[y][sidx].lower()))
- else:
- ans = cmp(self._data[x][sidx].lower(),
- self._data[y][sidx].lower())
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][sidx], self._data[y][sidx])
- if ans != 0: return ans
- return cmp(self._data[x][siidx], self._data[y][siidx])
-
- def cmp(self, loc, x, y, asstr=True, subsort=False):
- try:
- ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \
- asstr else cmp(self._data[x][loc], self._data[y][loc])
- except AttributeError: # Some entries may be None
- ans = cmp(self._data[x][loc], self._data[y][loc])
- except TypeError: ## raised when a datetime is None
- x = self._data[x][loc]
- if x is None:
- x = UNDEFINED_DATE
- y = self._data[y][loc]
- if y is None:
- y = UNDEFINED_DATE
- return cmp(x, y)
- if subsort and ans == 0:
- idx = self.FIELD_MAP['sort']
- return cmp(self._data[x][idx].lower(), self._data[y][idx].lower())
- return ans
-
- def sanitize_field_name(self, field):
+ def sanitize_sort_field_name(self, field):
field = field.lower().strip()
if field not in self.field_metadata.iterkeys():
if field in ('author', 'tag', 'comment'):
@@ -627,38 +595,10 @@ class ResultCache(SearchQueryParser):
return field
def sort(self, field, ascending, subsort=False):
- field = self.sanitize_field_name(field)
- as_string = field not in ('size', 'rating', 'timestamp')
-
- if self.first_sort:
- subsort = True
- self.first_sort = False
- if self.field_metadata[field]['is_custom']:
- if self.field_metadata[field]['datatype'] == 'series':
- fcmp = functools.partial(self.seriescmp,
- self.field_metadata[field]['rec_index'],
- self.field_metadata.cc_series_index_column_for(field),
- library_order=tweaks['title_series_sorting'] == 'library_order')
- else:
- as_string = self.field_metadata[field]['datatype'] in ('comments', 'text')
- field = self.field_metadata[field]['colnum']
- fcmp = functools.partial(self.cmp, self.FIELD_MAP[field],
- subsort=subsort, asstr=as_string)
- elif field == 'series':
- fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'],
- self.FIELD_MAP['series_index'],
- library_order=tweaks['title_series_sorting'] == 'library_order')
- else:
- fcmp = functools.partial(self.cmp, self.field_metadata[field]['rec_index'],
- subsort=subsort, asstr=as_string)
- self._map.sort(cmp=fcmp, reverse=not ascending)
- tmap = list(itertools.repeat(False, len(self._data)))
- for x in self._map_filtered:
- tmap[x] = True
- self._map_filtered = [x for x in self._map if tmap[x]]
+ self.multisort([(field, ascending)])
def multisort(self, fields=[], subsort=False):
- fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields]
+ fields = [(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields]
keys = self.field_metadata.field_keys()
fields = [x for x in fields if x[0] in keys]
if subsort and 'sort' not in [x[0] for x in fields]:
@@ -671,6 +611,7 @@ class ResultCache(SearchQueryParser):
self._map.sort(key=keyg, reverse=not fields[0][1])
else:
self._map.sort(key=keyg)
+
tmap = list(itertools.repeat(False, len(self._data)))
for x in self._map_filtered:
tmap[x] = True
@@ -733,87 +674,3 @@ class SortKeyGenerator(object):
# }}}
-if __name__ == '__main__':
- # Testing.timing for new multi-sort {{{
- import time
-
- from calibre.library import db
- db = db()
-
- db.refresh()
-
- fields = db.field_metadata.field_keys()
-
- print fields
-
-
- def do_single_sort(meth, field, order):
- if meth == 'old':
- db.data.sort(field, order)
- else:
- db.data.multisort([(field, order)])
-
- def test_single_sort(field):
- for meth in ('old', 'new'):
- ttime = 0
- NUM = 10
- asc = desc = None
- for i in range(NUM):
- db.data.sort('id', False)
- st = time.time()
- do_single_sort(meth, field, True)
- asc = db.data._map
- do_single_sort(meth, field, False)
- desc = db.data._map
- ttime += time.time() - st
- yield (ttime/NUM, asc, desc)
-
-
- print 'Running single sort differentials'
- for field in fields:
- if field in ('search', 'id', 'news', 'flags'): continue
- print '\t', field, db.field_metadata[field]['datatype']
- old, new = test_single_sort(field)
- if old[1] != new[1] or old[2] != new[2]:
- print '\t\t', 'Sort failure!'
- raise SystemExit(1)
- print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
-
- def do_multi_sort(meth, ms):
- if meth == 'new':
- db.data.multisort(ms)
- else:
- for s in reversed(ms):
- db.data.sort(*s)
-
- def test_multi_sort(ms):
- for meth in ('old', 'new'):
- ttime = 0
- NUM = 10
- for i in range(NUM):
- db.data.sort('id', False)
- st = time.time()
- do_multi_sort(meth, ms)
- ttime += time.time() - st
- yield (ttime/NUM, db.data._map)
-
- print 'Running multi-sort differentials'
-
- for ms in [
- [('timestamp', False), ('author', True), ('title', False)],
- [('size', True), ('tags', True), ('author', False)],
- [('series', False), ('title', True)],
- [('size', True), ('tags', True), ('author', False), ('pubdate',
- True), ('tags', False), ('formats', False), ('uuid', True)],
-
- ]:
- print '\t', ms
- db.data.sort('id', False)
- old, new = test_multi_sort(ms)
- if old[1] != new[1]:
- print '\t\t', 'Sort failure!'
- raise SystemExit()
- print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0])
-
- # }}}
-
diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index 6784abd8f4..ecb467b4c2 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re, os, cStringIO, operator
+import re, os, cStringIO
import cherrypy
try:
@@ -16,7 +16,15 @@ except ImportError:
from calibre import fit_image, guess_type
from calibre.utils.date import fromtimestamp
-from calibre.ebooks.metadata import title_sort
+from calibre.library.caches import SortKeyGenerator
+
+class CSSortKeyGenerator(SortKeyGenerator):
+
+ def __init__(self, fields, fm):
+ SortKeyGenerator.__init__(self, fields, fm, None)
+
+ def __call__(self, record):
+ return self.itervals(record).next()
class ContentServer(object):
@@ -47,32 +55,12 @@ class ContentServer(object):
def sort(self, items, field, order):
- field = field.lower().strip()
- if field == 'author':
- field = 'authors'
- if field == 'date':
- field = 'timestamp'
+ field = self.db.data.sanitize_sort_field_name(field)
if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'):
raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field)
- cmpf = cmp if field in ('rating', 'size', 'timestamp') else \
- lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '')
- if field == 'series':
- items.sort(cmp=self.seriescmp, reverse=not order)
- else:
- lookup = 'sort' if field == 'title' else field
- lookup = 'author_sort' if field == 'authors' else field
- field = self.db.FIELD_MAP[lookup]
- getter = operator.itemgetter(field)
- items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order)
+ keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata)
+ items.sort(key=keyg, reverse=not order)
- def seriescmp(self, x, y):
- si = self.db.FIELD_MAP['series']
- try:
- ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower()))
- except AttributeError: # Some entries may be None
- ans = cmp(x[si], y[si])
- if ans != 0: return ans
- return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']])
# }}}
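The patch above drops the cmp-based comparators in favour of key-based sorting, with sort() now delegating to multisort(). The essential trick for multi-field sorting with mixed ascending/descending directions is either a composite sort key (the SortKeyGenerator approach) or, equivalently, repeated stable sorts applied from the lowest-priority field to the highest. A small illustration of the stable-sort form (field access is simplified to dictionary lookup; this is not calibre's implementation):

    def multisort(records, fields):
        # fields: list of (name, ascending) pairs, highest priority first.
        # list.sort() is stable, so sorting by the least significant field first
        # and the most significant field last produces the combined ordering.
        for name, ascending in reversed(fields):
            records.sort(key=lambda r: r[name], reverse=not ascending)
        return records

    books = [{'title': 'B', 'size': 10}, {'title': 'A', 'size': 10}, {'title': 'C', 'size': 5}]
    multisort(books, [('size', False), ('title', True)])
    # -> A (10), B (10), C (5): size descending, ties broken by title ascending
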
From 80c976e0f24f05a5ee7a9bfce50bf7745215e339 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Sep 2010 11:11:00 -0600
Subject: [PATCH 14/43] Fix #6794 (Updated recipes for Infobae and NSPM)
---
resources/recipes/infobae.recipe | 82 ++++++++------------------------
resources/recipes/nspm.recipe | 11 ++++-
2 files changed, 30 insertions(+), 63 deletions(-)
diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index cda9bf83d2..b7f9cd3c6c 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
__license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic '
+__copyright__ = '2008-2010, Darko Miletic '
'''
infobae.com
'''
-import re
-import urllib, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
- language = 'es'
- lang = 'es-AR'
-
+ language = 'es'
encoding = 'cp1252'
- cover_url = 'http://www.infobae.com/imgs/header/header.gif'
+ masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
- preprocess_regexps = [(re.compile(
- r''), lambda m:'')]
-
-
- html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
- extra_css = '''
- .col-center{font-family:Arial,Helvetica,sans-serif;}
- h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
- .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
- '''
-
- keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-
- remove_tags = [
- dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
- dict(name='a', attrs={'name' : 'comentario',}),
- dict(name='iframe'),
- dict(name='img', alt = "Ver galerias de imagenes"),
-
- ]
-
+ remove_empty_feeds = True
+ extra_css = '''
+ body{font-family:Arial,Helvetica,sans-serif;}
+ .popUpTitulo{color:#0D4261; font-size: xx-large}
+ '''
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ , 'linearize_tables' : True
+ }
+
feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe):
,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
]
-# def print_version(self, url):
-# main, sep, article_part = url.partition('contenidos/')
-# article_id, rsep, rrest = article_part.partition('-')
-# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
- def get_article_url(self, article):
- ans = article.get('link').encode('utf-8')
- parts = list(urlparse.urlparse(ans))
- parts[2] = urllib.quote(parts[2])
- ans = urlparse.urlunparse(parts)
- return ans.decode('utf-8')
-
-
- def preprocess_html(self, soup):
-
- for tag in soup.head.findAll('strong'):
- tag.extract()
- for tag in soup.findAll('meta'):
- del tag['content']
- tag.extract()
-
- mtag = '\n\n'
- soup.head.insert(0,mtag)
- for item in soup.findAll(style=True):
- del item['style']
-
- return soup
+ def print_version(self, url):
+ article_part = url.rpartition('/')[2]
+ article_id= article_part.partition('-')[0]
+ return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def postprocess_html(self, soup, first):
-
for tag in soup.findAll(name='strong'):
tag.name = 'b'
-
return soup
diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 13ff42b277..29f2cfc5e3 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,6 +6,7 @@ nspm.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class Nspm(BasicNewsRecipe):
title = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
encoding = 'utf-8'
language = 'sr'
delay = 2
+ remove_empty_feeds = True
publication_type = 'magazine'
masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed','script','meta','base','iframe'])
,dict(attrs={'class':'buttonheading'})
]
- remove_tags_after = dict(attrs={'class':'article_separator'})
- remove_attributes = ['width','height']
+ remove_tags_before = dict(attrs={'class':'contentheading'})
+ remove_tags_after = dict(attrs={'class':'article_separator'})
+ remove_attributes = ['width','height']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
+ for item in soup.body.findAll('h1'):
+ nh = NavigableString(item.a.string)
+ item.a.extract()
+ item.insert(0,nh)
return self.adeify_images(soup)
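The reworked Infobae recipe derives the printer-friendly page from the article slug instead of fetching and re-encoding the feed link. The transformation performed by print_version() boils down to this (the URL below is invented for illustration; real links come from the RSS feed):

    url = 'http://www.infobae.com/notas/541234-Titular-de-ejemplo'
    article_part = url.rpartition('/')[2]        # '541234-Titular-de-ejemplo'
    article_id = article_part.partition('-')[0]  # '541234'
    print 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
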
From 548417ea6b6157faf1688b3b082f3eac5476636f Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 13 Sep 2010 09:18:45 +1000
Subject: [PATCH 15/43] comments and minor tweak
---
src/calibre/ebooks/conversion/utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index abfa43e7ed..ecf030b27d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -111,7 +111,7 @@ class PreProcessor(object):
             html = add_markup.sub('</p>\n<p>', html)
# detect chapters/sections to match xpath or splitting logic
- heading = re.compile(']*>', re.IGNORECASE)
+ heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
@@ -134,7 +134,7 @@ class PreProcessor(object):
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
- # that lines can be wrapped across page boundaries
+ # that lines can be un-wrapped across page boundaries
         paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
         spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
paras = len(paras_reg.findall(html))
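The comment corrected above describes the span-versus-paragraph heuristic: OCR-derived HTML often uses <span> for hard line breaks and <p> only for real paragraphs, so when spans heavily outnumber paragraphs the unwrapper should operate on the spans. A condensed sketch of that ratio test (the 0.75 cutoff is the one used in the code above):

    import re

    def guess_line_break_format(html):
        paras = len(re.findall(r'<p[^>]*>', html, re.IGNORECASE))
        spans = len(re.findall(r'<span[^>]*>', html, re.IGNORECASE))
        if spans > 1 and float(paras) / float(spans) < 0.75:
            return 'spanned_html'   # spans carry the hard line breaks
        return 'html'
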
From de6aadee76d4dafe9b84133dc3af43ddef22fd0a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 10:15:35 -0600
Subject: [PATCH 16/43] News download: Fix bug that could break some downloads
in non ASCII locales
---
resources/recipes/xkcd.recipe | 6 +++---
src/calibre/web/feeds/__init__.py | 4 +++-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe
index 312027004e..ad0d420deb 100644
--- a/resources/recipes/xkcd.recipe
+++ b/resources/recipes/xkcd.recipe
@@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe):
(re.compile(r'()'),
lambda m: '%s%s
%s
' % (m.group(1), m.group(3), m.group(2)))
]
-
+
def parse_index(self):
INDEX = 'http://xkcd.com/archive/'
- soup = self.index_to_soup(INDEX)
+ soup = self.index_to_soup(INDEX)
articles = []
for item in soup.findAll('a', title=True):
articles.append({
'date': item['title'],
'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
'url': 'http://xkcd.com' + item['href'],
- 'title': self.tag_to_string(item).encode('UTF-8'),
+ 'title': self.tag_to_string(item),
'description': '',
'content': '',
})
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index a70cf8b664..8aef350498 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -165,7 +165,9 @@ class Feed(object):
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
- self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+ t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple())
+ self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
+ (title, t, self.title))
d = item.get('date', '')
article.formatted_date = d
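The underlying problem was formatting the article date with datetime.strftime and interpolating the result in a non-ASCII locale, where the formatted value can be undecodable bytes; the fix routes the value through calibre's locale-aware strftime on the timetuple. A rough standard-library sketch of the same defensive idea (this is only an illustration, not calibre's strftime):

    import time, locale

    def safe_timestamp(timetuple, fmt='%a, %d %b, %Y %H:%M'):
        # time.strftime returns locale-encoded bytes on Python 2; decode them
        # explicitly so later unicode interpolation cannot raise UnicodeDecodeError
        s = time.strftime(fmt, timetuple)
        if isinstance(s, bytes):
            s = s.decode(locale.getpreferredencoding() or 'utf-8', 'replace')
        return s

    print safe_timestamp(time.localtime())
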
From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Tue, 14 Sep 2010 02:56:56 +1000
Subject: [PATCH 17/43] tweaked preprocess for $, added rtf to new preprocess
logic, changed last pdf default
---
src/calibre/ebooks/conversion/preprocess.py | 2 +-
src/calibre/ebooks/rtf/input.py | 13 +++----------
src/calibre/gui2/convert/pdf_input.ui | 2 +-
3 files changed, 5 insertions(+), 12 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f6277956c8..9464be1210 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -340,7 +340,7 @@ class HTMLPreProcessor(object):
# print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 216ccf591d..d229b80c16 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -8,6 +8,7 @@ from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'
\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
-            unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(res)
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
        <double>0.010000000000000</double>
-       <double>0.500000000000000</double>
+       <double>0.450000000000000</double>
From 8b73bb52e8d551538d0c0e55e7b91b6b16f69977 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 16:42:22 -0600
Subject: [PATCH 18/43] Fix #6802 (Sovos E Reader Not Recognised / Floppy Drive
Activation)
---
src/calibre/customize/builtins.py | 3 ++-
src/calibre/devices/teclast/driver.py | 11 +++++++++++
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 4c87236e71..68df832048 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.devices.binatone.driver import README
from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK
from calibre.devices.edge.driver import EDGE
-from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS
+from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS
from calibre.devices.sne.driver import SNE
from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI
from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
@@ -557,6 +557,7 @@ plugins += [
TECLAST_K3,
NEWSMY,
IPAPYRUS,
+ SOVOS,
EDGE,
SNE,
ALEX,
diff --git a/src/calibre/devices/teclast/driver.py b/src/calibre/devices/teclast/driver.py
index 0c60a367cf..2055ff9306 100644
--- a/src/calibre/devices/teclast/driver.py
+++ b/src/calibre/devices/teclast/driver.py
@@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3):
VENDOR_NAME = 'E_READER'
WINDOWS_MAIN_MEM = ''
+class SOVOS(TECLAST_K3):
+
+ name = 'Sovos device interface'
+ gui_name = 'Sovos'
+ description = _('Communicate with the Sovos reader.')
+
+ FORMATS = ['epub', 'fb2', 'pdf', 'txt']
+
+ VENDOR_NAME = 'RK28XX'
+ WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'
+
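Supporting the Sovos reader amounts to subclassing an existing Teclast driver and registering the new class, as the two hunks above show. A hypothetical driver following the same pattern would look like this (the vendor and memory strings are placeholders, not a real device; _() is the translation helper that calibre makes available in driver modules):

    from calibre.devices.teclast.driver import TECLAST_K3

    class EXAMPLE_READER(TECLAST_K3):

        name        = 'Example device interface'
        gui_name    = 'Example'
        description = _('Communicate with the Example reader.')

        FORMATS     = ['epub', 'fb2', 'pdf', 'txt']

        # strings reported by the device over USB mass storage (placeholders)
        VENDOR_NAME      = 'EXAMPLE'
        WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC'

The class must also be imported and appended to the plugins list in src/calibre/customize/builtins.py, exactly as done for SOVOS above.
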
From fb053fe3f37d531a170bb2a1d67ccf70ea030351 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 16:58:09 -0600
Subject: [PATCH 19/43] Fix #6773 (Slightly broken CHM file)
---
src/calibre/ebooks/chm/reader.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index 67a2d36607..831c16bf6a 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -132,7 +132,11 @@ class CHMReader(CHMFile):
for path in self.Contents():
lpath = os.path.join(output_dir, path)
self._ensure_dir(lpath)
- data = self.GetFile(path)
+ try:
+ data = self.GetFile(path)
+ except:
+ self.log.exception('Failed to extract %s from CHM, ignoring'%path)
+ continue
if lpath.find(';') != -1:
# fix file names with ";" at the end, see _reformat()
lpath = lpath.split(';')[0]
From ba5de1c92d797abc1f82782c7e15bd61dfa387c5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 18:18:32 -0600
Subject: [PATCH 20/43] Conversion pipeline: When setting margins on <body>,
 explicitly set padding to 0 to override any existing padding in the input
 document
---
src/calibre/ebooks/oeb/transforms/flatcss.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index f48bdb9934..ffdc641d1e 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -138,6 +138,7 @@ class CSSFlattener(object):
float(self.context.margin_left))
bs.append('margin-right : %fpt'%\
float(self.context.margin_right))
+ bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
if self.context.change_justification != 'original':
bs.append('text-align: '+ self.context.change_justification)
body.set('style', '; '.join(bs))
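Because the margins are written into an inline style attribute on <body>, and inline styles take precedence over rules in the input stylesheet, appending explicit zero padding neutralises any horizontal padding the source CSS declares. The resulting style string looks roughly like this (margin values are illustrative):

    bs = []
    bs.append('margin-left : %fpt' % 72.0)
    bs.append('margin-right : %fpt' % 72.0)
    bs.extend(['padding-left: 0pt', 'padding-right: 0pt'])
    print '; '.join(bs)
    # margin-left : 72.000000pt; margin-right : 72.000000pt; padding-left: 0pt; padding-right: 0pt
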
From c5063b8633506f3b661d3e3dcc84d7ec68e74345 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 18:26:51 -0600
Subject: [PATCH 21/43] Fix #6804 (Timeout error when browsing content server
via browser)
---
resources/content_server/gui.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js
index 631fb8b617..d0fb49cc8e 100644
--- a/resources/content_server/gui.js
+++ b/resources/content_server/gui.js
@@ -26,7 +26,7 @@ var current_library_request = null;
////////////////////////////// GET BOOK LIST //////////////////////////////
-var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds
+var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds
function create_table_headers() {
var thead = $('table#book_list thead tr');
From c5415bbe8012179b405f2c3ca3b5258e83a863b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 19:11:38 -0600
Subject: [PATCH 22/43] Fix #6806 (--start-in-tray switch displays hidden
windows in metacity, xfwm4 and compiz)
---
src/calibre/gui2/cover_flow.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py
index 88bbae6c41..cb951b09be 100644
--- a/src/calibre/gui2/cover_flow.py
+++ b/src/calibre/gui2/cover_flow.py
@@ -155,6 +155,7 @@ class CoverFlowMixin(object):
self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser)
if CoverFlow is not None:
self.cover_flow.stop.connect(self.hide_cover_browser)
+ self.cover_flow.setVisible(False)
else:
self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow)
if CoverFlow is not None:
From ba67e47c9260a1f813048ab0239ed78d5324e89a Mon Sep 17 00:00:00 2001
From: GRiker
Date: Mon, 13 Sep 2010 19:12:49 -0600
Subject: [PATCH 23/43] GwR wip book jacket
---
src/calibre/devices/apple/driver.py | 6 +-
src/calibre/ebooks/oeb/transforms/flatcss.py | 16 +-
src/calibre/ebooks/oeb/transforms/jacket.py | 147 +++++++++++++------
src/calibre/library/catalog.py | 4 +
4 files changed, 120 insertions(+), 53 deletions(-)
diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py
index e318d368ff..c9bc04a242 100644
--- a/src/calibre/devices/apple/driver.py
+++ b/src/calibre/devices/apple/driver.py
@@ -2342,8 +2342,10 @@ class ITUNES(DriverBase):
if isosx:
if DEBUG:
self.log.info(" deleting '%s' from iDevice" % cached_book['title'])
- cached_book['dev_book'].delete()
-
+ try:
+ cached_book['dev_book'].delete()
+ except:
+ self.log.error(" error deleting '%s'" % cached_book['title'])
elif iswindows:
hit = self._find_device_book(cached_book)
if hit:
diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py
index f48bdb9934..030c271362 100644
--- a/src/calibre/ebooks/oeb/transforms/flatcss.py
+++ b/src/calibre/ebooks/oeb/transforms/flatcss.py
@@ -146,7 +146,6 @@ class CSSFlattener(object):
extra_css=css)
self.stylizers[item] = stylizer
-
def baseline_node(self, node, stylizer, sizes, csize):
csize = stylizer.style(node)['font-size']
if node.text:
@@ -194,7 +193,7 @@ class CSSFlattener(object):
value = 0.0
cssdict[property] = "%0.5fem" % (value / fsize)
- def flatten_node(self, node, stylizer, names, styles, psize, left=0):
+ def flatten_node(self, node, stylizer, names, styles, psize, item_id, left=0):
if not isinstance(node.tag, basestring) \
or namespace(node.tag) != XHTML_NS:
return
@@ -286,15 +285,18 @@ class CSSFlattener(object):
if self.lineh and 'line-height' not in cssdict:
lineh = self.lineh / psize
cssdict['line-height'] = "%0.5fem" % lineh
+
if (self.context.remove_paragraph_spacing or
self.context.insert_blank_line) and tag in ('p', 'div'):
- for prop in ('margin', 'padding', 'border'):
- for edge in ('top', 'bottom'):
- cssdict['%s-%s'%(prop, edge)] = '0pt'
+ if item_id != 'jacket' or self.context.output_profile.name == 'Kindle':
+ for prop in ('margin', 'padding', 'border'):
+ for edge in ('top', 'bottom'):
+ cssdict['%s-%s'%(prop, edge)] = '0pt'
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em'
if self.context.remove_paragraph_spacing:
cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size
+
if cssdict:
items = cssdict.items()
items.sort()
@@ -313,7 +315,7 @@ class CSSFlattener(object):
if 'style' in node.attrib:
del node.attrib['style']
for child in node:
- self.flatten_node(child, stylizer, names, styles, psize, left)
+ self.flatten_node(child, stylizer, names, styles, psize, item_id, left)
def flatten_head(self, item, stylizer, href):
html = item.data
@@ -360,7 +362,7 @@ class CSSFlattener(object):
stylizer = self.stylizers[item]
body = html.find(XHTML('body'))
fsize = self.context.dest.fbase
- self.flatten_node(body, stylizer, names, styles, fsize)
+ self.flatten_node(body, stylizer, names, styles, fsize, item.id)
items = [(key, val) for (val, key) in styles.items()]
items.sort()
css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)
diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py
index 030067850c..309a7fd7b6 100644
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@@ -13,6 +13,9 @@ from itertools import repeat
from lxml import etree
from calibre import guess_type, strftime
+from calibre.constants import __appname__, __version__
+from calibre.utils.date import now
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.oeb.base import XPath, XPNSMAP
from calibre.library.comments import comments_to_html
class Jacket(object):
@@ -29,13 +32,30 @@ class Jacket(object):
-
-
%(title)s
-
%(series)s
-
%(rating)s
-
%(tags)s
+
+
%(title)s
+
+
+
Series:
+
%(series)s
+
+
+
Published:
+
%(pubdate)s
+
+
+
Rating:
+
%(rating)s
+
+
+
Tags:
+
%(tags)s
+
+
+
-
%(comments)s
+
+
%(comments)s
''')
@@ -56,7 +76,7 @@ class Jacket(object):
def get_rating(self, rating):
ans = ''
if rating is None:
- return
+ return ans
try:
num = float(rating)/2
except:
@@ -65,19 +85,54 @@ class Jacket(object):
num = min(num, 5)
if num < 1:
return ans
- id, href = self.oeb.manifest.generate('star', 'star.png')
- self.oeb.manifest.add(id, href, 'image/png', data=I('star.png', data=True))
- ans = 'Rating: ' + ''.join(repeat(''%href, num))
+ if self.opts.output_profile.name == 'Kindle':
+ ans = '%s' % ''.join(repeat('★', num))
+ else:
+ id, href = self.oeb.manifest.generate('star', 'star.png')
+ self.oeb.manifest.add(id, href, 'image/png', data=I('star.png', data=True))
+ ans = '%s' % ''.join(repeat(''%href, num))
return ans
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')
jacket_resources = P("jacket")
- if os.path.isdir(jacket_resources):
- stylesheet = os.path.join(jacket_resources, 'stylesheet.css')
- with open(stylesheet) as f:
- css_data = f.read()
+ css_data = ''
+ stylesheet = os.path.join(jacket_resources, 'stylesheet.css')
+ with open(stylesheet) as f:
+ css = f.read()
+
+ try:
+ title_str = mi.title if mi.title else unicode(self.oeb.metadata.title[0])
+ except:
+ title_str = _('Unknown')
+ title = '%s' % (escape(title_str))
+
+ series = escape(mi.series if mi.series else '')
+ if mi.series and mi.series_index is not None:
+ series += escape(' [%s]'%mi.format_series_index())
+ if not mi.series:
+ series = ''
+
+ try:
+ pubdate = strftime(u'%Y', mi.pubdate.timetuple())
+ except:
+ #pubdate = strftime(u'%Y', now())
+ pubdate = ''
+
+ rating = self.get_rating(mi.rating)
+
+ tags = mi.tags
+ if not tags:
+ try:
+ tags = map(unicode, self.oeb.metadata.subject)
+ except:
+ tags = []
+ if tags:
+ #tags = self.opts.dest.tags_to_string(tags)
+ tags = ', '.join(tags)
+ else:
+ tags = ''
comments = mi.comments
if not comments:
@@ -91,46 +146,50 @@ class Jacket(object):
if comments:
comments = comments_to_html(comments)
- series = 'Series: %s' % escape(mi.series if mi.series else '')
- if mi.series and mi.series_index is not None:
- series += '%s' % escape(' [%s]'%mi.format_series_index())
- if not mi.series:
- series = ''
-
- tags = mi.tags
- if not tags:
- try:
- tags = map(unicode, self.oeb.metadata.subject)
- except:
- tags = []
- if tags:
- tags = 'Tags:%s' % self.opts.dest.tags_to_string(tags)
- else:
- tags = ''
-
- try:
- title_str = mi.title if mi.title else unicode(self.oeb.metadata.title[0])
- except:
- title_str = _('Unknown')
- title = '%s (%s)' % (escape(title_str), strftime(u'%Y', mi.pubdate.timetuple()))
-
+ footer = 'BOOK JACKET GENERATED BY %s %s' % (__appname__.upper(),__version__)
def generate_html(comments):
- return self.JACKET_TEMPLATE%dict(xmlns=XPNSMAP['h'],
- title=title, comments=comments,
- series=series,
- tags=tags, rating=self.get_rating(mi.rating),
- css=css_data, title_str=title_str)
+ args = dict(xmlns=XPNSMAP['h'],
+ title_str=title_str,
+ css=css,
+ title=title,
+ pubdate=pubdate,
+ series=series,
+ rating=rating,
+ tags=tags,
+ comments=comments,
+ footer = footer)
+
+ # Post-process the generated html to strip out empty header items
+ generated_html = self.JACKET_TEMPLATE % args
+ soup = BeautifulSoup(generated_html)
+ if not series:
+ series_tag = soup.find('tr', attrs={'class':'cbj_series'})
+ series_tag.extract()
+ if not rating:
+ rating_tag = soup.find('tr', attrs={'class':'cbj_rating'})
+ rating_tag.extract()
+ if not tags:
+ tags_tag = soup.find('tr', attrs={'class':'cbj_tags'})
+ tags_tag.extract()
+ if not pubdate:
+ pubdate_tag = soup.find('tr', attrs={'class':'cbj_pubdate'})
+ pubdate_tag.extract()
+ if self.opts.output_profile.name != 'Kindle':
+ hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
+ hr_tag.extract()
+
+ return soup.renderContents()
id, href = self.oeb.manifest.generate('jacket', 'jacket.xhtml')
from calibre.ebooks.oeb.base import RECOVER_PARSER, XPath
+
try:
root = etree.fromstring(generate_html(comments), parser=RECOVER_PARSER)
-# print "root: %s" % etree.tostring(root, encoding='utf-8',
-# xml_declaration=True, pretty_print=True)
except:
root = etree.fromstring(generate_html(escape(orig_comments)),
parser=RECOVER_PARSER)
+
jacket = XPath('//h:meta[@name="calibre-content" and @content="jacket"]')
found = None
for item in list(self.oeb.spine)[:4]:
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index bd2160aff1..ef7569bd88 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -2523,6 +2523,10 @@ class EPUB_MOBI(CatalogPlugin):
# Fetch the database as a dictionary
self.booksBySeries = self.plugin.search_sort_db(self.db, self.opts)
+ if not self.booksBySeries:
+ self.opts.generate_series = False
+ self.opts.log(" no series found in selected books, cancelling series generation")
+ return
friendly_name = "Series"
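The rewritten jacket generator renders the full template first and then prunes the rows for missing metadata by parsing the result with BeautifulSoup, rather than assembling conditional snippets up front. A reduced sketch of that post-processing step (the markup below is a stand-in for the real jacket template; only the pruning pattern matters):

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    generated_html = '''<table>
    <tr class="cbj_series"><td>Series:</td><td></td></tr>
    <tr class="cbj_rating"><td>Rating:</td><td>***</td></tr>
    </table>'''

    soup = BeautifulSoup(generated_html)
    series = ''                       # empty metadata, so drop its row
    if not series:
        soup.find('tr', attrs={'class':'cbj_series'}).extract()
    print soup.renderContents()       # only the rating row remains
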
From 6a3609f031bb9400630cd6418b278903a4883c8a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 13 Sep 2010 19:58:22 -0600
Subject: [PATCH 24/43] Implement #6808 (Feature request: ability to convert
all single/double quotes to "smart quotes")
---
src/calibre/ebooks/conversion/cli.py | 2 +-
src/calibre/ebooks/conversion/plumber.py | 8 +
src/calibre/ebooks/conversion/preprocess.py | 23 +-
src/calibre/gui2/convert/look_and_feel.py | 2 +-
src/calibre/gui2/convert/look_and_feel.ui | 9 +-
src/calibre/utils/smartypants.py | 899 ++++++++++++++++++++
6 files changed, 933 insertions(+), 10 deletions(-)
create mode 100755 src/calibre/utils/smartypants.py
diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 7439718cf6..2ef633d0bb 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber):
'font_size_mapping',
'line_height',
'linearize_tables',
- 'extra_css',
+ 'extra_css', 'smarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 24b35f804f..16282dd28d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html',
)
),
+OptionRecommendation(name='smarten_punctuation',
+ recommended_value=False, level=OptionRecommendation.LOW,
+ help=_('Convert plain quotes, dashes and ellipsis to their '
+ 'typographically correct equivalents. For details, see '
+ 'http://daringfireball.net/projects/smartypants'
+ )
+ ),
+
OptionRecommendation(name='remove_header',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use a regular expression to try and remove the header.'
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 7742a20a21..4538af96c4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -221,7 +221,7 @@ class HTMLPreProcessor(object):
(re.compile(u'˛\s*()*\s*A', re.UNICODE), lambda match: u'Ą'),
(re.compile(u'˛\s*()*\s*e', re.UNICODE), lambda match: u'ę'),
(re.compile(u'˛\s*()*\s*E', re.UNICODE), lambda match: u'Ę'),
-
+
# ˙
(re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'),
@@ -244,14 +244,14 @@ class HTMLPreProcessor(object):
(re.compile(r' \s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*([ibu]>){0,2})\s*( \s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s* )?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r' \s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*( \s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*( ))?'), chap_head),
-
+
# Have paragraphs show better
                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
+ (re.compile(r'(?=\w)'), lambda match: ' '),
]
# Fix Book Designer markup
@@ -328,7 +328,7 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
-
+
# unwrap hyphenation - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap visible dashes and hyphens - don't delete they are often hyphens for
@@ -338,13 +338,13 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'[](\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
-
- # Make the more aggressive chapter marking regex optional with the preprocess option to
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
end_rules.append((re.compile(r'
)?'), chap_head),)
-
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
@@ -401,5 +401,14 @@ class HTMLPreProcessor(object):
if self.plugin_preprocess:
html = self.input_plugin_preprocess(html)
+ if getattr(self.extra_opts, 'smarten_punctuation', False):
+ html = self.smarten_punctuation(html)
+
return html
+ def smarten_punctuation(self, html):
+ from calibre.utils.smartypants import smartyPants
+ from calibre.ebooks.chardet import substitute_entites
+ html = smartyPants(html)
+ return substitute_entites(html)
+
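smarten_punctuation() is a thin wrapper: run the HTML through the bundled smartypants module, then normalise the entities it emits. A hedged usage sketch of the two calls introduced above (the sample string is made up; exact output depends on substitute_entites):

    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites

    html = '<p>"Hello" -- it\'s a test...</p>'
    html = smartyPants(html)          # straight quotes, dashes, ellipsis -> HTML entities
    html = substitute_entites(html)   # entities -> their unicode characters
    print html
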
diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py
index b0403bf1dd..ec3f0b944d 100644
--- a/src/calibre/gui2/convert/look_and_feel.py
+++ b/src/calibre/gui2/convert/look_and_feel.py
@@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form):
Widget.__init__(self, parent,
['change_justification', 'extra_css', 'base_font_size',
'font_size_mapping', 'line_height',
- 'linearize_tables',
+ 'linearize_tables', 'smarten_punctuation',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding',
'asciiize', 'keep_ligatures']
diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui
index de48e7caf9..c683300854 100644
--- a/src/calibre/gui2/convert/look_and_feel.ui
+++ b/src/calibre/gui2/convert/look_and_feel.ui
@@ -178,7 +178,7 @@
-
+ Extra &CSS
@@ -214,6 +214,13 @@
+
+
+
+ Smarten &punctuation
+
+
+
diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py
new file mode 100755
index 0000000000..44aac4de8c
--- /dev/null
+++ b/src/calibre/utils/smartypants.py
@@ -0,0 +1,899 @@
+#!/usr/bin/python
+
+r"""
+==============
+smartypants.py
+==============
+
+----------------------------
+SmartyPants ported to Python
+----------------------------
+
+Ported by `Chad Miller`_
+Copyright (c) 2004, 2007 Chad Miller
+
+original `SmartyPants`_ by `John Gruber`_
+Copyright (c) 2003 John Gruber
+
+
+Synopsis
+========
+
+A smart-quotes plugin for Pyblosxom_.
+
+The original "SmartyPants" is a free web publishing plug-in for Movable Type,
+Blosxom, and BBEdit that easily translates plain ASCII punctuation characters
+into "smart" typographic punctuation HTML entities.
+
+This software, *smartypants.py*, endeavours to be a functional port of
+SmartyPants to Python, for use with Pyblosxom_.
+
+
+Description
+===========
+
+SmartyPants can perform the following transformations:
+
+- Straight quotes ( " and ' ) into "curly" quote HTML entities
+- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities
+- Dashes (``--`` and ``---``) into en- and em-dash entities
+- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
+
+This means you can write, edit, and save your posts using plain old
+ASCII straight quotes, plain dashes, and plain dots, but your published
+posts (and final HTML output) will appear with smart quotes, em-dashes,
+and proper ellipses.
+
+SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
+``<script>``, or ``<math>`` tag blocks.
-
+ Insert &metadata as page at start of book
-
-
-
- &Preprocess input file to possibly improve structure detection
-
-
-
-
+
-
+ Qt::Vertical
@@ -71,26 +64,33 @@
-
+ Remove F&ooter
-
+ Remove H&eader
-
+
-
+
+
+
+
+ &Preprocess input file to possibly improve structure detection
+
+
+
From 30fafed01710d563a6aaf67b12be0f7db189f4f2 Mon Sep 17 00:00:00 2001
From: GRiker
Date: Wed, 15 Sep 2010 10:12:04 -0600
Subject: [PATCH 37/43] GwR fix #6822
---
src/calibre/library/catalog.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py
index ef7569bd88..e14d092727 100644
--- a/src/calibre/library/catalog.py
+++ b/src/calibre/library/catalog.py
@@ -2590,7 +2590,7 @@ class EPUB_MOBI(CatalogPlugin):
aTag = Tag(soup, 'a')
aTag['name'] = "%s_series" % re.sub('\W','',book['series']).lower()
pSeriesTag.insert(0,aTag)
- pSeriesTag.insert(1,NavigableString(self.NOT_READ_SYMBOL + '%s' % book['series']))
+ pSeriesTag.insert(1,NavigableString('%s' % book['series']))
divTag.insert(dtc,pSeriesTag)
dtc += 1
@@ -2599,7 +2599,14 @@ class EPUB_MOBI(CatalogPlugin):
ptc = 0
# book with read/reading/unread symbol
- if 'read' in book and book['read']:
+ for tag in book['tags']:
+ if tag == self.opts.read_tag:
+ book['read'] = True
+ break
+ else:
+ book['read'] = False
+
+ if book['read']:
# check mark
pBookTag.insert(ptc,NavigableString(self.READ_SYMBOL))
pBookTag['class'] = "read_book"
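The for/else loop above sets book['read'] to True only when one of the book's tags equals the configured read tag. An equivalent, more compact formulation (shown only to clarify the logic, not as a proposed change):

    book['read'] = any(tag == self.opts.read_tag for tag in book['tags'])
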
From a20015e1e7b656a47fd87d474c08bdaef61b0bae Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 11:09:40 -0600
Subject: [PATCH 38/43] Workaround for bug that affects some windows installs,
 causing white backgrounds on default covers to be rendered as yellow
---
src/calibre/utils/magick/__init__.py | 2 +-
src/calibre/utils/magick/draw.py | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/calibre/utils/magick/__init__.py b/src/calibre/utils/magick/__init__.py
index 073a030361..2707430c67 100644
--- a/src/calibre/utils/magick/__init__.py
+++ b/src/calibre/utils/magick/__init__.py
@@ -194,7 +194,7 @@ class Image(_magick.Image): # {{{
# }}}
-def create_canvas(width, height, bgcolor='white'):
+def create_canvas(width, height, bgcolor='#ffffff'):
canvas = Image()
canvas.create_canvas(int(width), int(height), str(bgcolor))
return canvas
diff --git a/src/calibre/utils/magick/draw.py b/src/calibre/utils/magick/draw.py
index 301bf9912a..ed9e3d3d83 100644
--- a/src/calibre/utils/magick/draw.py
+++ b/src/calibre/utils/magick/draw.py
@@ -11,7 +11,7 @@ from calibre.utils.magick import Image, DrawingWand, create_canvas
from calibre.constants import __appname__, __version__
from calibre import fit_image
-def save_cover_data_to(data, path, bgcolor='white', resize_to=None,
+def save_cover_data_to(data, path, bgcolor='#ffffff', resize_to=None,
return_data=False):
'''
Saves image in data to path, in the format specified by the path
@@ -28,7 +28,7 @@ def save_cover_data_to(data, path, bgcolor='white', resize_to=None,
return canvas.export(os.path.splitext(path)[1][1:])
canvas.save(path)
-def thumbnail(data, width=120, height=120, bgcolor='white', fmt='jpg'):
+def thumbnail(data, width=120, height=120, bgcolor='#ffffff', fmt='jpg'):
img = Image()
img.load(data)
owidth, oheight = img.size
@@ -61,7 +61,7 @@ def identify(path):
return identify_data(data)
def add_borders_to_image(path_to_image, left=0, top=0, right=0, bottom=0,
- border_color='white'):
+ border_color='#ffffff'):
img = Image()
img.open(path_to_image)
lwidth, lheight = img.size
@@ -80,7 +80,7 @@ def create_text_wand(font_size, font_path=None):
ans.text_alias = True
return ans
-def create_text_arc(text, font_size, font=None, bgcolor='white'):
+def create_text_arc(text, font_size, font=None, bgcolor='#ffffff'):
if isinstance(text, unicode):
text = text.encode('utf-8')
@@ -148,7 +148,7 @@ class TextLine(object):
def create_cover_page(top_lines, logo_path, width=590, height=750,
- bgcolor='white', output_format='jpg'):
+ bgcolor='#ffffff', output_format='jpg'):
'''
Create the standard calibre cover page and return it as a byte string in
the specified output_format.
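Every canvas-creating helper now defaults to the explicit '#ffffff' instead of the named color 'white', which some Windows ImageMagick builds resolved incorrectly. Callers that pass a background color explicitly should do the same, e.g. (file names here are placeholders):

    from calibre.utils.magick.draw import save_cover_data_to

    data = open('input_cover.png', 'rb').read()
    # pad/convert to JPEG on an explicit white background, not the named color
    save_cover_data_to(data, 'cover.jpg', bgcolor='#ffffff')
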
From 57ca76e68efb7c3f615d948231ac741e60251dd1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 11:12:53 -0600
Subject: [PATCH 39/43] ...
---
src/calibre/web/feeds/news.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 9ba9583c73..a140dfbf05 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -290,10 +290,12 @@ class BasicNewsRecipe(Recipe):
#: the cover for the periodical. Overriding this in your recipe instructs
#: calibre to render the downloaded cover into a frame whose width and height
#: are expressed as a percentage of the downloaded cover.
- #: cover_margins = (10,15,'white') pads the cover with a white margin
+ #: cover_margins = (10, 15, '#ffffff') pads the cover with a white margin
#: 10px on the left and right, 15px on the top and bottom.
- #: Colors name defined at http://www.imagemagick.org/script/color.php
- cover_margins = (0,0,'white')
+ #: Color names defined at http://www.imagemagick.org/script/color.php
+ #: Note that for some reason, white does not always work on windows. Use
+ #: #ffffff instead
+ cover_margins = (0, 0, '#ffffff')
#: Set to a non empty string to disable this recipe
#: The string will be used as the disabled message
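In a recipe, the documented setting looks like this (the recipe class is hypothetical; the values are the ones from the docstring example):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        # pad the downloaded cover: 10px left/right, 15px top/bottom,
        # on a white background; use '#ffffff' rather than 'white' on windows
        cover_margins = (10, 15, '#ffffff')
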
From c006e2e14bebef07898a934bdb6225ea14b6280f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 11:27:39 -0600
Subject: [PATCH 40/43] Database: Update has_cover cache when setting/removing
covers so that the search returns correct results. Also fix an exception that
could occur when adding books with a db that has been upgraded from very old
SQL.
---
src/calibre/library/database2.py | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 2df6b3bdc4..f5f0f724ba 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -598,7 +598,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
def has_cover(self, index, index_is_id=False):
id = index if index_is_id else self.id(index)
- path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg')
+ try:
+ path = os.path.join(self.abspath(id, index_is_id=True), 'cover.jpg')
+ except:
+ # Can happen if path has not yet been set
+ return False
return os.access(path, os.R_OK)
def remove_cover(self, id, notify=True):
@@ -609,6 +613,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
except (IOError, OSError):
time.sleep(0.2)
os.remove(path)
+ self.data.set(id, self.FIELD_MAP['cover'], False, row_is_id=True)
if notify:
self.notify('cover', [id])
@@ -629,6 +634,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
except (IOError, OSError):
time.sleep(0.2)
save_cover_data_to(data, path)
+ self.data.set(id, self.FIELD_MAP['cover'], True, row_is_id=True)
if notify:
self.notify('cover', [id])
@@ -1087,8 +1093,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.set_path(id, True)
self.notify('metadata', [id])
- # Given a book, return the list of author sort strings for the book's authors
def authors_sort_strings(self, id, index_is_id=False):
+ '''
+ Given a book, return the list of author sort strings
+ for the book's authors
+ '''
id = id if index_is_id else self.id(id)
aut_strings = self.conn.get('''
SELECT sort
@@ -1744,10 +1753,10 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
series_index = 1.0 if mi.series_index is None else mi.series_index
aus = mi.author_sort if mi.author_sort else self.author_sort_from_authors(mi.authors)
title = mi.title
- if isinstance(aus, str):
+ if isbytestring(aus):
aus = aus.decode(preferred_encoding, 'replace')
- if isinstance(title, str):
- title = title.decode(preferred_encoding)
+ if isbytestring(title):
+ title = title.decode(preferred_encoding, 'replace')
obj = self.conn.execute('INSERT INTO books(title, series_index, author_sort) VALUES (?, ?, ?)',
(title, series_index, aus))
id = obj.lastrowid
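Two small robustness fixes share this hunk: has_cover() now tolerates books whose path is not set yet, and byte-string titles or author sorts coming out of an old database are detected with isbytestring() and decoded with 'replace', so the INSERT cannot die on a UnicodeDecodeError. The decoding pattern in isolation (preferred_encoding is calibre's locale-derived default, hard-coded here for the sketch):

    preferred_encoding = 'utf-8'   # calibre derives this from the user's locale

    def to_unicode(val):
        # never let undecodable bytes abort the insert; replace bad characters
        if isinstance(val, bytes):
            return val.decode(preferred_encoding, 'replace')
        return val
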
From 6bbbb0a1f57635d8d5aae6398f27914e20d333e6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 12:19:26 -0600
Subject: [PATCH 41/43] Fix #6819 (Not recognising New sony PRS 650)
---
src/calibre/devices/prs505/driver.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/calibre/devices/prs505/driver.py b/src/calibre/devices/prs505/driver.py
index 4c14565c2d..094c12cf0c 100644
--- a/src/calibre/devices/prs505/driver.py
+++ b/src/calibre/devices/prs505/driver.py
@@ -35,7 +35,7 @@ class PRS505(USBMS):
VENDOR_NAME = 'SONY'
WINDOWS_MAIN_MEM = re.compile(
- r'(PRS-(505|300|500))|'
+ r'(PRS-(505|500))|'
r'(PRS-((700[#/])|((6|9|3)(0|5)0&)))'
)
WINDOWS_CARD_A_MEM = re.compile(
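A quick way to see what PATCH 41 changes is to run the new pattern against a few device-ID fragments; the strings below are illustrative, not real Windows PnP identifiers:

    import re

    # Same pattern as the patched WINDOWS_MAIN_MEM above
    pat = re.compile(
        r'(PRS-(505|500))|'
        r'(PRS-((700[#/])|((6|9|3)(0|5)0&)))'
    )

    for frag in ('PRS-505', 'PRS-650&', 'PRS-950&', 'PRS-350&', 'PRS-700#'):
        print(frag + ' -> ' + str(bool(pat.search(frag))))
    # All five fragments match; the 650 case is what closes #6819.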
From 062d369b43b435991d0b140bdba6217c0b5b0ccf Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 13:22:05 -0600
Subject: [PATCH 42/43] ...
---
src/calibre/gui2/device.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py
index 45c78ce6da..f839e1d519 100644
--- a/src/calibre/gui2/device.py
+++ b/src/calibre/gui2/device.py
@@ -627,12 +627,11 @@ class DeviceMixin(object): # {{{
def connect_to_folder(self):
dir = choose_dir(self, 'Select Device Folder',
_('Select folder to open as device'))
- kls = FOLDER_DEVICE
- self.device_manager.mount_device(kls=kls, kind='folder', path=dir)
+ if dir is not None:
+ self.device_manager.mount_device(kls=FOLDER_DEVICE, kind='folder', path=dir)
def connect_to_itunes(self):
- kls = ITUNES_ASYNC
- self.device_manager.mount_device(kls=kls, kind='itunes', path=None)
+ self.device_manager.mount_device(kls=ITUNES_ASYNC, kind='itunes', path=None)
# disconnect from both folder and itunes devices
def disconnect_mounted_device(self):
From ff319ccc4daae0b1d01ccab78733412fb4edb53f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 15 Sep 2010 14:05:47 -0600
Subject: [PATCH 43/43] Add an option to split the toolbar into two toolbars
---
src/calibre/gui2/__init__.py | 1 +
src/calibre/gui2/actions/__init__.py | 6 +++
src/calibre/gui2/actions/add.py | 1 +
src/calibre/gui2/actions/add_to_library.py | 1 +
src/calibre/gui2/actions/annotate.py | 1 +
src/calibre/gui2/actions/convert.py | 1 +
src/calibre/gui2/actions/copy_to_library.py | 1 +
src/calibre/gui2/actions/delete.py | 1 +
src/calibre/gui2/actions/edit_collections.py | 1 +
src/calibre/gui2/actions/edit_metadata.py | 1 +
src/calibre/gui2/actions/open.py | 1 +
src/calibre/gui2/actions/save_to_disk.py | 1 +
src/calibre/gui2/actions/show_book_details.py | 1 +
src/calibre/gui2/actions/similar_books.py | 1 +
src/calibre/gui2/actions/view.py | 1 +
src/calibre/gui2/layout.py | 43 ++++++++++++++++---
src/calibre/gui2/preferences/look_feel.py | 1 +
src/calibre/gui2/preferences/look_feel.ui | 7 +++
18 files changed, 65 insertions(+), 6 deletions(-)
diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py
index 1b61404589..e58dce5559 100644
--- a/src/calibre/gui2/__init__.py
+++ b/src/calibre/gui2/__init__.py
@@ -50,6 +50,7 @@ gprefs.defaults['action-layout-context-menu-device'] = (
gprefs.defaults['show_splash_screen'] = True
gprefs.defaults['toolbar_icon_size'] = 'medium'
gprefs.defaults['toolbar_text'] = 'auto'
+gprefs.defaults['show_child_bar'] = False
# }}}
diff --git a/src/calibre/gui2/actions/__init__.py b/src/calibre/gui2/actions/__init__.py
index 57ad900fba..b2d1656367 100644
--- a/src/calibre/gui2/actions/__init__.py
+++ b/src/calibre/gui2/actions/__init__.py
@@ -71,6 +71,12 @@ class InterfaceAction(QObject):
all_locations = frozenset(['toolbar', 'toolbar-device', 'context-menu',
'context-menu-device'])
+ #: Type of action
+ #: 'current' means acts on the current view
+ #: 'global' means an action that does not act on the current view, but rather
+ #: on calibre as a whole
+ action_type = 'global'
+
def __init__(self, parent, site_customization):
QObject.__init__(self, parent)
self.setObjectName(self.name)
diff --git a/src/calibre/gui2/actions/add.py b/src/calibre/gui2/actions/add.py
index f0ff794fab..add7bf1d5b 100644
--- a/src/calibre/gui2/actions/add.py
+++ b/src/calibre/gui2/actions/add.py
@@ -25,6 +25,7 @@ class AddAction(InterfaceAction):
action_spec = (_('Add books'), 'add_book.png',
_('Add books to the calibre library/device from files on your computer')
, _('A'))
+ action_type = 'current'
def genesis(self):
self._add_filesystem_book = self.Dispatcher(self.__add_filesystem_book)
diff --git a/src/calibre/gui2/actions/add_to_library.py b/src/calibre/gui2/actions/add_to_library.py
index 6fc0d5fb1f..05aea8f1dd 100644
--- a/src/calibre/gui2/actions/add_to_library.py
+++ b/src/calibre/gui2/actions/add_to_library.py
@@ -13,6 +13,7 @@ class AddToLibraryAction(InterfaceAction):
action_spec = (_('Add books to library'), 'add_book.png',
_('Add books to your calibre library from the connected device'), None)
dont_add_to = frozenset(['toolbar', 'context-menu'])
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.add_books_to_library)
diff --git a/src/calibre/gui2/actions/annotate.py b/src/calibre/gui2/actions/annotate.py
index 5356d63e98..dfafcd1a39 100644
--- a/src/calibre/gui2/actions/annotate.py
+++ b/src/calibre/gui2/actions/annotate.py
@@ -18,6 +18,7 @@ class FetchAnnotationsAction(InterfaceAction):
name = 'Fetch Annotations'
action_spec = (_('Fetch annotations (experimental)'), None, None, None)
+ action_type = 'current'
def genesis(self):
pass
diff --git a/src/calibre/gui2/actions/convert.py b/src/calibre/gui2/actions/convert.py
index ee0f06ab71..29acfc52b1 100644
--- a/src/calibre/gui2/actions/convert.py
+++ b/src/calibre/gui2/actions/convert.py
@@ -21,6 +21,7 @@ class ConvertAction(InterfaceAction):
name = 'Convert Books'
action_spec = (_('Convert books'), 'convert.png', None, _('C'))
dont_add_to = frozenset(['toolbar-device', 'context-menu-device'])
+ action_type = 'current'
def genesis(self):
cm = QMenu()
diff --git a/src/calibre/gui2/actions/copy_to_library.py b/src/calibre/gui2/actions/copy_to_library.py
index 7127c91e8c..6b7654f644 100644
--- a/src/calibre/gui2/actions/copy_to_library.py
+++ b/src/calibre/gui2/actions/copy_to_library.py
@@ -80,6 +80,7 @@ class CopyToLibraryAction(InterfaceAction):
_('Copy selected books to the specified library'), None)
popup_type = QToolButton.InstantPopup
dont_add_to = frozenset(['toolbar-device', 'context-menu-device'])
+ action_type = 'current'
def genesis(self):
self.menu = QMenu(self.gui)
diff --git a/src/calibre/gui2/actions/delete.py b/src/calibre/gui2/actions/delete.py
index 0343c6df84..406860e4ec 100644
--- a/src/calibre/gui2/actions/delete.py
+++ b/src/calibre/gui2/actions/delete.py
@@ -16,6 +16,7 @@ class DeleteAction(InterfaceAction):
name = 'Remove Books'
action_spec = (_('Remove books'), 'trash.png', None, _('Del'))
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.delete_books)
diff --git a/src/calibre/gui2/actions/edit_collections.py b/src/calibre/gui2/actions/edit_collections.py
index e45d36fc62..7f5dd76538 100644
--- a/src/calibre/gui2/actions/edit_collections.py
+++ b/src/calibre/gui2/actions/edit_collections.py
@@ -13,6 +13,7 @@ class EditCollectionsAction(InterfaceAction):
action_spec = (_('Manage collections'), None,
_('Manage the collections on this device'), None)
dont_add_to = frozenset(['toolbar', 'context-menu'])
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.edit_collections)
diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index 878ba77a43..ac04652efa 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -22,6 +22,7 @@ class EditMetadataAction(InterfaceAction):
name = 'Edit Metadata'
action_spec = (_('Edit metadata'), 'edit_input.png', None, _('E'))
+ action_type = 'current'
def genesis(self):
self.create_action(spec=(_('Merge book records'), 'merge_books.png',
diff --git a/src/calibre/gui2/actions/open.py b/src/calibre/gui2/actions/open.py
index 106bfa24f6..141ff01a66 100644
--- a/src/calibre/gui2/actions/open.py
+++ b/src/calibre/gui2/actions/open.py
@@ -14,6 +14,7 @@ class OpenFolderAction(InterfaceAction):
action_spec = (_('Open containing folder'), 'document_open.png', None,
_('O'))
dont_add_to = frozenset(['toolbar-device', 'context-menu-device'])
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.gui.iactions['View'].view_folder)
diff --git a/src/calibre/gui2/actions/save_to_disk.py b/src/calibre/gui2/actions/save_to_disk.py
index bfcc02e130..e9664b9980 100644
--- a/src/calibre/gui2/actions/save_to_disk.py
+++ b/src/calibre/gui2/actions/save_to_disk.py
@@ -38,6 +38,7 @@ class SaveToDiskAction(InterfaceAction):
name = "Save To Disk"
action_spec = (_('Save to disk'), 'save.png', None, _('S'))
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.save_to_disk)
diff --git a/src/calibre/gui2/actions/show_book_details.py b/src/calibre/gui2/actions/show_book_details.py
index d17d0998f1..18b0a694bf 100644
--- a/src/calibre/gui2/actions/show_book_details.py
+++ b/src/calibre/gui2/actions/show_book_details.py
@@ -16,6 +16,7 @@ class ShowBookDetailsAction(InterfaceAction):
action_spec = (_('Show book details'), 'dialog_information.png', None,
_('I'))
dont_add_to = frozenset(['toolbar-device', 'context-menu-device'])
+ action_type = 'current'
def genesis(self):
self.qaction.triggered.connect(self.show_book_info)
diff --git a/src/calibre/gui2/actions/similar_books.py b/src/calibre/gui2/actions/similar_books.py
index 1a14869a9c..644cd3160a 100644
--- a/src/calibre/gui2/actions/similar_books.py
+++ b/src/calibre/gui2/actions/similar_books.py
@@ -16,6 +16,7 @@ class SimilarBooksAction(InterfaceAction):
name = 'Similar Books'
action_spec = (_('Similar books...'), None, None, None)
popup_type = QToolButton.InstantPopup
+ action_type = 'current'
def genesis(self):
m = QMenu(self.gui)
diff --git a/src/calibre/gui2/actions/view.py b/src/calibre/gui2/actions/view.py
index 2f6be24e5b..0fbf86c567 100644
--- a/src/calibre/gui2/actions/view.py
+++ b/src/calibre/gui2/actions/view.py
@@ -22,6 +22,7 @@ class ViewAction(InterfaceAction):
name = 'View'
action_spec = (_('View'), 'view.png', None, _('V'))
+ action_type = 'current'
def genesis(self):
self.persistent_files = []
diff --git a/src/calibre/gui2/layout.py b/src/calibre/gui2/layout.py
index 58d5267c8e..ec7e023dc1 100644
--- a/src/calibre/gui2/layout.py
+++ b/src/calibre/gui2/layout.py
@@ -61,7 +61,7 @@ class LocationManager(QObject): # {{{
ac('library', _('Library'), 'lt.png',
_('Show books in calibre library'))
- ac('main', _('Reader'), 'reader.png',
+ ac('main', _('Device'), 'reader.png',
_('Show books in the main memory of the device'))
ac('carda', _('Card A'), 'sd.png',
_('Show books in storage card A'))
@@ -197,11 +197,21 @@ class SearchBar(QWidget): # {{{
# }}}
+class Spacer(QWidget):
+
+ def __init__(self, parent):
+ QWidget.__init__(self, parent)
+ self.l = QHBoxLayout()
+ self.setLayout(self.l)
+ self.l.addStretch(10)
+
+
class ToolBar(QToolBar): # {{{
- def __init__(self, donate, location_manager, parent):
+ def __init__(self, donate, location_manager, child_bar, parent):
QToolBar.__init__(self, parent)
self.gui = parent
+ self.child_bar = child_bar
self.setContextMenuPolicy(Qt.PreventContextMenu)
self.setMovable(False)
self.setFloatable(False)
@@ -223,16 +233,19 @@ class ToolBar(QToolBar): # {{{
sz = gprefs['toolbar_icon_size']
sz = {'small':24, 'medium':48, 'large':64}[sz]
self.setIconSize(QSize(sz, sz))
+ self.child_bar.setIconSize(QSize(sz, sz))
style = Qt.ToolButtonTextUnderIcon
if gprefs['toolbar_text'] == 'never':
style = Qt.ToolButtonIconOnly
self.setToolButtonStyle(style)
+ self.child_bar.setToolButtonStyle(style)
self.donate_button.set_normal_icon_size(sz, sz)
def contextMenuEvent(self, *args):
pass
def build_bar(self):
+ self.child_bar.setVisible(gprefs['show_child_bar'])
self.showing_donate = False
showing_device = self.location_manager.has_device
actions = '-device' if showing_device else ''
@@ -244,10 +257,16 @@ class ToolBar(QToolBar): # {{{
m.setVisible(False)
self.clear()
+ self.child_bar.clear()
self.added_actions = []
+ self.spacers = [Spacer(self.child_bar), Spacer(self.child_bar),
+ Spacer(self), Spacer(self)]
+ self.child_bar.addWidget(self.spacers[0])
+ if gprefs['show_child_bar']:
+ self.addWidget(self.spacers[2])
for what in actions:
- if what is None:
+ if what is None and not gprefs['show_child_bar']:
self.addSeparator()
elif what == 'Location Manager':
for ac in self.location_manager.available_actions:
@@ -262,12 +281,21 @@ class ToolBar(QToolBar): # {{{
self.showing_donate = True
elif what in self.gui.iactions:
action = self.gui.iactions[what]
- self.addAction(action.qaction)
+ bar = self
+ if action.action_type == 'current' and gprefs['show_child_bar']:
+ bar = self.child_bar
+ bar.addAction(action.qaction)
self.added_actions.append(action.qaction)
self.setup_tool_button(action.qaction, action.popup_type)
+ self.child_bar.addWidget(self.spacers[1])
+ if gprefs['show_child_bar']:
+ self.addWidget(self.spacers[3])
+
def setup_tool_button(self, ac, menu_mode=None):
ch = self.widgetForAction(ac)
+ if ch is None:
+ ch = self.child_bar.widgetForAction(ac)
ch.setCursor(Qt.PointingHandCursor)
ch.setAutoRaise(True)
if ac.menu() is not None and menu_mode is not None:
@@ -280,7 +308,8 @@ class ToolBar(QToolBar): # {{{
if p == 'never':
style = Qt.ToolButtonIconOnly
- if p == 'auto' and self.preferred_width > self.width()+35:
+ if p == 'auto' and self.preferred_width > self.width()+35 and \
+ not gprefs['show_child_bar']:
style = Qt.ToolButtonIconOnly
self.setToolButtonStyle(style)
@@ -309,9 +338,11 @@ class MainWindowMixin(object): # {{{
self.iactions['Fetch News'].init_scheduler(db)
self.search_bar = SearchBar(self)
+ self.child_bar = QToolBar(self)
self.tool_bar = ToolBar(self.donate_button,
- self.location_manager, self)
+ self.location_manager, self.child_bar, self)
self.addToolBar(Qt.TopToolBarArea, self.tool_bar)
+ self.addToolBar(Qt.BottomToolBarArea, self.child_bar)
l = self.centralwidget.layout()
l.addWidget(self.search_bar)
diff --git a/src/calibre/gui2/preferences/look_feel.py b/src/calibre/gui2/preferences/look_feel.py
index f30b2fddbb..10c2fcfe95 100644
--- a/src/calibre/gui2/preferences/look_feel.py
+++ b/src/calibre/gui2/preferences/look_feel.py
@@ -46,6 +46,7 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form):
r('use_roman_numerals_for_series_number', config)
r('separate_cover_flow', config, restart_required=True)
r('search_as_you_type', config)
+ r('show_child_bar', gprefs)
choices = [(_('Small'), 'small'), (_('Medium'), 'medium'),
(_('Large'), 'large')]
diff --git a/src/calibre/gui2/preferences/look_feel.ui b/src/calibre/gui2/preferences/look_feel.ui
index 7c6c736b24..1de55d51ef 100644
--- a/src/calibre/gui2/preferences/look_feel.ui
+++ b/src/calibre/gui2/preferences/look_feel.ui
@@ -173,6 +173,13 @@
+   <item>
+    <widget class="QCheckBox" name="opt_show_child_bar">
+     <property name="text">
+      <string>&amp;Split the toolbar into two toolbars</string>
+     </property>
+    </widget>
+   </item>
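Taken together, PATCH 43 adds the show_child_bar preference, binds it to the new checkbox through the existing r('show_child_bar', gprefs) registration, and routes actions between the two bars in ToolBar.build_bar(). A standalone sketch of that routing rule, with a plain dict standing in for gprefs and made-up action names:

    gprefs = {'show_child_bar': True}

    # (name, action_type) pairs; the names are illustrative
    actions = [('Add Books', 'current'), ('Convert Books', 'current'),
               ('Fetch News', 'global'), ('Preferences', 'global')]

    main_bar, child_bar = [], []
    for name, action_type in actions:
        # Same rule as build_bar(): actions that act on the current view go
        # to the bottom child bar, but only when the preference is enabled.
        if action_type == 'current' and gprefs['show_child_bar']:
            child_bar.append(name)
        else:
            main_bar.append(name)

    print('main bar:  ' + ', '.join(main_bar))
    print('child bar: ' + ', '.join(child_bar))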