diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b105a6c042..fb55ee74fb 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -32,6 +32,39 @@ def chap_head(match):
return '
'+chap+'
'+title+'
'
+def line_length(raw, percent):
+ '''
+ raw is the raw text from which to determine the line length used for wrapping.
+ percent is a fraction between 0 and 1 (values outside that range are clamped)
+ that selects how far into the ascending list of line lengths to look.
+ '''
+ raw = raw.replace(' ', ' ')
+ linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ lines = linere.findall(raw)
+
+ lengths = []
+ for line in lines:
+ if len(line) > 0:
+ lengths.append(len(line))
+ total = sum(lengths)
+ avg = total / len(lengths)
+ max_line = avg * 2
+
+ lengths = sorted(lengths)
+ for i in range(len(lengths) - 1, -1, -1):
+ if lengths[i] > max_line:
+ del lengths[i]
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ index = int(len(lengths) * percent) - 1
+
+ return lengths[index]
+
+
class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
- rules = self.PDFTOHTML
+ # Add rules that require matching line length here
+ #line_length_rules = [
+ # (re.compile('%i' % line_length(html, .85)), lambda match:)
+ #]
+
+ rules = self.PDFTOHTML # + line_length_rules
else:
rules = []
for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py
index 5b47e48a16..60ce9f15b9 100644
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@@ -63,10 +63,10 @@ class PdbHeaderReader(object):
class PdbHeaderWriter(object):
def __init__(self, identity, title):
- self.identity = identity[:8]
+ self.identity = identity.ljust(3, '\x00')[:8]
self.title = title.ljust(32, '\x00')[:32]
- def build_header(self, sections)
+ def build_header(self, offsets):
'''
Sections is a list of section offsets
'''
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 180e0814a6..31808d27d5 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.pdb.header import PdbHeader
+from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
class PDBInput(InputFormatPlugin):