From 19ba43153b6bec69f0df754a064e565399cea62a Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 24 Apr 2009 07:27:54 -0400
Subject: [PATCH] Line length for pdf processing

---
 src/calibre/ebooks/conversion/preprocess.py | 40 ++++++++++++++++++++-
 src/calibre/ebooks/pdb/header.py            |  4 +--
 src/calibre/ebooks/pdb/input.py             |  2 +-
 3 files changed, 42 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b105a6c042..fb55ee74fb 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -32,6 +32,39 @@ def chap_head(match):
                return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
 
 
+def line_length(raw, percent):
+    '''
+    Find the line length to use for wrapping.
+
+    raw is the raw text; the lines measured are the runs of text between
+    <br> tags. percent is a decimal number, 0 - 1 (values outside that
+    range are clamped), giving how far into the sorted list of line
+    lengths to look. Lengths more than twice the average are ignored.
+
+    Returns the chosen length, or 0 if raw has no non-empty lines.
+    '''
+    raw = raw.replace('&nbsp;', ' ')
+    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+    lengths = sorted(len(line) for line in linere.findall(raw) if line)
+    if not lengths:
+        # Nothing to measure; bail out instead of dividing by zero.
+        return 0
+
+    # Drop outliers. lengths is sorted so they all sit at the end, and the
+    # shortest length never exceeds the limit, so the list stays non-empty.
+    max_line = sum(lengths) / len(lengths) * 2
+    while lengths[-1] > max_line:
+        lengths.pop()
+
+    percent = min(max(percent, 0), 1)
+
+    # Clamp at 0: for a small percent the index would otherwise be -1 and
+    # select the longest line instead of the shortest.
+    index = max(int(len(lengths) * percent) - 1, 0)
+
+    return lengths[index]
+
+
 class CSSPreProcessor(object):
 
     PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
         elif self.is_book_designer(html):
             rules = self.BOOK_DESIGNER
         elif self.is_pdftohtml(html):
-            rules = self.PDFTOHTML
+            # Add rules that require matching line length here
+            #line_length_rules = [
+            #    (re.compile('%i' % line_length(html, .85)), lambda match:)
+            #]
+            
+            rules = self.PDFTOHTML # + line_length_rules
         else:
             rules = []
         for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdb/header.py b/src/calibre/ebooks/pdb/header.py
index 5b47e48a16..60ce9f15b9 100644
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@@ -63,10 +63,10 @@ class PdbHeaderReader(object):
 class PdbHeaderWriter(object):
 
     def __init__(self, identity, title):
-        self.identity = identity[:8]
+        self.identity = identity.ljust(3, '\x00')[:8]  # NUL-pad to at least 3 chars, then cap at 8
         self.title = title.ljust(32, '\x00')[:32]
         
-    def build_header(self, sections)
+    def build_header(self, offsets):
         '''
         Sections is a list of section offsets
         '''
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 180e0814a6..31808d27d5 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import os
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.pdb.header import PdbHeader
+from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
 
 class PDBInput(InputFormatPlugin):