Line length for pdf processing

2025-07-09 03:04:10 -04:00 · 2009-04-24 07:27:54 -04:00 · 2009-04-24 07:27:54 -04:00 · 19ba43153b
commit 19ba43153b
parent d871313ff0
3 changed files with 42 additions and 4 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -32,6 +32,39 @@ def chap_head(match):
               return '<h1>'+chap+'<br/>'+title+'</h1><br/>'
 def line_length(raw, percent):
    '''
    raw is the raw text to find the line length to use for wrapping.
    percentage is a decimal number, 0 - 1 which is used to determine
    how far in the list of line lengths to use.
    '''
    raw = raw.replace('&nbsp;', ' ')
    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
    lines = linere.findall(raw)
    lengths = []
    for line in lines:
        if len(line) > 0:
            lengths.append(len(line))
    total = sum(lengths)
    avg = total / len(lengths)
    max_line = avg * 2
    lengths = sorted(lengths)
    for i in range(len(lengths) - 1, -1, -1):
        if lengths[i] > max_line:
            del lengths[i]
    if percent > 1:
        percent = 1
    if percent < 0:
        percent = 0
    index = int(len(lengths) * percent) - 1
    return lengths[index]
 class CSSPreProcessor(object):
    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
@ -129,7 +162,12 @@ class HTMLPreProcessor(object):
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
-            rules = self.PDFTOHTML
+            # Add rules that require matching line length here
            #line_length_rules = [
            #    (re.compile('%i' % line_length(html, .85)), lambda match:)
            #]
            rules = self.PDFTOHTML # + line_length_rules
        else:
            rules = []
        for rule in self.PREPROCESS + rules:
--- a/src/calibre/ebooks/pdb/header.py
+++ b/src/calibre/ebooks/pdb/header.py
@ -63,10 +63,10 @@ class PdbHeaderReader(object):
 class PdbHeaderWriter(object):
    def __init__(self, identity, title):
-        self.identity = identity[:8]
+        self.identity = identity.ljust(3, '\x00')[:8]
        self.title = title.ljust(32, '\x00')[:32]
-    def build_header(self, sections)
+    def build_header(self, offsets):
        '''
        Sections is a list of section offsets
        '''
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import os
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.pdb.header import PdbHeader
+from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
 class PDBInput(InputFormatPlugin):