Handle output of pdftohtml

2025-07-09 03:04:10 -04:00 · 2007-07-21 04:05:46 +00:00 · 2007-07-21 04:05:46 +00:00 · 5b7416ff84
commit 5b7416ff84
parent bc3044ef60
4 changed files with 42 additions and 13 deletions
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,7 +13,7 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__   = "0.3.78"
+__version__   = "0.3.79"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__   = 'libprs500'
--- a/src/libprs500/ebooks/BeautifulSoup.py
+++ b/src/libprs500/ebooks/BeautifulSoup.py
@@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
+            if not markup: # Added by Kovid
+                from libprs500.ebooks import ConversionError
+                raise ConversionError, 'Failed to coerce to unicode'
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
@@ -1591,6 +1594,7 @@ class UnicodeDammit:
            #print e
            return None        
        #print "Correct encoding: %s" % proposed
+        
        return self.markup

    def _toUnicode(self, data, encoding):
@@ -1679,6 +1683,7 @@ class UnicodeDammit:
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
+        
        return xml_data, xml_encoding, sniffed_xml_encoding


--- a/src/libprs500/ebooks/lrf/__init__.py
+++ b/src/libprs500/ebooks/lrf/__init__.py
@@ -150,6 +150,8 @@ def option_parser(usage):
    prepro = parser.add_option_group('PREPROCESSING OPTIONS')
    prepro.add_option('--baen', action='store_true', default=False, dest='baen',
                      help='''Preprocess Baen HTML files to improve generated LRF.''')
+    prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
+                      help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')

    
    fonts = parser.add_option_group('FONT FAMILIES', 
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -223,8 +223,12 @@ class HTMLConverter(object):
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
    # Fix <a /> elements 
-    MARKUP_MASSAGE   = [(re.compile('&nbsp;'), lambda match : ' '), # Convert &nbsp; into a normal space as the default conversion converts it into \xa0 which is not a space in LRF
-                        (re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
+    MARKUP_MASSAGE   = [
+                        # Convert &nbsp; into a normal space as the default 
+                        # conversion converts it into \xa0 which is not a space in LRF
+                        (re.compile('&nbsp;'), lambda match : ' '),
+                        # Close <a /> tags
+                        (re.compile("(<\s*[aA]\s+.*\/)\s*>"), 
                         lambda match: match.group(1)+"></a>"),
                         # Strip comments from <style> tags. This is needed as 
                         # sometimes there are unterminated comments
@@ -243,6 +247,15 @@ class HTMLConverter(object):
                     (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE), 
                      lambda match: ''),
                     ]
+    # Fix pdftohtml markup
+    PDFTOHTML  = [
+                  # Remove <hr> tags
+                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
+                  # Replace <br><br> with <p>; drop a <br> that follows 75+ characters on one line (likely a hard-wrapped line)
+                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                  (re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE), 
+                   lambda match: match.group(1)),
+                  ]
    
    class Link(object):
        def __init__(self, para, tag):
@@ -261,7 +274,8 @@ class HTMLConverter(object):
                 force_page_break=re.compile('$', re.IGNORECASE),
                 profile=PRS500_PROFILE,
                 disable_autorotation=False,
-                 ignore_tables=False):
+                 ignore_tables=False,
+                 pdftohtml=False):
        '''
        Convert HTML file at C{path} and add it to C{book}. After creating
        the object, you must call L{self.process_links} on it to create the links and
@@ -365,9 +379,15 @@ class HTMLConverter(object):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
        self.baen = baen
+        self.pdftohtml = pdftohtml
        if baen:
            nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
-        self.soup = BeautifulSoup(open(self.file_name, 'r').read(), 
+            
+        raw = open(self.file_name, 'rb').read()
+        if pdftohtml:
+            nmassage.extend(HTMLConverter.PDFTOHTML)
+            raw = unicode(raw, 'utf8', 'replace')
+        self.soup = BeautifulSoup(raw, 
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)
        print 'done\n\tConverting to BBeB...',
@@ -614,7 +634,8 @@ class HTMLConverter(object):
                                     page_break=self.page_break,
                                     force_page_break=self.force_page_break,
                                     disable_autorotation=self.disable_autorotation,
-                                     ignore_tables=self.ignore_tables)
+                                     ignore_tables=self.ignore_tables,
+                                     pdftohtml=self.pdftohtml)
                        HTMLConverter.processed_files[path] = self.files[path]
                    except Exception:
                        print >>sys.stderr, 'Unable to process', path
@@ -1298,7 +1319,8 @@ def process_file(path, options):
                             chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
                             link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
                             disable_autorotation=options.disable_autorotation,
-                             ignore_tables=options.ignore_tables)
+                             ignore_tables=options.ignore_tables,
+                             pdftohtml=options.pdftohtml)
        conv.process_links()
        oname = options.output
        if not oname: