Handle output of pdftohtml

This commit is contained in:
Kovid Goyal 2007-07-21 04:05:46 +00:00
parent bc3044ef60
commit 5b7416ff84
4 changed files with 42 additions and 13 deletions

View File

@@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software''' ''' E-book management software'''
__version__ = "0.3.78" __version__ = "0.3.79"
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500' __appname__ = 'libprs500'

View File

@@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
(markup, [self.fromEncoding, inDocumentEncoding], (markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo) smartQuotesTo=self.smartQuotesTo)
markup = dammit.unicode markup = dammit.unicode
if not markup: # Added by Kovid
from libprs500.ebooks import ConversionError
raise ConversionError, 'Failed to coerce to unicode'
self.originalEncoding = dammit.originalEncoding self.originalEncoding = dammit.originalEncoding
if markup: if markup:
if self.markupMassage: if self.markupMassage:
@@ -967,7 +970,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
for fix, m in self.markupMassage: for fix, m in self.markupMassage:
markup = fix.sub(m, markup) markup = fix.sub(m, markup)
self.reset() self.reset()
SGMLParser.feed(self, markup) SGMLParser.feed(self, markup)
# Close out any unfinished strings and close all the open tags. # Close out any unfinished strings and close all the open tags.
self.endData() self.endData()
@@ -1530,7 +1533,7 @@ class UnicodeDammit:
self.triedEncodings = [] self.triedEncodings = []
if markup == '' or isinstance(markup, unicode): if markup == '' or isinstance(markup, unicode):
self.originalEncoding = None self.originalEncoding = None
self.unicode = unicode(markup) self.unicode = unicode(markup)
return return
u = None u = None
@@ -1552,7 +1555,7 @@ class UnicodeDammit:
u = self._convertFrom(proposed_encoding) u = self._convertFrom(proposed_encoding)
if u: break if u: break
self.unicode = u self.unicode = u
if not u: self.originalEncoding = None if not u: self.originalEncoding = None
def _subMSChar(self, orig): def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML """Changes a MS smart quote character to an XML or HTML
@@ -1587,10 +1590,11 @@ class UnicodeDammit:
self.markup = u self.markup = u
self.originalEncoding = proposed self.originalEncoding = proposed
except Exception, e: except Exception, e:
# print "That didn't work!" #print "That didn't work!"
# print e #print e
return None return None
#print "Correct encoding: %s" % proposed #print "Correct encoding: %s" % proposed
return self.markup return self.markup
def _toUnicode(self, data, encoding): def _toUnicode(self, data, encoding):
@@ -1679,6 +1683,7 @@ class UnicodeDammit:
'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')): 'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding

View File

@@ -150,6 +150,8 @@ def option_parser(usage):
prepro = parser.add_option_group('PREPROCESSING OPTIONS') prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen', prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help='''Preprocess Baen HTML files to improve generated LRF.''') help='''Preprocess Baen HTML files to improve generated LRF.''')
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')
fonts = parser.add_option_group('FONT FAMILIES', fonts = parser.add_option_group('FONT FAMILIES',

View File

@@ -223,8 +223,12 @@ class HTMLConverter(object):
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements # Fix <a /> elements
MARKUP_MASSAGE = [(re.compile('&nbsp;'), lambda match : ' '), # Convert &nbsp; into a normal space as the default conversion converts it into \xa0 which is not a space in LRF MARKUP_MASSAGE = [
(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags # Convert &nbsp; into a normal space as the default
# conversion converts it into \xa0 which is not a space in LRF
(re.compile('&nbsp;'), lambda match : ' '),
# Close <a /> tags
(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>"), lambda match: match.group(1)+"></a>"),
# Strip comments from <style> tags. This is needed as # Strip comments from <style> tags. This is needed as
# sometimes there are unterminated comments # sometimes there are unterminated comments
@@ -242,7 +246,16 @@ class HTMLConverter(object):
lambda match: match.group(1)), lambda match: match.group(1)),
(re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
lambda match: ''), lambda match: ''),
] ]
# Fix pdftohtml markup
PDFTOHTML = [
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
# Remove <br> and replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
(re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
lambda match: match.group(1)),
]
class Link(object): class Link(object):
def __init__(self, para, tag): def __init__(self, para, tag):
@@ -261,7 +274,8 @@ class HTMLConverter(object):
force_page_break=re.compile('$', re.IGNORECASE), force_page_break=re.compile('$', re.IGNORECASE),
profile=PRS500_PROFILE, profile=PRS500_PROFILE,
disable_autorotation=False, disable_autorotation=False,
ignore_tables=False): ignore_tables=False,
pdftohtml=False):
''' '''
Convert HTML file at C{path} and add it to C{book}. After creating Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and the object, you must call L{self.process_links} on it to create the links and
@@ -365,9 +379,15 @@ class HTMLConverter(object):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
self.baen = baen self.baen = baen
self.pdftohtml = pdftohtml
if baen: if baen:
nmassage.extend(HTMLConverter.BAEN_SANCTIFY) nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
raw = open(self.file_name, 'rb').read()
if pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML)
raw = unicode(raw, 'utf8', 'replace')
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage) markupMassage=nmassage)
print 'done\n\tConverting to BBeB...', print 'done\n\tConverting to BBeB...',
@@ -614,7 +634,8 @@ class HTMLConverter(object):
page_break=self.page_break, page_break=self.page_break,
force_page_break=self.force_page_break, force_page_break=self.force_page_break,
disable_autorotation=self.disable_autorotation, disable_autorotation=self.disable_autorotation,
ignore_tables=self.ignore_tables) ignore_tables=self.ignore_tables,
pdftohtml=self.pdftohtml)
HTMLConverter.processed_files[path] = self.files[path] HTMLConverter.processed_files[path] = self.files[path]
except Exception: except Exception:
print >>sys.stderr, 'Unable to process', path print >>sys.stderr, 'Unable to process', path
@@ -1298,7 +1319,8 @@ def process_file(path, options):
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE), chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
link_exclude=re.compile(le), page_break=pb, force_page_break=fpb, link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
disable_autorotation=options.disable_autorotation, disable_autorotation=options.disable_autorotation,
ignore_tables=options.ignore_tables) ignore_tables=options.ignore_tables,
pdftohtml=options.pdftohtml)
conv.process_links() conv.process_links()
oname = options.output oname = options.output
if not oname: if not oname: