Handle output of pdftohtml

This commit is contained in:
Kovid Goyal 2007-07-21 04:05:46 +00:00
parent bc3044ef60
commit 5b7416ff84
4 changed files with 42 additions and 13 deletions

View File

@@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software''' ''' E-book management software'''
__version__ = "0.3.78" __version__ = "0.3.79"
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500' __appname__ = 'libprs500'

View File

@@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
(markup, [self.fromEncoding, inDocumentEncoding], (markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo) smartQuotesTo=self.smartQuotesTo)
markup = dammit.unicode markup = dammit.unicode
if not markup: # Added by Kovid
from libprs500.ebooks import ConversionError
raise ConversionError, 'Failed to coerce to unicode'
self.originalEncoding = dammit.originalEncoding self.originalEncoding = dammit.originalEncoding
if markup: if markup:
if self.markupMassage: if self.markupMassage:
@@ -967,7 +970,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
for fix, m in self.markupMassage: for fix, m in self.markupMassage:
markup = fix.sub(m, markup) markup = fix.sub(m, markup)
self.reset() self.reset()
SGMLParser.feed(self, markup) SGMLParser.feed(self, markup)
# Close out any unfinished strings and close all the open tags. # Close out any unfinished strings and close all the open tags.
self.endData() self.endData()
@@ -1530,7 +1533,7 @@ class UnicodeDammit:
self.triedEncodings = [] self.triedEncodings = []
if markup == '' or isinstance(markup, unicode): if markup == '' or isinstance(markup, unicode):
self.originalEncoding = None self.originalEncoding = None
self.unicode = unicode(markup) self.unicode = unicode(markup)
return return
u = None u = None
@@ -1552,7 +1555,7 @@ class UnicodeDammit:
u = self._convertFrom(proposed_encoding) u = self._convertFrom(proposed_encoding)
if u: break if u: break
self.unicode = u self.unicode = u
if not u: self.originalEncoding = None if not u: self.originalEncoding = None
def _subMSChar(self, orig): def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML """Changes a MS smart quote character to an XML or HTML
@@ -1587,10 +1590,11 @@ class UnicodeDammit:
self.markup = u self.markup = u
self.originalEncoding = proposed self.originalEncoding = proposed
except Exception, e: except Exception, e:
# print "That didn't work!" #print "That didn't work!"
# print e #print e
return None return None
#print "Correct encoding: %s" % proposed #print "Correct encoding: %s" % proposed
return self.markup return self.markup
def _toUnicode(self, data, encoding): def _toUnicode(self, data, encoding):
@@ -1679,6 +1683,7 @@ class UnicodeDammit:
'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')): 'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding

View File

@@ -150,6 +150,8 @@ def option_parser(usage):
prepro = parser.add_option_group('PREPROCESSING OPTIONS') prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen', prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help='''Preprocess Baen HTML files to improve generated LRF.''') help='''Preprocess Baen HTML files to improve generated LRF.''')
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')
fonts = parser.add_option_group('FONT FAMILIES', fonts = parser.add_option_group('FONT FAMILIES',

View File

@@ -223,8 +223,12 @@ class HTMLConverter(object):
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements # Fix <a /> elements
MARKUP_MASSAGE = [(re.compile('&nbsp;'), lambda match : ' '), # Convert &nbsp; into a normal space as the default conversion converts it into \xa0 which is not a space in LRF MARKUP_MASSAGE = [
(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags # Convert &nbsp; into a normal space as the default
# conversion converts it into \xa0 which is not a space in LRF
(re.compile('&nbsp;'), lambda match : ' '),
# Close <a /> tags
(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>"), lambda match: match.group(1)+"></a>"),
# Strip comments from <style> tags. This is needed as # Strip comments from <style> tags. This is needed as
# sometimes there are unterminated comments # sometimes there are unterminated comments
@@ -242,7 +246,16 @@ class HTMLConverter(object):
lambda match: match.group(1)), lambda match: match.group(1)),
(re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
lambda match: ''), lambda match: ''),
] ]
# Fix pdftohtml markup
PDFTOHTML = [
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
# Remove <br> and replace <br><br> with <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
(re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
lambda match: match.group(1)),
]
class Link(object): class Link(object):
def __init__(self, para, tag): def __init__(self, para, tag):
@@ -261,7 +274,8 @@ class HTMLConverter(object):
force_page_break=re.compile('$', re.IGNORECASE), force_page_break=re.compile('$', re.IGNORECASE),
profile=PRS500_PROFILE, profile=PRS500_PROFILE,
disable_autorotation=False, disable_autorotation=False,
ignore_tables=False): ignore_tables=False,
pdftohtml=False):
''' '''
Convert HTML file at C{path} and add it to C{book}. After creating Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and the object, you must call L{self.process_links} on it to create the links and
@@ -365,9 +379,15 @@ class HTMLConverter(object):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
self.baen = baen self.baen = baen
self.pdftohtml = pdftohtml
if baen: if baen:
nmassage.extend(HTMLConverter.BAEN_SANCTIFY) nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
raw = open(self.file_name, 'rb').read()
if pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML)
raw = unicode(raw, 'utf8', 'replace')
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage) markupMassage=nmassage)
print 'done\n\tConverting to BBeB...', print 'done\n\tConverting to BBeB...',
@@ -614,7 +634,8 @@ class HTMLConverter(object):
page_break=self.page_break, page_break=self.page_break,
force_page_break=self.force_page_break, force_page_break=self.force_page_break,
disable_autorotation=self.disable_autorotation, disable_autorotation=self.disable_autorotation,
ignore_tables=self.ignore_tables) ignore_tables=self.ignore_tables,
pdftohtml=self.pdftohtml)
HTMLConverter.processed_files[path] = self.files[path] HTMLConverter.processed_files[path] = self.files[path]
except Exception: except Exception:
print >>sys.stderr, 'Unable to process', path print >>sys.stderr, 'Unable to process', path
@@ -1298,7 +1319,8 @@ def process_file(path, options):
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE), chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
link_exclude=re.compile(le), page_break=pb, force_page_break=fpb, link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
disable_autorotation=options.disable_autorotation, disable_autorotation=options.disable_autorotation,
ignore_tables=options.ignore_tables) ignore_tables=options.ignore_tables,
pdftohtml=options.pdftohtml)
conv.process_links() conv.process_links()
oname = options.output oname = options.output
if not oname: if not oname: