Handle output of pdftohtml

This commit is contained in:
Kovid Goyal 2007-07-21 04:05:46 +00:00
parent bc3044ef60
commit 5b7416ff84
4 changed files with 42 additions and 13 deletions

View File

@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software'''
__version__ = "0.3.78"
__version__ = "0.3.79"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'

View File

@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
(markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo)
markup = dammit.unicode
if not markup: # Added by Kovid
from libprs500.ebooks import ConversionError
raise ConversionError, 'Failed to coerce to unicode'
self.originalEncoding = dammit.originalEncoding
if markup:
if self.markupMassage:
@ -1591,6 +1594,7 @@ class UnicodeDammit:
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _toUnicode(self, data, encoding):
@ -1679,6 +1683,7 @@ class UnicodeDammit:
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding

View File

@ -150,6 +150,8 @@ def option_parser(usage):
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help='''Preprocess Baen HTML files to improve generated LRF.''')
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')
fonts = parser.add_option_group('FONT FAMILIES',

View File

@ -223,8 +223,12 @@ class HTMLConverter(object):
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements
MARKUP_MASSAGE = [(re.compile('&nbsp;'), lambda match : ' '), # Convert &nbsp; into a normal space as the default conversion converts it into \xa0 which is not a space in LRF
(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
MARKUP_MASSAGE = [
# Convert &nbsp; into a normal space as the default
# conversion converts it into \xa0 which is not a space in LRF
(re.compile('&nbsp;'), lambda match : ' '),
# Close <a /> tags
(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>"),
# Strip comments from <style> tags. This is needed as
# sometimes there are unterminated comments
@ -243,6 +247,15 @@ class HTMLConverter(object):
(re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
lambda match: ''),
]
# Fix pdftohtml markup
# Extra (compiled pattern, replacement callable) pairs, in the same shape as
# the MARKUP_MASSAGE entries. They are only added to the markup massage list
# when the converter is created with pdftohtml=True.
# NOTE(review): the two-<br> rule is listed before the single-<br> rule;
# presumably the rules are applied in list order so that paragraph breaks
# are detected before stray line breaks are stripped -- confirm against
# BeautifulSoup's markupMassage handling.
PDFTOHTML = [
# Remove <hr> tags (matches any tag whose name starts with "hr",
# case-insensitively, including its attributes)
(re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
# Replace two consecutive <br> tags (optionally separated by
# whitespace) with a single <p>
(re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
# Drop a <br> only when it is preceded by at least 75 characters on the
# same line ('.' does not match newlines here); the preceding text is
# kept via group(1). A <br> after a shorter run of text is left alone.
# Presumably this rejoins lines that pdftohtml hard-wrapped -- TODO confirm.
(re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
lambda match: match.group(1)),
]
class Link(object):
def __init__(self, para, tag):
@ -261,7 +274,8 @@ class HTMLConverter(object):
force_page_break=re.compile('$', re.IGNORECASE),
profile=PRS500_PROFILE,
disable_autorotation=False,
ignore_tables=False):
ignore_tables=False,
pdftohtml=False):
'''
Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and
@ -365,9 +379,15 @@ class HTMLConverter(object):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
self.baen = baen
self.pdftohtml = pdftohtml
if baen:
nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
raw = open(self.file_name, 'rb').read()
if pdftohtml:
nmassage.extend(HTMLConverter.PDFTOHTML)
raw = unicode(raw, 'utf8', 'replace')
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...',
@ -614,7 +634,8 @@ class HTMLConverter(object):
page_break=self.page_break,
force_page_break=self.force_page_break,
disable_autorotation=self.disable_autorotation,
ignore_tables=self.ignore_tables)
ignore_tables=self.ignore_tables,
pdftohtml=self.pdftohtml)
HTMLConverter.processed_files[path] = self.files[path]
except Exception:
print >>sys.stderr, 'Unable to process', path
@ -1298,7 +1319,8 @@ def process_file(path, options):
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
disable_autorotation=options.disable_autorotation,
ignore_tables=options.ignore_tables)
ignore_tables=options.ignore_tables,
pdftohtml=options.pdftohtml)
conv.process_links()
oname = options.output
if not oname: