Handle output of pdftohtml
commit 5b7416ff84
parent bc3044ef60
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.78"
+__version__ = "0.3.79"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'

@@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                      (markup, [self.fromEncoding, inDocumentEncoding],
                       smartQuotesTo=self.smartQuotesTo)
             markup = dammit.unicode
+            if not markup: # Added by Kovid
+                from libprs500.ebooks import ConversionError
+                raise ConversionError, 'Failed to coerce to unicode'
             self.originalEncoding = dammit.originalEncoding
         if markup:
             if self.markupMassage:

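Aside: UnicodeDammit sets its `.unicode` attribute to None when no candidate encoding can decode the input, and BeautifulStoneSoup previously fed that None straight into SGMLParser, failing much later with an obscure error. A minimal standalone sketch of the pattern the added guard implements (Python 2; the helper and the ConversionError stand-in are invented here, not the commit's code):

    # Sketch only: a stand-in for the UnicodeDammit failure path.
    class ConversionError(Exception):   # stand-in for libprs500.ebooks.ConversionError
        pass

    def coerce_to_unicode(markup):
        # stand-in for UnicodeDammit: returns None when decoding fails
        try:
            return unicode(markup, 'utf8')
        except (UnicodeDecodeError, ValueError):
            return None

    markup = coerce_to_unicode('\xff\xfe garbled')   # invented bytes, not valid UTF-8
    if not markup:                                   # the new check added above
        raise ConversionError, 'Failed to coerce to unicode'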
@@ -967,7 +970,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                 for fix, m in self.markupMassage:
                     markup = fix.sub(m, markup)
             self.reset()

         SGMLParser.feed(self, markup)
         # Close out any unfinished strings and close all the open tags.
         self.endData()

@@ -1530,7 +1533,7 @@ class UnicodeDammit:
         self.triedEncodings = []
         if markup == '' or isinstance(markup, unicode):
             self.originalEncoding = None
             self.unicode = unicode(markup)
             return

         u = None

@@ -1552,7 +1555,7 @@ class UnicodeDammit:
             u = self._convertFrom(proposed_encoding)
             if u: break
         self.unicode = u
         if not u: self.originalEncoding = None

     def _subMSChar(self, orig):
         """Changes a MS smart quote character to an XML or HTML

@@ -1587,10 +1590,11 @@ class UnicodeDammit:
             self.markup = u
             self.originalEncoding = proposed
         except Exception, e:
-            # print "That didn't work!"
-            # print e
+            #print "That didn't work!"
+            #print e
             return None
         #print "Correct encoding: %s" % proposed

         return self.markup

     def _toUnicode(self, data, encoding):

@@ -1679,6 +1683,7 @@ class UnicodeDammit:
                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                 'utf16', 'u16')):
             xml_encoding = sniffed_xml_encoding

         return xml_data, xml_encoding, sniffed_xml_encoding

@@ -150,6 +150,8 @@ def option_parser(usage):
     prepro = parser.add_option_group('PREPROCESSING OPTIONS')
     prepro.add_option('--baen', action='store_true', default=False, dest='baen',
                       help='''Preprocess Baen HTML files to improve generated LRF.''')
+    prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
+                      help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')

     fonts = parser.add_option_group('FONT FAMILIES',

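In practice (an illustration, not part of the commit; the command names and flags below are this era's html2lrf entry point and poppler's pdftohtml, so treat them as assumptions) the workflow would be something like `pdftohtml -enc UTF-8 -noframes book.pdf book.html` followed by `html2lrf --pdftohtml book.html`. Without the flag, the hard-wrapped `<br>`/`<hr>` markup that pdftohtml emits tends to derail conversion.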
@@ -223,8 +223,12 @@ class HTMLConverter(object):
     PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
     IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
     # Fix <a /> elements
-    MARKUP_MASSAGE = [(re.compile(' '), lambda match : ' '), # Convert into a normal space as the default conversion converts it into \xa0 which is not a space in LRF
-                      (re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
+    MARKUP_MASSAGE = [
+                      # Convert into a normal space as the default
+                      # conversion converts it into \xa0 which is not a space in LRF
+                      (re.compile(' '), lambda match : ' '),
+                      # Close <a /> tags
+                      (re.compile("(<\s*[aA]\s+.*\/)\s*>"),
                       lambda match: match.group(1)+"></a>"),
                       # Strip comments from <style> tags. This is needed as
                       # sometimes there are unterminated comments

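For reference, a tiny standalone demonstration of the anchor-closing rule (the regex is copied verbatim from MARKUP_MASSAGE above; the sample input is invented):

    import re

    # the close-<a /> rule from MARKUP_MASSAGE
    fix = re.compile("(<\s*[aA]\s+.*\/)\s*>")
    print fix.sub(lambda match: match.group(1) + "></a>",
                  '<a name="p12" />')
    # prints: <a name="p12" /></a>
    # an explicit </a> is appended, so SGMLParser no longer
    # sees a self-closed anchor as an unclosed tag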
@@ -242,7 +246,16 @@ class HTMLConverter(object):
                       lambda match: match.group(1)),
                      (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
                       lambda match: ''),
                      ]
+    # Fix pdftohtml markup
+    PDFTOHTML = [
+                 # Remove <hr> tags
+                 (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
+                 # Remove <br> and replace <br><br> with <p>
+                 (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                 (re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
+                  lambda match: match.group(1)),
+                 ]

     class Link(object):
         def __init__(self, para, tag):

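A self-contained sketch of what these three substitutions do to typical pdftohtml output (the sample string is invented; the rules are applied in list order, just as BeautifulSoup's markupMassage loop applies them):

    import re

    PDFTOHTML = [  # copied from the class attribute above
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
        (re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
         lambda match: match.group(1)),
    ]

    markup = 'a' * 80 + '<br>rest of a hard-wrapped line<hr><br><br>next paragraph'
    for fix, m in PDFTOHTML:        # same application order as markupMassage
        markup = fix.sub(m, markup)
    print markup
    # the <hr> is gone, <br><br> became <p>, and the <br> after a long
    # (>= 75 char) run is dropped to undo pdftohtml's hard line wrapping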
@@ -261,7 +274,8 @@ class HTMLConverter(object):
                  force_page_break=re.compile('$', re.IGNORECASE),
                  profile=PRS500_PROFILE,
                  disable_autorotation=False,
-                 ignore_tables=False):
+                 ignore_tables=False,
+                 pdftohtml=False):
         '''
         Convert HTML file at C{path} and add it to C{book}. After creating
         the object, you must call L{self.process_links} on it to create the links and

@@ -365,9 +379,15 @@ class HTMLConverter(object):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
         self.baen = baen
+        self.pdftohtml = pdftohtml
         if baen:
             nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
-        self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
+        raw = open(self.file_name, 'rb').read()
+        if pdftohtml:
+            nmassage.extend(HTMLConverter.PDFTOHTML)
+            raw = unicode(raw, 'utf8', 'replace')
+        self.soup = BeautifulSoup(raw,
                           convertEntities=BeautifulSoup.HTML_ENTITIES,
                           markupMassage=nmassage)
         print 'done\n\tConverting to BBeB...',

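The switch from text-mode 'r' to binary 'rb' plus an explicit unicode(raw, 'utf8', 'replace') means undecodable bytes in pdftohtml output (assumed UTF-8 here) degrade to U+FFFD instead of aborting the run. A quick Python 2 illustration with invented bytes:

    raw = '<p>caf\xff\xe9</p>'                 # not valid UTF-8
    print repr(unicode(raw, 'utf8', 'replace'))
    # u'<p>caf\ufffd\ufffd</p>' -- bad bytes become U+FFFD rather than
    # raising UnicodeDecodeError as unicode(raw, 'utf8') would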
@@ -614,7 +634,8 @@ class HTMLConverter(object):
                               page_break=self.page_break,
                               force_page_break=self.force_page_break,
                               disable_autorotation=self.disable_autorotation,
-                              ignore_tables=self.ignore_tables)
+                              ignore_tables=self.ignore_tables,
+                              pdftohtml=self.pdftohtml)
                 HTMLConverter.processed_files[path] = self.files[path]
             except Exception:
                 print >>sys.stderr, 'Unable to process', path

@@ -1298,7 +1319,8 @@ def process_file(path, options):
                         chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
                         link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
                         disable_autorotation=options.disable_autorotation,
-                        ignore_tables=options.ignore_tables)
+                        ignore_tables=options.ignore_tables,
+                        pdftohtml=options.pdftohtml)
         conv.process_links()
         oname = options.output
         if not oname:
|
Loading…
x
Reference in New Issue
Block a user