mirror of https://github.com/kovidgoyal/calibre.git

Handle output of pdftohtml

commit 5b7416ff84, parent bc3044ef60
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.78"
+__version__ = "0.3.79"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
@@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser):
                      (markup, [self.fromEncoding, inDocumentEncoding],
                       smartQuotesTo=self.smartQuotesTo)
             markup = dammit.unicode
+            if not markup: # Added by Kovid
+                from libprs500.ebooks import ConversionError
+                raise ConversionError, 'Failed to coerce to unicode'
             self.originalEncoding = dammit.originalEncoding
         if markup:
             if self.markupMassage:
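For illustration, a minimal sketch of the failure path this guard covers: BeautifulSoup's UnicodeDammit leaves its unicode result as None when it cannot find a working encoding, and the new check turns that silent None into an explicit error. Only the ConversionError import path comes from the hunk above; the stand-in class and helper below are hypothetical.

class ConversionError(Exception):   # stand-in for libprs500.ebooks.ConversionError
    pass

def require_unicode(markup):
    '''markup is what UnicodeDammit produced; None (or '') means it gave up.'''
    if not markup:
        raise ConversionError('Failed to coerce to unicode')
    return markup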
@@ -1591,6 +1594,7 @@ class UnicodeDammit:
             #print e
             return None
         #print "Correct encoding: %s" % proposed
+
         return self.markup

     def _toUnicode(self, data, encoding):
@@ -1679,6 +1683,7 @@ class UnicodeDammit:
                          'utf-16', 'utf-32', 'utf_16', 'utf_32',
                          'utf16', 'u16')):
             xml_encoding = sniffed_xml_encoding
+
         return xml_data, xml_encoding, sniffed_xml_encoding

@@ -150,6 +150,8 @@ def option_parser(usage):
     prepro = parser.add_option_group('PREPROCESSING OPTIONS')
     prepro.add_option('--baen', action='store_true', default=False, dest='baen',
                       help='''Preprocess Baen HTML files to improve generated LRF.''')
+    prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
+                      help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''')

     fonts = parser.add_option_group('FONT FAMILIES',
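A small, self-contained sketch of how the new flag surfaces through optparse. The add_option_group/add_option calls above are the stdlib optparse API; the real parser, its usage string, and the remaining options live elsewhere in the source, and book.html below is just a placeholder argument.

from optparse import OptionParser

parser = OptionParser()
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml',
                  help='You must add this option if processing files generated by '
                       'pdftohtml, otherwise conversion will fail.')

options, args = parser.parse_args(['--pdftohtml', 'book.html'])
print(options.pdftohtml)   # True; the final hunk reads this as options.pdftohtml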
@@ -223,8 +223,12 @@ class HTMLConverter(object):
     PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
     IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
-    # Fix <a /> elements
-    MARKUP_MASSAGE = [(re.compile('&nbsp;'), lambda match : ' '), # Convert &nbsp; into a normal space as the default conversion converts it into \xa0 which is not a space in LRF
-                      (re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
+    MARKUP_MASSAGE = [
+                      # Convert &nbsp; into a normal space as the default
+                      # conversion converts it into \xa0 which is not a space in LRF
+                      (re.compile('&nbsp;'), lambda match : ' '),
+                      # Close <a /> tags
+                      (re.compile("(<\s*[aA]\s+.*\/)\s*>"),
                       lambda match: match.group(1)+"></a>"),
                       # Strip comments from <style> tags. This is needed as
                       # sometimes there are unterminated comments
@@ -243,6 +247,15 @@ class HTMLConverter(object):
                      (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
                       lambda match: ''),
                      ]
+    # Fix pdftohtml markup
+    PDFTOHTML = [
+                 # Remove <hr> tags
+                 (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
+                 # Remove <br> and replace <br><br> with <p>
+                 (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                 (re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE),
+                  lambda match: match.group(1)),
+                 ]

     class Link(object):
         def __init__(self, para, tag):
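To see what the new PDFTOHTML rules do, here is a standalone demonstration. The (pattern, replacement) pairs are copied from the hunk above; BeautifulSoup's markupMassage applies such pairs with pattern.sub(replacement, markup), which the loop below reproduces. The sample string is illustrative only.

import re

PDFTOHTML = [
    # Remove <hr> tags
    (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: ''),
    # Replace <br><br> (an empty line) with a real paragraph break
    (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
    # Drop a lone <br> that merely hard-wraps a line of 75+ characters
    (re.compile(r'(.{75,}?)<br.*?>', re.IGNORECASE), lambda match: match.group(1)),
]

sample = ('x' * 80) + '<br>\n<hr>\nA short line<br><br>Start of the next paragraph'
for pattern, replacement in PDFTOHTML:
    sample = pattern.sub(replacement, sample)

# The <hr> is gone, the hard wrap after the 80-character line is unwrapped,
# and the blank line (<br><br>) has become a <p> paragraph break.
print(sample)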
@@ -261,7 +274,8 @@ class HTMLConverter(object):
                  force_page_break=re.compile('$', re.IGNORECASE),
                  profile=PRS500_PROFILE,
                  disable_autorotation=False,
-                 ignore_tables=False):
+                 ignore_tables=False,
+                 pdftohtml=False):
         '''
         Convert HTML file at C{path} and add it to C{book}. After creating
         the object, you must call L{self.process_links} on it to create the links and
@@ -365,9 +379,15 @@ class HTMLConverter(object):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
         self.baen = baen
+        self.pdftohtml = pdftohtml
         if baen:
             nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
-        self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
+
+        raw = open(self.file_name, 'rb').read()
+        if pdftohtml:
+            nmassage.extend(HTMLConverter.PDFTOHTML)
+            raw = unicode(raw, 'utf8', 'replace')
+        self.soup = BeautifulSoup(raw,
                                   convertEntities=BeautifulSoup.HTML_ENTITIES,
                                   markupMassage=nmassage)
         print 'done\n\tConverting to BBeB...',
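The decode step above assumes pdftohtml writes UTF-8; a brief sketch of why the 'replace' error handler is used (the sample bytes are illustrative only):

raw = b'caf\xc3\xa9 then a stray byte: \xff'
text = raw.decode('utf8', 'replace')   # same effect as unicode(raw, 'utf8', 'replace')
assert u'\ufffd' in text               # 0xff is not valid UTF-8; it becomes U+FFFD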
@@ -614,7 +634,8 @@ class HTMLConverter(object):
                                   page_break=self.page_break,
                                   force_page_break=self.force_page_break,
                                   disable_autorotation=self.disable_autorotation,
-                                  ignore_tables=self.ignore_tables)
+                                  ignore_tables=self.ignore_tables,
+                                  pdftohtml=self.pdftohtml)
                 HTMLConverter.processed_files[path] = self.files[path]
             except Exception:
                 print >>sys.stderr, 'Unable to process', path
@@ -1298,7 +1319,8 @@ def process_file(path, options):
                         chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
                         link_exclude=re.compile(le), page_break=pb, force_page_break=fpb,
                         disable_autorotation=options.disable_autorotation,
-                        ignore_tables=options.ignore_tables)
+                        ignore_tables=options.ignore_tables,
+                        pdftohtml=options.pdftohtml)
     conv.process_links()
     oname = options.output
     if not oname: