diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py index c8534df2ef..7bd1535092 100644 --- a/src/libprs500/__init__.py +++ b/src/libprs500/__init__.py @@ -13,7 +13,7 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ''' E-book management software''' -__version__ = "0.3.78" +__version__ = "0.3.79" __docformat__ = "epytext" __author__ = "Kovid Goyal " __appname__ = 'libprs500' diff --git a/src/libprs500/ebooks/BeautifulSoup.py b/src/libprs500/ebooks/BeautifulSoup.py index 6ef8ac026a..db916473b4 100644 --- a/src/libprs500/ebooks/BeautifulSoup.py +++ b/src/libprs500/ebooks/BeautifulSoup.py @@ -959,6 +959,9 @@ class BeautifulStoneSoup(Tag, SGMLParser): (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo) markup = dammit.unicode + if not markup: # Added by Kovid + from libprs500.ebooks import ConversionError + raise ConversionError, 'Failed to coerce to unicode' self.originalEncoding = dammit.originalEncoding if markup: if self.markupMassage: @@ -967,7 +970,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): for fix, m in self.markupMassage: markup = fix.sub(m, markup) self.reset() - + SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. self.endData() @@ -1530,7 +1533,7 @@ class UnicodeDammit: self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): self.originalEncoding = None - self.unicode = unicode(markup) + self.unicode = unicode(markup) return u = None @@ -1552,7 +1555,7 @@ class UnicodeDammit: u = self._convertFrom(proposed_encoding) if u: break self.unicode = u - if not u: self.originalEncoding = None + if not u: self.originalEncoding = None def _subMSChar(self, orig): """Changes a MS smart quote character to an XML or HTML @@ -1587,10 +1590,11 @@ class UnicodeDammit: self.markup = u self.originalEncoding = proposed except Exception, e: - # print "That didn't work!" - # print e + #print "That didn't work!" + #print e return None #print "Correct encoding: %s" % proposed + return self.markup def _toUnicode(self, data, encoding): @@ -1679,6 +1683,7 @@ class UnicodeDammit: 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding diff --git a/src/libprs500/ebooks/lrf/__init__.py b/src/libprs500/ebooks/lrf/__init__.py index 5e8108dcd9..247db16bf5 100644 --- a/src/libprs500/ebooks/lrf/__init__.py +++ b/src/libprs500/ebooks/lrf/__init__.py @@ -150,6 +150,8 @@ def option_parser(usage): prepro = parser.add_option_group('PREPROCESSING OPTIONS') prepro.add_option('--baen', action='store_true', default=False, dest='baen', help='''Preprocess Baen HTML files to improve generated LRF.''') + prepro.add_option('--pdftohtml', action='store_true', default=False, dest='pdftohtml', + help='''You must add this option if processing files generated by pdftohtml, otherwise conversion will fail.''') fonts = parser.add_option_group('FONT FAMILIES', diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 144830b175..3619bd02b7 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -223,8 +223,12 @@ class HTMLConverter(object): PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) # Fix elements - MARKUP_MASSAGE = [(re.compile(' '), lambda match : ' '), # Convert   into a normal space as the default conversion converts it into \xa0 which is not a space in LRF - (re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close tags + MARKUP_MASSAGE = [ + # Convert   into a normal space as the default + # conversion converts it into \xa0 which is not a space in LRF + (re.compile(' '), lambda match : ' '), + # Close tags + (re.compile("(<\s*[aA]\s+.*\/)\s*>"), lambda match: match.group(1)+">"), # Strip comments from