diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index fbee5d3170..2f7be1c38a 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -110,16 +110,16 @@ class HTMLConverter(object): (re.compile('
', re.IGNORECASE), lambda match : ' '), # Create header tags - (re.compile('(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile('<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile('<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile('<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), - (re.compile('(.*?)', re.IGNORECASE|re.DOTALL), + (re.compile('<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), # Blank lines - (re.compile('( ){4}', re.IGNORECASE), + (re.compile('<]*?>( ){4}', re.IGNORECASE), lambda match : '

'), ] @@ -229,22 +229,12 @@ class HTMLConverter(object): return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) - def start_on_file(self, path, is_root=True, link_level=0): - self.css = HTMLConverter.CSS.copy() - self.pseudo_css = self.override_pcss.copy() - self.css.update(self.override_css) - - path = os.path.abspath(path) - os.chdir(os.path.dirname(path)) - self.file_name = os.path.basename(path) - self.logger.info('Processing %s\n\tParsing HTML...', self.file_name) - sys.stdout.flush() + def preprocess(self, raw): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE) if self.baen: nmassage.extend(HTMLConverter.BAEN) - raw = open(self.file_name, 'rb').read() if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: @@ -263,7 +253,7 @@ class HTMLConverter(object): if not self.baen and self.is_baen(soup): self.baen = True self.logger.info('Baen file detected. Re-parsing...') - return self.start_on_file(path, is_root=is_root, link_level=link_level) + return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') if t: @@ -277,6 +267,21 @@ class HTMLConverter(object): dump.write(unicode(soup).encode('utf-8')) self.logger.info('Written preprocessed HTML to '+dump.name) dump.close() + + print soup + return soup + + def start_on_file(self, path, is_root=True, link_level=0): + self.css = HTMLConverter.CSS.copy() + self.pseudo_css = self.override_pcss.copy() + self.css.update(self.override_css) + + path = os.path.abspath(path) + os.chdir(os.path.dirname(path)) + self.file_name = os.path.basename(path) + self.logger.info('Processing %s\n\tParsing HTML...', self.file_name) + sys.stdout.flush() + soup = self.preprocess(open(self.file_name, 'rb').read()) self.logger.info('\tConverting to BBeB...') sys.stdout.flush() self.current_page = None