Make Book Designer (BD) file detection more accurate

This commit is contained in:
Kovid Goyal 2007-10-10 19:00:34 +00:00
parent d8b4bffb7e
commit 9e1498969d

View File

@ -229,12 +229,19 @@ class HTMLConverter(object):
return bool(soup.find('meta', attrs={'name':'Publisher', return bool(soup.find('meta', attrs={'name':'Publisher',
'content':re.compile('Baen', re.IGNORECASE)})) 'content':re.compile('Baen', re.IGNORECASE)}))
def is_book_designer(self, soup): def is_book_designer(self, raw):
return bool(soup.find('h2', attrs={'id':'BookTitle'})) return bool(re.search('<H2[^><]*id=BookTitle', raw))
def preprocess(self, raw): def preprocess(self, raw):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE) nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
if not self.book_designer and self.is_book_designer(raw):
self.book_designer = True
self.logger.info('\tBook Designer file detected.')
self.logger.info('\tParsing HTML...')
if self.baen: if self.baen:
nmassage.extend(HTMLConverter.BAEN) nmassage.extend(HTMLConverter.BAEN)
@ -255,11 +262,7 @@ class HTMLConverter(object):
if not self.baen and self.is_baen(soup): if not self.baen and self.is_baen(soup):
self.baen = True self.baen = True
self.logger.info('Baen file detected. Re-parsing...') self.logger.info('\tBaen file detected. Re-parsing...')
return self.preprocess(raw)
if not self.book_designer and self.is_book_designer(soup):
self.book_designer = True
self.logger.info('Book Designer file detected. Re-parsing...')
return self.preprocess(raw) return self.preprocess(raw)
if self.book_designer: if self.book_designer:
t = soup.find(id='BookTitle') t = soup.find(id='BookTitle')
@ -286,7 +289,7 @@ class HTMLConverter(object):
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path) self.file_name = os.path.basename(path)
self.logger.info('Processing %s\n\tParsing HTML...', self.file_name) self.logger.info('Processing %s', self.file_name)
sys.stdout.flush() sys.stdout.flush()
soup = self.preprocess(open(self.file_name, 'rb').read()) soup = self.preprocess(open(self.file_name, 'rb').read())
self.logger.info('\tConverting to BBeB...') self.logger.info('\tConverting to BBeB...')