EPUB Output: Strip <form> tags, since ADE (Adobe Digital Editions) cannot handle them. Fixes #2029 (IHT resetting P505)

This commit is contained in:
Kovid Goyal 2009-03-10 19:23:43 -07:00
parent a52286c594
commit 74486fc40d
4 changed files with 13 additions and 4 deletions

View File

@@ -197,6 +197,9 @@ class HTMLProcessor(Processor, Rationalizer):
if not tag.text and not tag.get('src', False): if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in self.root.xpath('//form'):
tag.getparent().remove(tag)
if self.opts.linearize_tables: if self.opts.linearize_tables:
for tag in self.root.xpath('//table | //tr | //th | //td'): for tag in self.root.xpath('//table | //tr | //th | //td'):
tag.tag = 'div' tag.tag = 'div'

View File

@@ -98,7 +98,7 @@ class Feed(object):
if len(self.articles) >= max_articles_per_feed: if len(self.articles) >= max_articles_per_feed:
break break
self.parse_article(item) self.parse_article(item)
def populate_from_preparsed_feed(self, title, articles, oldest_article=7, def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
max_articles_per_feed=100): max_articles_per_feed=100):
@@ -156,7 +156,6 @@ class Feed(object):
content = None content = None
if not link and not content: if not link and not content:
return return
article = Article(id, title, link, description, published, content) article = Article(id, title, link, description, published, content)
delta = datetime.utcnow() - article.utctime delta = datetime.utcnow() - article.utctime
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:

View File

@@ -1012,7 +1012,8 @@ class BasicNewsRecipe(object, LoggingInterface):
feed.description = unicode(err) feed.description = unicode(err)
parsed_feeds.append(feed) parsed_feeds.append(feed)
self.log_exception(msg) self.log_exception(msg)
return parsed_feeds return parsed_feeds
@classmethod @classmethod

View File

@@ -3,6 +3,7 @@ __copyright__ = '2008, Derry FitzGerald'
''' '''
iht.com iht.com
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
@@ -16,7 +17,12 @@ class InternationalHeraldTribune(BasicNewsRecipe):
max_articles_per_feed = 10 max_articles_per_feed = 10
no_stylesheets = True no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':'footer'})] remove_tags = [dict(name='div', attrs={'class':'footer'}),
dict(name=['form'])]
preprocess_regexps = [
(re.compile(r'<!-- webtrends.*', re.DOTALL),
lambda m:'</body></html>')
]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
feeds = [ feeds = [