' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'
+
+ def get_hn_content(self, url):
+ self.log('get_hn_content(' + url + ')')
+ # this could be improved
+ br = self.get_browser()
+ f = br.open(url)
+ html = f.read()
+ f.close()
+ return html
+
+ def get_obfuscated_article(self, url):
+ if url.startswith('http://news.ycombinator.com'):
+ content = self.get_hn_content(url)
+ else:
+ # TODO: use content-type header instead of url
+ is_image = False
+ for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',]:
+ if url.endswith(ext):
+ is_image = True
+ break
+
+ if is_image:
+ self.log('using image_content (' + url + ')')
+ content = u'<html><body><img src="' + url + u'"></body></html>'
+ else:
+ content = self.get_readable_content(url)
+
+ self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+ self.temp_files[-1].write(content)
+ self.temp_files[-1].close()
+ return self.temp_files[-1].name
+
+ def is_link_wanted(self, url, tag):
+ if url.endswith('.pdf'):
+ return False
+ return True
+
+ def prettyify_url(self, url):
+ return urlparse(url).hostname
+
+ def populate_article_metadata(self, article, soup, first):
+ article.text_summary = self.prettyify_url(article.url)
+ article.summary = article.text_summary
+
+
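For orientation, a minimal sketch of the temp-file hand-off the recipe relies on (the HTML string below is invented, and src/ is assumed to be on the import path): get_obfuscated_article() writes whatever HTML it produced (HN comments, an image wrapper, or readability output) into a PersistentTemporaryFile and returns that file's path, so calibre downloads the local copy instead of the original URL.

    # Sketch only; mirrors the pattern in get_obfuscated_article() above.
    from calibre.ptempfile import PersistentTemporaryFile

    content = u'<html><body><p>extracted article</p></body></html>'  # invented content
    tf = PersistentTemporaryFile('_fa.html')  # same suffix the recipe uses
    tf.write(content)
    tf.close()
    print(tf.name)  # the path handed back to the downloader
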
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 1d513082f1..532bcbf206 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -29,6 +29,7 @@ from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
from calibre.utils.localization import canonicalize_lang
+from readability import readability
class LoginFailed(ValueError):
pass
@@ -515,7 +516,16 @@ class BasicNewsRecipe(Recipe):
entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
-
+ def extract_readable_article(self, html, base_url):
+ '''
+ Extracts the main article content from 'html', cleans it up and returns it as an (article_html, extracted_title) tuple.
+ Based on the original readability algorithm by Arc90.
+ '''
+ doc = readability.Document(html, url=base_url)
+ article_html = doc.summary()
+ extracted_title = doc.title()
+ return (article_html, extracted_title)
+
def sort_index_by(self, index, weights):
'''
Convenience method to sort the titles in `index` according to `weights`.
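
A quick illustration of the new helper's contract (the sample page and URL are invented; src/ is assumed to be importable): extract_readable_article() is a thin wrapper over the vendored readability.Document, whose summary() and title() supply the two halves of the returned tuple.

    # Sketch only; exercises readability.Document the way the wrapper above does.
    from readability import readability

    html = ('<html><head><title>Sample page</title></head><body><div id="main">'
            '<p>' + 'Plenty of readable paragraph text here. ' * 20 + '</p>'
            '</div></body></html>')
    doc = readability.Document(html, url='http://example.com/article')
    article_html = doc.summary()    # cleaned-up main content markup
    extracted_title = doc.title()   # 'Sample page'
    print(extracted_title)
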
diff --git a/src/readability/__init__.py b/src/readability/__init__.py
new file mode 100644
index 0000000000..8822a51299
--- /dev/null
+++ b/src/readability/__init__.py
@@ -0,0 +1 @@
+from .readability import Document
diff --git a/src/readability/cleaners.py b/src/readability/cleaners.py
new file mode 100644
index 0000000000..9b158c5248
--- /dev/null
+++ b/src/readability/cleaners.py
@@ -0,0 +1,32 @@
+# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+import re
+from lxml.html.clean import Cleaner
+
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+single_quoted = "'[^']+'"
+double_quoted = '"[^"]+"'
+non_space = '[^ "\'>]+'
+htmlstrip = re.compile("<" # open
+ "([^>]+) " # prefix
+ "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
+ '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+ "([^>]*)" # postfix
+ ">" # end
+, re.I)
+
+def clean_attributes(html):
+ while htmlstrip.search(html):
+ html = htmlstrip.sub('<\\1\\2>', html)
+ return html
+
+def normalize_spaces(s):
+ if not s: return ''
+ """replace any sequence of whitespace
+ characters with a single space"""
+ return ' '.join(s.split())
+
+html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
+ style=True, links=True, meta=False, add_nofollow=False,
+ page_structure=False, processing_instructions=True, embedded=False,
+ frames=False, forms=False, annoying_tags=False, remove_tags=None,
+ remove_unknown_tags=False, safe_attrs_only=False)
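
A small invented example of what the cleaner above removes: only attributes matching bad_attrs are stripped, while the tags and any other attributes survive.

    # Sketch only; the markup is made up.
    from readability.cleaners import clean_attributes, normalize_spaces

    dirty = '<p style="color:red" width="10" class="note">hello   world</p>'
    print(clean_attributes(dirty))            # <p class="note">hello   world</p>
    print(normalize_spaces('hello   world'))  # 'hello world'
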
diff --git a/src/readability/debug.py b/src/readability/debug.py
new file mode 100644
index 0000000000..a5e644d8cc
--- /dev/null
+++ b/src/readability/debug.py
@@ -0,0 +1,25 @@
+def save_to_file(text, filename):
+ f = open(filename, 'wt')
+ f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+ f.write(text.encode('utf-8'))
+ f.close()
+
+uids = {}
+def describe(node, depth=2):
+ if not hasattr(node, 'tag'):
+ return "[%s]" % type(node)
+ name = node.tag
+ if node.get('id', ''): name += '#'+node.get('id')
+ if node.get('class', ''):
+ name += '.' + node.get('class').replace(' ','.')
+ if name[:4] in ['div#', 'div.']:
+ name = name[3:]
+ if name in ['tr', 'td', 'div', 'p']:
+ if not node in uids:
+ uid = uids[node] = len(uids)+1
+ else:
+ uid = uids.get(node)
+ name += "%02d" % (uid)
+ if depth and node.getparent() is not None:
+ return name+' - '+describe(node.getparent(), depth-1)
+ return name
diff --git a/src/readability/encoding.py b/src/readability/encoding.py
new file mode 100644
index 0000000000..d05b7f44a0
--- /dev/null
+++ b/src/readability/encoding.py
@@ -0,0 +1,21 @@
+import re
+import chardet
+
+def get_encoding(page):
+ text = re.sub('</?[^>]*>\s*', ' ', page)
+ enc = 'utf-8'
+ if not text.strip() or len(text) < 10:
+ return enc # can't guess
+ try:
+ diff = text.decode(enc, 'ignore').encode(enc)
+ sizes = len(diff), len(text)
+ if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+ return enc
+ except UnicodeDecodeError:
+ pass
+ res = chardet.detect(text)
+ enc = res['encoding']
+ #print '->', enc, "%.2f" % res['confidence']
+ if enc == 'MacCyrillic':
+ enc = 'cp1251'
+ return enc
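
A hedged sketch of the heuristic above on two invented inputs: clean ASCII/UTF-8 text survives the decode/re-encode round trip, while a legacy-encoded page falls through to chardet.

    # Sketch only; both byte strings are invented.
    from readability.encoding import get_encoding

    utf8_page = '<html><body>' + 'plain ascii text, nothing fancy ' * 4 + '</body></html>'
    print(get_encoding(utf8_page))    # 'utf-8'

    cyrillic = u'\u043f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440 '.encode('cp1251')
    legacy_page = '<html><body>' + cyrillic * 20 + '</body></html>'
    print(get_encoding(legacy_page))  # chardet's guess, e.g. 'windows-1251'
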
diff --git a/src/readability/htmls.py b/src/readability/htmls.py
new file mode 100644
index 0000000000..97aa55b787
--- /dev/null
+++ b/src/readability/htmls.py
@@ -0,0 +1,115 @@
+from cleaners import normalize_spaces, clean_attributes
+from encoding import get_encoding
+from lxml.html import tostring
+import logging
+import lxml.html
+import re
+
+logging.getLogger().setLevel(logging.DEBUG)
+
+utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+def build_doc(page):
+ if isinstance(page, unicode):
+ page_unicode = page
+ else:
+ enc = get_encoding(page)
+ page_unicode = page.decode(enc, 'replace')
+ doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+ return doc
+
+def js_re(src, pattern, flags, repl):
+ return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+
+
+def normalize_entities(cur_title):
+ entities = {
+ u'\u2014':'-',
+ u'\u2013':'-',
+ u'&mdash;': '-',
+ u'&ndash;': '-',
+ u'\u00A0': ' ',
+ u'\u00AB': '"',
+ u'\u00BB': '"',
+ u'&quot;': '"',
+ }
+ for c, r in entities.iteritems():
+ if c in cur_title:
+ cur_title = cur_title.replace(c, r)
+
+ return cur_title
+
+def norm_title(title):
+ return normalize_entities(normalize_spaces(title))
+
+def get_title(doc):
+ title = doc.find('.//title').text
+ if not title:
+ return '[no-title]'
+
+ return norm_title(title)
+
+def add_match(collection, text, orig):
+ text = norm_title(text)
+ if len(text.split()) >= 2 and len(text) >= 15:
+ if text.replace('"', '') in orig.replace('"', ''):
+ collection.add(text)
+
+def shorten_title(doc):
+ title = doc.find('.//title').text
+ if not title:
+ return ''
+
+ title = orig = norm_title(title)
+
+ candidates = set()
+
+ for item in ['.//h1', './/h2', './/h3']:
+ for e in list(doc.iterfind(item)):
+ if e.text:
+ add_match(candidates, e.text, orig)
+ if e.text_content():
+ add_match(candidates, e.text_content(), orig)
+
+ for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+ for e in doc.cssselect(item):
+ if e.text:
+ add_match(candidates, e.text, orig)
+ if e.text_content():
+ add_match(candidates, e.text_content(), orig)
+
+ if candidates:
+ title = sorted(candidates, key=len)[-1]
+ else:
+ for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+ if delimiter in title:
+ parts = orig.split(delimiter)
+ if len(parts[0].split()) >= 4:
+ title = parts[0]
+ break
+ elif len(parts[-1].split()) >= 4:
+ title = parts[-1]
+ break
+ else:
+ if ': ' in title:
+ parts = orig.split(': ')
+ if len(parts[-1].split()) >= 4:
+ title = parts[-1]
+ else:
+ title = orig.split(': ', 1)[1]
+
+ if not 15 < len(title) < 150:
+ return orig
+
+ return title
+
+def get_body(doc):
+ [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+ raw_html = unicode(tostring(doc.body or doc))
+ cleaned = clean_attributes(raw_html)
+ try:
+ #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+ return cleaned
+ except Exception: #FIXME find the equivalent lxml error
+ logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+ return raw_html
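
To make the title heuristics above concrete, a small invented example: the <h1> overlaps the <title>, so shorten_title() keeps the heading and drops the site-name suffix.

    # Sketch only; markup is invented.
    from readability.htmls import build_doc, get_title, shorten_title, get_body

    page = ('<html><head><title>A Fairly Long Article Title Here | Example Site</title>'
            '</head><body><h1>A Fairly Long Article Title Here</h1>'
            '<p>Body text.</p></body></html>')
    doc = build_doc(page)
    print(get_title(doc))      # the full, normalized <title> text
    print(shorten_title(doc))  # 'A Fairly Long Article Title Here'
    print(get_body(doc))       # body markup with scripts/styles dropped and attributes cleaned
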
diff --git a/src/readability/readability.py b/src/readability/readability.py
new file mode 100644
index 0000000000..e4991f3a41
--- /dev/null
+++ b/src/readability/readability.py
@@ -0,0 +1,508 @@
+#!/usr/bin/env python
+from cleaners import html_cleaner, clean_attributes
+from collections import defaultdict
+from htmls import build_doc, get_body, get_title, shorten_title
+from lxml.etree import tostring, tounicode
+from lxml.html import fragment_fromstring, document_fromstring
+import logging
+import re
+import sys
+
+logging.basicConfig(level=logging.INFO)
+
+REGEXES = {
+ 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
+ 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
+ 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
+ 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
+ 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
+ #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+ #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
+ #'trimRe': re.compile('^\s+|\s+$/'),
+ #'normalizeRe': re.compile('\s{2,}/'),
+ #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+ #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
+ #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+}
+
+def describe(node, depth=1):
+ if not hasattr(node, 'tag'):
+ return "[%s]" % type(node)
+ name = node.tag
+ if node.get('id', ''): name += '#'+node.get('id')
+ if node.get('class', ''):
+ name += '.' + node.get('class').replace(' ','.')
+ if name[:4] in ['div#', 'div.']:
+ name = name[3:]
+ if depth and node.getparent() is not None:
+ return name+' - '+describe(node.getparent(), depth-1)
+ return name
+
+def to_int(x):
+ if not x: return None
+ x = x.strip()
+ if x.endswith('px'):
+ return int(x[:-2])
+ if x.endswith('em'):
+ return int(x[:-2]) * 12
+ return int(x)
+
+def clean(text):
+ text = re.sub('\s*\n\s*', '\n', text)
+ text = re.sub('[ \t]{2,}', ' ', text)
+ return text.strip()
+
+def text_length(i):
+ return len(clean(i.text_content() or ""))
+
+class Unparseable(ValueError):
+ pass
+
+class Document:
+ TEXT_LENGTH_THRESHOLD = 25
+ RETRY_LENGTH = 250
+
+ def __init__(self, input, **options):
+ self.input = input
+ self.options = defaultdict(lambda: None)
+ for k, v in options.items():
+ self.options[k] = v
+ self.html = None
+
+ def _html(self, force=False):
+ if force or self.html is None:
+ self.html = self._parse(self.input)
+ return self.html
+
+ def _parse(self, input):
+ doc = build_doc(input)
+ doc = html_cleaner.clean_html(doc)
+ base_href = self.options['url']
+ if base_href:
+ doc.make_links_absolute(base_href, resolve_base_href=True)
+ else:
+ doc.resolve_base_href()
+ return doc
+
+ def content(self):
+ return get_body(self._html(True))
+
+ def title(self):
+ return get_title(self._html(True))
+
+ def short_title(self):
+ return shorten_title(self._html(True))
+
+ def summary(self):
+ try:
+ ruthless = True
+ while True:
+ self._html(True)
+
+ for i in self.tags(self.html, 'script', 'style'):
+ i.drop_tree()
+ for i in self.tags(self.html, 'body'):
+ i.set('id', 'readabilityBody')
+ if ruthless:
+ self.remove_unlikely_candidates()
+ self.transform_misused_divs_into_paragraphs()
+ candidates = self.score_paragraphs()
+
+ best_candidate = self.select_best_candidate(candidates)
+ if best_candidate:
+ article = self.get_article(candidates, best_candidate)
+ else:
+ if ruthless:
+ logging.debug("ruthless removal did not work. ")
+ ruthless = False
+ self.debug("ended up stripping too much - going for a safer _parse")
+ # try again
+ continue
+ else:
+ logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+ article = self.html.find('body')
+ if article is None:
+ article = self.html
+
+ cleaned_article = self.sanitize(article, candidates)
+ of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
+ if ruthless and not of_acceptable_length:
+ ruthless = False
+ continue # try again
+ else:
+ return cleaned_article
+ except StandardError, e:
+ #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
+ logging.exception('error getting summary: ' )
+ raise Unparseable(str(e)), None, sys.exc_info()[2]
+
+ def get_article(self, candidates, best_candidate):
+ # Now that we have the top candidate, look through its siblings for content that might also be related.
+ # Things like preambles, content split by ads that we removed, etc.
+
+ sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
+ output = document_fromstring('<div/>')
+ best_elem = best_candidate['elem']
+ for sibling in best_elem.getparent().getchildren():
+ #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+ append = False
+ if sibling is best_elem:
+ append = True
+ sibling_key = sibling #HashableElement(sibling)
+ if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+ append = True
+
+ if sibling.tag == "p":
+ link_density = self.get_link_density(sibling)
+ node_content = sibling.text or ""
+ node_length = len(node_content)
+
+ if node_length > 80 and link_density < 0.25:
+ append = True
+ elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
+ append = True
+
+ if append:
+ output.append(sibling)
+ #if output is not None:
+ # output.append(best_elem)
+ return output
+
+ def select_best_candidate(self, candidates):
+ sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+ for candidate in sorted_candidates[:5]:
+ elem = candidate['elem']
+ self.debug("Top 5 : %6.3f %s" % (candidate['content_score'], describe(elem)))
+
+ if len(sorted_candidates) == 0:
+ return None
+
+ best_candidate = sorted_candidates[0]
+ return best_candidate
+
+
+ def get_link_density(self, elem):
+ link_length = 0
+ for i in elem.findall(".//a"):
+ link_length += text_length(i)
+ #if len(elem.findall(".//div") or elem.findall(".//p")):
+ # link_length = link_length
+ total_length = text_length(elem)
+ return float(link_length) / max(total_length, 1)
+
+ def score_paragraphs(self, ):
+ MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+ candidates = {}
+ #self.debug(str([describe(node) for node in self.tags(self.html, "div")]))
+
+ ordered = []
+ for elem in self.tags(self.html, "p", "pre", "td"):
+ parent_node = elem.getparent()
+ if parent_node is None:
+ continue
+ grand_parent_node = parent_node.getparent()
+
+ inner_text = clean(elem.text_content() or "")
+ inner_text_len = len(inner_text)
+
+ # If this paragraph is less than 25 characters, don't even count it.
+ if inner_text_len < MIN_LEN:
+ continue
+
+ if parent_node not in candidates:
+ candidates[parent_node] = self.score_node(parent_node)
+ ordered.append(parent_node)
+
+ if grand_parent_node is not None and grand_parent_node not in candidates:
+ candidates[grand_parent_node] = self.score_node(grand_parent_node)
+ ordered.append(grand_parent_node)
+
+ content_score = 1
+ content_score += len(inner_text.split(','))
+ content_score += min((inner_text_len / 100), 3)
+ #if elem not in candidates:
+ # candidates[elem] = self.score_node(elem)
+
+ #WTF? candidates[elem]['content_score'] += content_score
+ candidates[parent_node]['content_score'] += content_score
+ if grand_parent_node is not None:
+ candidates[grand_parent_node]['content_score'] += content_score / 2.0
+
+ # Scale the final candidates score based on link density. Good content should have a
+ # relatively small link density (5% or less) and be mostly unaffected by this operation.
+ for elem in ordered:
+ candidate = candidates[elem]
+ ld = self.get_link_density(elem)
+ score = candidate['content_score']
+ self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+ candidate['content_score'] *= (1 - ld)
+
+ return candidates
+
+ def class_weight(self, e):
+ weight = 0
+ if e.get('class', None):
+ if REGEXES['negativeRe'].search(e.get('class')):
+ weight -= 25
+
+ if REGEXES['positiveRe'].search(e.get('class')):
+ weight += 25
+
+ if e.get('id', None):
+ if REGEXES['negativeRe'].search(e.get('id')):
+ weight -= 25
+
+ if REGEXES['positiveRe'].search(e.get('id')):
+ weight += 25
+
+ return weight
+
+ def score_node(self, elem):
+ content_score = self.class_weight(elem)
+ name = elem.tag.lower()
+ if name == "div":
+ content_score += 5
+ elif name in ["pre", "td", "blockquote"]:
+ content_score += 3
+ elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
+ content_score -= 3
+ elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
+ content_score -= 5
+ return {
+ 'content_score': content_score,
+ 'elem': elem
+ }
+
+ def debug(self, *a):
+ #if self.options['debug']:
+ logging.debug(*a)
+
+ def remove_unlikely_candidates(self):
+ for elem in self.html.iter():
+ s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
+ #self.debug(s)
+ if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
+ self.debug("Removing unlikely candidate - %s" % describe(elem))
+ elem.drop_tree()
+
+ def transform_misused_divs_into_paragraphs(self):
+ for elem in self.tags(self.html, 'div'):
+ # transform <div>s that do not contain other block elements into <p>s
+ if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+ #self.debug("Altering %s to p" % (describe(elem)))
+ elem.tag = "p"
+ #print "Fixed element "+describe(elem)
+
+ for elem in self.tags(self.html, 'div'):
+ if elem.text and elem.text.strip():
+ p = fragment_fromstring('<p/>')
+ p.text = elem.text
+ elem.text = None
+ elem.insert(0, p)
+ #print "Appended "+tounicode(p)+" to "+describe(elem)
+
+ for pos, child in reversed(list(enumerate(elem))):
+ if child.tail and child.tail.strip():
+ p = fragment_fromstring('<p/>')
+ p.text = child.tail
+ child.tail = None
+ elem.insert(pos + 1, p)
+ #print "Inserted "+tounicode(p)+" to "+describe(elem)
+ if child.tag == 'br':
+ #print 'Dropped at '+describe(elem)
+ child.drop_tree()
+
+ def tags(self, node, *tag_names):
+ for tag_name in tag_names:
+ for e in node.findall('.//%s' % tag_name):
+ yield e
+
+ def reverse_tags(self, node, *tag_names):
+ for tag_name in tag_names:
+ for e in reversed(node.findall('.//%s' % tag_name)):
+ yield e
+
+ def sanitize(self, node, candidates):
+ MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
+ for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
+ if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+ header.drop_tree()
+
+ for elem in self.tags(node, "form", "iframe", "textarea"):
+ elem.drop_tree()
+ allowed = {}
+ # Conditionally clean <table>s, <ul>s, and <div>s
+ for el in self.reverse_tags(node, "table", "ul", "div"):
+ if el in allowed:
+ continue
+ weight = self.class_weight(el)
+ if el in candidates:
+ content_score = candidates[el]['content_score']
+ #print '!',el, '-> %6.3f' % content_score
+ else:
+ content_score = 0
+ tag = el.tag
+
+ if weight + content_score < 0:
+ self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+ (describe(el), content_score, weight, ))
+ el.drop_tree()
+ elif el.text_content().count(",") < 10:
+ counts = {}
+ for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
+ counts[kind] = len(el.findall('.//%s' %kind))
+ counts["li"] -= 100
+
+ content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+ link_density = self.get_link_density(el)
+ parent_node = el.getparent()
+ if parent_node is not None:
+ if parent_node in candidates:
+ content_score = candidates[parent_node]['content_score']
+ else:
+ content_score = 0
+ #if parent_node is not None:
+ #pweight = self.class_weight(parent_node) + content_score
+ #pname = describe(parent_node)
+ #else:
+ #pweight = 0
+ #pname = "no parent"
+ to_remove = False
+ reason = ""
+
+ #if el.tag == 'div' and counts["img"] >= 1:
+ # continue
+ if counts["p"] and counts["img"] > counts["p"]:
+ reason = "too many images (%s)" % counts["img"]
+ to_remove = True
+ elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+ reason = "more