From d5bd94840494bc203ae4614d224917a3f29e98ae Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 20 Aug 2007 00:50:55 +0000 Subject: [PATCH] Fix handling of &apos; --- src/libprs500/ebooks/lrf/html/convert_from.py | 4 ++-- src/libprs500/ebooks/lrf/web/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 20527b059b..3da1aaf086 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -50,10 +50,10 @@ class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] + replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo'] patterns = [ re.compile('&'+i+';') for i in replaced_entities ] targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] - ENTITY_RULES = zip(patterns, targets) + ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")] MARKUP_MASSAGE = [ diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py index fd5f9a4c8d..5523ac8b1a 100644 --- a/src/libprs500/ebooks/lrf/web/__init__.py +++ b/src/libprs500/ebooks/lrf/web/__init__.py @@ -16,14 +16,14 @@ import os, time, calendar, operator from libprs500 import iswindows -from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10): articles = {} for title, url in feeds: src = browser.open(url).read() articles[title] = [] - soup = BeautifulStoneSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES) + soup = 
BeautifulStoneSoup(src) for item in soup.findAll('item'): try: pubdate = item.find('pubdate').string