diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index 20527b059b..3da1aaf086 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -50,10 +50,10 @@ class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) - replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] + replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo'] patterns = [ re.compile('&'+i+';') for i in replaced_entities ] targets = [ unichr(name2codepoint[i]) for i in replaced_entities ] - ENTITY_RULES = zip(patterns, targets) + ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")] MARKUP_MASSAGE = [ diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py index fd5f9a4c8d..5523ac8b1a 100644 --- a/src/libprs500/ebooks/lrf/web/__init__.py +++ b/src/libprs500/ebooks/lrf/web/__init__.py @@ -16,14 +16,14 @@ import os, time, calendar, operator from libprs500 import iswindows -from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10): articles = {} for title, url in feeds: src = browser.open(url).read() articles[title] = [] - soup = BeautifulStoneSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES) + soup = BeautifulStoneSoup(src) for item in soup.findAll('item'): try: pubdate = item.find('pubdate').string