Fix #7938 (Stanza shows some authors as "catalog is empty")

This commit is contained in:
Kovid Goyal 2010-12-17 20:59:48 -07:00
parent 3cc953d621
commit b9f2346cba
2 changed files with 41 additions and 51 deletions

View File

@ -12,16 +12,6 @@ class JournalofHospitalMedicine(BasicNewsRecipe):
language = 'en'
no_stylesheets = True
#remove_tags_before = dict(name='div', attrs={'align':'center'})
#remove_tags_after = dict(name='ol', attrs={'compact':'COMPACT'})
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':'subContent'}),
dict(name='div', attrs={'id':['contentFrame']}),
#dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or author')"}),
#dict(name='table', attrs={'align':'RIGHT'}),
]
# TO LOGIN
@ -39,47 +29,47 @@ class JournalofHospitalMedicine(BasicNewsRecipe):
#TO GET ARTICLE TOC
def johm_get_index(self):
return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')
return self.index_to_soup('http://www3.interscience.wiley.com/journal/111081937/home')
# To parse article toc
def parse_index(self):
parse_soup = self.johm_get_index()
parse_soup = self.johm_get_index()
div = parse_soup.find(id='contentCell')
div = parse_soup.find(id='contentCell')
current_section = None
current_articles = []
feeds = []
for x in div.findAll(True):
if x.name == 'h4':
# Section heading found
if current_articles and current_section:
feeds.append((current_section, current_articles))
current_section = self.tag_to_string(x)
current_articles = []
self.log('\tFound section:', current_section)
if current_section is not None and x.name == 'strong':
title = self.tag_to_string(x)
p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
if p is None:
continue
url = p.get('href', False)
if not url or not title:
continue
if url.startswith('/'):
url = 'http://www3.interscience.wiley.com'+url
url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
#if url.startswith('/'):
#url = 'http://online.wsj.com'+url
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
current_section = None
current_articles = []
feeds = []
for x in div.findAll(True):
if x.name == 'h4':
# Section heading found
if current_articles and current_section:
feeds.append((current_section, current_articles))
current_section = self.tag_to_string(x)
current_articles = []
self.log('\tFound section:', current_section)
if current_section is not None and x.name == 'strong':
title = self.tag_to_string(x)
p = x.parent.parent.find('a', href=lambda x: x and '/HTMLSTART' in x)
if p is None:
continue
url = p.get('href', False)
if not url or not title:
continue
if url.startswith('/'):
url = 'http://www3.interscience.wiley.com'+url
url = url.replace('/HTMLSTART', '/main.html,ftx_abs')
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
#if url.startswith('/'):
#url = 'http://online.wsj.com'+url
current_articles.append({'title': title, 'url':url,
'description':'', 'date':''})
if current_articles and current_section:
feeds.append((current_section, current_articles))
if current_articles and current_section:
feeds.append((current_section, current_articles))
return feeds
return feeds
def preprocess_html(self, soup):
for img in soup.findAll('img', src=True):

View File

@ -19,7 +19,7 @@ from calibre.ebooks.metadata import fmt_sidx
from calibre.library.comments import comments_to_html
from calibre.library.server import custom_fields_to_display
from calibre.library.server.utils import format_tag_string, Offsets
from calibre import guess_type
from calibre import guess_type, prepare_string_for_xml as xml
from calibre.utils.icu import sort_key
from calibre.utils.ordered_dict import OrderedDict
@ -150,13 +150,13 @@ def ACQUISITION_ENTRY(item, version, db, updated, CFM, CKEYS, prefix):
extra.append(_('RATING: %s<br />')%rating)
tags = item[FM['tags']]
if tags:
extra.append(_('TAGS: %s<br />')%format_tag_string(tags, ',',
extra.append(_('TAGS: %s<br />')%xml(format_tag_string(tags, ',',
ignore_max=True,
no_tag_count=True))
no_tag_count=True)))
series = item[FM['series']]
if series:
extra.append(_('SERIES: %s [%s]<br />')%\
(series,
(xml(series),
fmt_sidx(float(item[FM['series_index']]))))
for key in CKEYS:
mi = db.get_metadata(item[CFM['id']['rec_index']], index_is_id=True)
@ -164,11 +164,11 @@ def ACQUISITION_ENTRY(item, version, db, updated, CFM, CKEYS, prefix):
if val:
datatype = CFM[key]['datatype']
if datatype == 'text' and CFM[key]['is_multiple']:
extra.append('%s: %s<br />'%(name, format_tag_string(val, ',',
extra.append('%s: %s<br />'%(xml(name), xml(format_tag_string(val, ',',
ignore_max=True,
no_tag_count=True)))
no_tag_count=True))))
else:
extra.append('%s: %s<br />'%(name, val))
extra.append('%s: %s<br />'%(xml(name), xml(unicode(val))))
comments = item[FM['comments']]
if comments:
comments = comments_to_html(comments)