From a174d251643b23d0e67c6faefdbb4dad2b23f830 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 28 Apr 2010 07:05:07 -0600 Subject: [PATCH] Updated Sueddeutsche Zeitung --- resources/recipes/sueddeutschezeitung.recipe | 64 +++++++++----------- src/calibre/ebooks/mobi/reader.py | 3 +- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/resources/recipes/sueddeutschezeitung.recipe b/resources/recipes/sueddeutschezeitung.recipe index 8b731e2c4f..48618fe996 100644 --- a/resources/recipes/sueddeutschezeitung.recipe +++ b/resources/recipes/sueddeutschezeitung.recipe @@ -5,9 +5,8 @@ __copyright__ = '2010, Darko Miletic ' www.sueddeutsche.de/sz/ ''' -import urllib -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe +from calibre import strftime class SueddeutcheZeitung(BasicNewsRecipe): title = 'Sueddeutche Zeitung' @@ -20,12 +19,13 @@ class SueddeutcheZeitung(BasicNewsRecipe): encoding = 'cp1252' needs_subscription = True remove_empty_feeds = True + delay = 2 PREFIX = 'http://www.sueddeutsche.de' - INDEX = PREFIX + strftime('/sz/%Y-%m-%d/') - LOGIN = PREFIX + '/app/lbox/index.html' + INDEX = PREFIX + '/app/epaper/textversion/' use_embedded_content = False - masthead_url = 'http://pix.sueddeutsche.de/img/g_.gif' + masthead_url = 'http://pix.sueddeutsche.de/img/layout/header/logo.gif' language = 'de' + publication_type = 'newspaper' extra_css = ' body{font-family: Arial,Helvetica,sans-serif} ' conversion_options = { @@ -40,49 +40,49 @@ class SueddeutcheZeitung(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser() - br.open(self.INDEX) if self.username is not None and self.password is not None: - data = urllib.urlencode({ 'login_name':self.username - ,'login_passwort':self.password - ,'lboxaction':'doLogin' - ,'passtxt':'Passwort' - ,'referer':self.INDEX - ,'x':'22' - ,'y':'7' - }) - br.open(self.LOGIN,data) + br.open(self.INDEX) + br.select_form(name='lbox') + br['login_name' ] = self.username + br['login_passwort'] = self.password + br.submit() return br remove_tags =[ dict(attrs={'class':'hidePrint'}) ,dict(name=['link','object','embed','base','iframe']) ] - remove_tags_before = dict(name='h2') - remove_tags_after = dict(attrs={'class':'author'}) + keep_only_tags = [dict(attrs={'class':'artikelBox'})] + remove_tags_before = dict(attrs={'class':'artikelTitel'}) + remove_tags_after = dict(attrs={'class':'author'}) feeds = [ - (u'Politik' , INDEX + 'politik/' ) - ,(u'Seite drei' , INDEX + 'seitedrei/' ) - ,(u'Meinungsseite', INDEX + 'meinungsseite/') - ,(u'Wissen' , INDEX + 'wissen/' ) - ,(u'Panorama' , INDEX + 'panorama/' ) - ,(u'Feuilleton' , INDEX + 'feuilleton/' ) - ,(u'Medien' , INDEX + 'medien/' ) - ,(u'Wirtschaft' , INDEX + 'wirtschaft/' ) - ,(u'Sport' , INDEX + 'sport/' ) - ,(u'Bayern' , INDEX + 'bayern/' ) - ,(u'Muenchen' , INDEX + 'muenchen/' ) - ,(u'jetzt.de' , INDEX + 'jetzt.de/' ) + (u'Politik' , INDEX + 'Politik/' ) + ,(u'Seite drei' , INDEX + 'Seite+drei/' ) + ,(u'Meinungsseite', INDEX + 'Meinungsseite/') + ,(u'Wissen' , INDEX + 'Wissen/' ) + ,(u'Panorama' , INDEX + 'Panorama/' ) + ,(u'Feuilleton' , INDEX + 'Feuilleton/' ) + ,(u'Medien' , INDEX + 'Medien/' ) + ,(u'Wirtschaft' , INDEX + 'Wirtschaft/' ) + ,(u'Sport' , INDEX + 'Sport/' ) + ,(u'Bayern' , INDEX + 'Bayern/' ) + ,(u'Muenchen' , INDEX + 'M%FCnchen/' ) ] def parse_index(self): + src = self.index_to_soup(self.INDEX) + id = '' + for itt in src.findAll('a',href=True): + if itt['href'].startswith('/app/epaper/textversion/inhalt/'): + id = itt['href'].rpartition('/inhalt/')[2] totalfeeds = [] lfeeds = self.get_feeds() for feedobj in lfeeds: feedtitle, feedurl = feedobj self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] - soup = self.index_to_soup(feedurl) + soup = self.index_to_soup(feedurl + id) tbl = soup.find(attrs={'class':'szprintd'}) for item in tbl.findAll(name='td',attrs={'class':'topthema'}): atag = item.find(attrs={'class':'Titel'}).a @@ -101,7 +101,3 @@ class SueddeutcheZeitung(BasicNewsRecipe): }) totalfeeds.append((feedtitle, articles)) return totalfeeds - - def print_version(self, url): - return url + 'print.html' - diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 4fff140ce0..c5f134c8d9 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -61,7 +61,8 @@ class EXTHHeader(object): # last update time pass elif id == 503: # Long title - if not title or title == _('Unknown') or 'USER_CONTENT' in title: + if not title or title == _('Unknown') or \ + 'USER_CONTENT' in title or title.startswith('dtp_'): try: title = content.decode(codec) except: