diff --git a/resources/recipes/daily_telegraph.recipe b/resources/recipes/daily_telegraph.recipe index 61054e1db0..5e1a2f7bfb 100644 --- a/resources/recipes/daily_telegraph.recipe +++ b/resources/recipes/daily_telegraph.recipe @@ -6,43 +6,66 @@ __docformat__ = 'restructuredtext en' ''' http://www.news.com.au/dailytelegraph/ ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class DailyTelegraph(BasicNewsRecipe): title = u'Daily Telegraph' - __author__ = u'AprilHare' + __author__ = u'Adrian G.' language = 'en_AU' - description = u'News from down under' - oldest_article = 2 - max_articles_per_feed = 10 - remove_tags_before = dict(name='div', attrs={'class':'article-title'}) - remove_tags = [dict(attrs={'class':['article-source', 'article-tools']})] - remove_tags_after = dict(attrs={'class':re.compile('share-article')}) - + description = u'Daily Telegraph News' + oldest_article = 5 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + no_javascript = True + + + timefmt = ' [%A, %d %B, %Y]' + encoding = 'utf-8' + + keep_only_tags = [dict(name='div', attrs ={'id':'story'})] + + extra_css = ''' + h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;} + .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;} + .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;} + .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;} + .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;} + #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;} + #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;} + .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;} + h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;} + h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;} + h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;} + body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} + ''' + + remove_tags = [ + dict(name='div', attrs ={'id':['comments','story-related-coverage']}), + dict(name='div', attrs ={'class':['story-header-tools','story-footer','story-extras','story-related']}), + dict(name='div', attrs ={'class':['promo-image','story-extras story-extras-2']}), + dict(name='div', attrs ={'class':['assistive sidebar-jump']}) + ] + feeds = [ - (u'Top Stories', u'http://feeds.news.com.au/public/rss/2.0/dtele_top_stories_253.xml'), - (u'National News', u'http://feeds.news.com.au/public/rss/2.0/dtele_national_news_202.xml'), - (u'World News', u'http://feeds.news.com.au/public/rss/2.0/dtele_world_news_204.xml'), - (u'NSW and ACT', u'http://feeds.news.com.au/public/rss/2.0/dtele_nswact_225.xml'), - (u'Arts', u'http://feeds.news.com.au/public/rss/2.0/dtele_art_444.xml'), - (u'Business News', u'http://feeds.news.com.au/public/rss/2.0/dtele_business_226.xml'), - (u'Entertainment News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_news_201.xml'), - (u'Lifestyle News', u'http://feeds.news.com.au/public/rss/2.0/dtele_lifestyle_227.xml'), - (u'Music', u'http://feeds.news.com.au/public/rss/2.0/dtele_music_441.xml'), - (u'Property Confidential', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_confidential_463.xml'), - (u'Property - Your Space', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_yourspace_462.xml'), - (u'Confidential News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_confidential_252.xml'), - (u'Confidential Biographies', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_biographies_491.xml'), - (u'Confidential Galleries', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_galleries_483.xml'), - (u'Confidential In-depth', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_indepth_490.xml'), - (u'Confidential ShowBuzz', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_showbuzz_485.xml'), - (u'Sport', u'http://feeds.news.com.au/public/rss/2.0/dtele_sport_203.xml'), - (u'AFL', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_afl_341.xml'), - (u'Cricket', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_cricket_343.xml'), - (u'Horse Racing', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_horseracing_686.xml'), - (u'NRL', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_nrl_345.xml'), - (u'Rugby Union', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_rugby_union_342.xml'), - (u'Soccer', u'http://feeds.news.com.au/public/rss/2.0/dtele_sports_soccer_344.xml') + (u'Top Stories', u'http://feeds.news.com.au/public/rss/2.0/dtele_top_stories_253.xml'), + (u'National News', u'http://feeds.news.com.au/public/rss/2.0/dtele_national_news_202.xml'), + (u'World News', u'http://feeds.news.com.au/public/rss/2.0/dtele_world_news_204.xml'), + (u'NSW and ACT', u'http://feeds.news.com.au/public/rss/2.0/dtele_nswact_225.xml'), + (u'Arts', u'http://feeds.news.com.au/public/rss/2.0/dtele_art_444.xml'), + (u'Business News', u'http://feeds.news.com.au/public/rss/2.0/dtele_business_226.xml'), + (u'Entertainment News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_news_201.xml'), + (u'Lifestyle News', u'http://feeds.news.com.au/public/rss/2.0/dtele_lifestyle_227.xml'), + (u'Music', u'http://feeds.news.com.au/public/rss/2.0/dtele_music_441.xml'), + (u'Property Confidential', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_confidential_463.xml'), + (u'Property - Your Space', u'http://feeds.news.com.au/public/rss/2.0/dtele_property_yourspace_462.xml'), + (u'Confidential News', u'http://feeds.news.com.au/public/rss/2.0/dtele_entertainment_confidential_252.xml'), + (u'Confidential Biographies', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_biographies_491.xml'), + (u'Confidential Galleries', u'http://feeds.news.com.au/public/rss/2.0/dtele_confidential_galleries_483.xml'), ] + + diff --git a/resources/recipes/elsevier.recipe b/resources/recipes/elsevier.recipe index 389ce3f74d..299b886736 100644 --- a/resources/recipes/elsevier.recipe +++ b/resources/recipes/elsevier.recipe @@ -9,16 +9,16 @@ from calibre.web.feeds.news import BasicNewsRecipe class Pagina12(BasicNewsRecipe): title = 'Elsevier.nl' __author__ = 'Darko Miletic' - description = 'News from Denmark' + description = 'News from Holland' publisher = 'elsevier.nl' - category = 'news, politics, Denmark' + category = 'news, politics, Holland' oldest_article = 2 max_articles_per_feed = 200 no_stylesheets = True encoding = 'utf-8' use_embedded_content = False language = 'nl' - country = 'NL' + country = 'NL' remove_empty_feeds = True masthead_url = 'http://www.elsevier.nl/static/elsevier/stdimg/logo.gif' extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} ' @@ -29,7 +29,7 @@ class Pagina12(BasicNewsRecipe): , 'publisher' : publisher , 'language' : language } - + keep_only_tags = dict(attrs={'id':'artikel_container'}) remove_tags_before = dict(attrs={'id':'breadcrumb_container'}) remove_tags_after = dict(attrs={'class':'author_link'}) @@ -50,7 +50,7 @@ class Pagina12(BasicNewsRecipe): ,(u'Cultuur & Televisie' , u'http://www.elsevier.nl/web/RSS/Cultuur-Televisie-RSS.htm?output=xml') ,(u'Society' , u'http://www.elsevier.nl/web/RSS/Society-RSS.htm?output=xml' ) ,(u'Internet&/Gadgets' , u'http://www.elsevier.nl/web/RSS/Internet-Gadgets-RSS.htm?output=xml' ) - ,(u'Comentaren' , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml' ) + ,(u'Comentaren' , u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml' ) ] def print_version(self, url): @@ -58,8 +58,8 @@ class Pagina12(BasicNewsRecipe): def get_article_url(self, article): return article.get('guid', None).rpartition('?')[0] - + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] - return soup + return soup \ No newline at end of file diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index d389ca4eea..c126902899 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -4,65 +4,136 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com +V5 - One picture per article, moved to top: +Headline +Image +Byline +Story ''' -import string, re, time +import re, string, time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup - -def decode(self, src): - enc = 'utf-8' - if 'iso-8859-1' in src: - enc = 'cp1252' - return src.decode(enc, 'ignore') +from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, NavigableString, Tag class NYTimes(BasicNewsRecipe): - title = 'The New York Times (subscription)' - __author__ = 'Kovid Goyal' + title = 'The New York Times' + __author__ = 'GRiker' language = 'en' - requires_version = (0, 6, 36) description = 'Daily news from the New York Times (subscription version)' - timefmt = ' [%a, %b %d, %Y]' + allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials', + 'New York','Business Day','Science Times','Sports','Dining','Arts', + 'Home','Styles','Sunday Business','Week In Review','Travel','Magazine', + 'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion", + "T Women's Fashion"] + + # List of sections to exclude + # To add a section, copy the section name from the allSectionKeywords list above + # For example, to exclude 'Dining' and 'Weddings': + # excludeSectionKeywords = ['Dining','Weddings'] + excludeSectionKeywords = [] + + # List of sections to include (test and debug only) + # By default, any sections in today's paper that are not listed in excludeSectionKeywords + # are downloaded. fetch_only specifies that only certain sections are to be downloaded. + # This should only be used for testing and debugging. + # For example, to download only 'The Front Page' section: + # fetch_only = set(['The Front Page']) + fetch_only = set([]) + if fetch_only: + excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only) + + # one_picture_per_article specifies that calibre should only use the first image + # from an article (if one exists). If one_picture_per_article = True, the image + # will be moved to a location between the headline and the byline. + # If one_picture_per_article = False, all images from the article will be included + # and shown in their original location. + one_picture_per_article = True + + timefmt = '' needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), - dict(id=['footer', 'toolsRight', 'articleInline', - 'navigation', 'archive', 'side_search', 'blog_sidebar', - 'side_tool', 'side_index', 'login', 'businessSearchBar', - 'adxLeaderboard', - 'relatedArticles', 'relatedTopics', 'adxSponLink']), + remove_tags = [dict(attrs={'class':[ + 'articleTools', + 'columnGroup doubleRule', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'icon enlargeThis', + 'leftNavTabs', + 'module box nav', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 'singleAd', + ]}), + dict(id=[ + 'adxLeaderboard', + 'archive', + 'articleExtras', + 'articleInline', + 'blog_sidebar', + 'cCol', + 'entertainmentSearchBar', + 'footer', + 'header', + 'header_search', + 'login', + 'masthead', + 'memberTools', + 'navigation', + 'portfolioInline', + 'relatedArticles', + 'side_search', + 'side_index', + 'side_tool', + 'toolsRight', + ]), dict(name=['script', 'noscript', 'style'])] - encoding = decode + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' no_stylesheets = True - extra_css = 'h1 {font-face:sans-serif; font-size:2em; font-weight:bold;}\n.byline {font:monospace;}\n.bold {font-weight:bold;}' + extra_css = '.headline {text-align: left;}\n \ + .byline {font-family: monospace; \ + text-align: left; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .dateline {font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .timestamp {font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .source {text-align: left;}\n \ + .image {text-align: center;}\n \ + .credit {text-align: right; \ + font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .articleBody {text-align: left;}\n \ + .authorId {text-align: left; \ + font-style: italic;}\n ' def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - raw = br.submit().read() - if 'Sorry, we could not find the combination you entered. Please try again.' in raw: - raise Exception('Your username and password are incorrect') - #open('/t/log.html', 'wb').write(raw) + try: + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Sorry, we could not find the combination you entered. Please try again.' in raw: + raise Exception('Your username and password are incorrect') + #open('/t/log.html', 'wb').write(raw) + except: + self.log("\nFailed to login") + return br - def get_masthead_url(self): - masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - br = BasicNewsRecipe.get_browser() - try: - br.open(masthead) - except: - self.log("\nCover unavailable") - masthead = None - return masthead - - def get_cover_url(self): cover = None st = time.localtime() @@ -78,13 +149,66 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def short_title(self): - return 'NY Times' + def get_masthead_title(self): + return 'NYTimes GR Version' + + def dump_ans(self, ans): + total_article_count = 0 + for section in ans : + if self.verbose: + self.log("section %s: %d articles" % (section[0], len(section[1])) ) + for article in section[1]: + total_article_count += 1 + if self.verbose: + self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'), + article['url'].encode('mac-roman','replace'))) + self.log( "Queued %d articles" % total_article_count ) + + def dump_hex(self, src, length=16): + ''' Diagnostic ''' + FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + N=0; result='' + while src: + s,src = src[:length],src[length:] + hexa = ' '.join(["%02X"%ord(x) for x in s]) + s = s.translate(FILTER) + result += "%04X %-*s %s\n" % (N, length*3, hexa, s) + N+=length + print result + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + + return fixed + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description def parse_index(self): - self.encoding = 'cp1252' soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - self.encoding = decode def feed_title(div): return ''.join(div.findAll(text=True, recursive=False)).strip() @@ -92,18 +216,13 @@ class NYTimes(BasicNewsRecipe): articles = {} key = None ans = [] - #allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials', - #'New York','Business Day','Sports','Dining','Arts','Home','Styles'] - excludeSectionKeywords = ['Dining','Styles'] - - # Find each instance of class="section-headline", class="story", class="story headline" for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline']}): if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) - excluded = re.compile('|'.join(excludeSectionKeywords)) + excluded = re.compile('|'.join(self.excludeSectionKeywords)) if excluded.search(key): self.log("Skipping section %s" % key) continue @@ -117,13 +236,14 @@ class NYTimes(BasicNewsRecipe): continue url = re.sub(r'\?.*', '', a['href']) url += '?pagewanted=all' - title = self.tag_to_string(a, use_alt=True).strip() + + title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip()) description = '' pubdate = strftime('%a, %d %b') summary = div.find(True, attrs={'class':'summary'}) if summary: - description = self.tag_to_string(summary, use_alt=False) + description = self.massageNCXText(self.tag_to_string(summary, use_alt=False)) author = '' authorAttribution = div.find(True, attrs={'class':'storyheadline-author'}) @@ -133,6 +253,8 @@ class NYTimes(BasicNewsRecipe): authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) + # Kill commas - Kindle switches to '&' + author = re.sub(',','',author) feed = key if key is not None else 'Uncategorized' if not articles.has_key(feed): @@ -146,13 +268,208 @@ class NYTimes(BasicNewsRecipe): 'Dining In, Dining Out':1, 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - + self.dump_ans(ans) return ans def preprocess_html(self, soup): + ''' refresh = soup.find('meta', {'http-equiv':'refresh'}) if refresh is None: return soup content = refresh.get('content').partition('=')[2] - raw = self.browser.open_novisit('http://www.nytimes.com'+content).read() + raw = self.browser.open('http://www.nytimes.com'+content).read() return BeautifulSoup(raw.decode('cp1252', 'replace')) + ''' + return self.strip_anchors(soup) + + def postprocess_html(self,soup, True): + + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg after headline + cgFirst = soup.find(True, {'class':'columnGroup first'}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + # Change class="kicker" to

+ kicker = soup.find(True, {'class':'kicker'}) + if kicker and kicker.contents[0]: + h3Tag = Tag(soup, "h3") + h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, + use_alt=False))) + kicker.replaceWith(h3Tag) + + # Change captions to italic -1 + for caption in soup.findAll(True, {'class':'caption'}) : + if caption and caption.contents[0]: + emTag = Tag(soup, "em") + c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() + mp_off = c.find("More Photos") + if mp_off >= 0: + c = c[:mp_off] + emTag.insert(0, c) + hrTag = Tag(soup, 'hr') + #hrTag['style'] = "margin-top:0em;margin-bottom:0em" + emTag.insert(1, hrTag) + caption.replaceWith(emTag) + + # Change to

+ h1 = soup.find('h1') + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + else: + # Blog entry - replace headline, remove
tags + headline = soup.find('title') + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + soup.insert(0, tag) + hrs = soup.findAll('hr') + for hr in hrs: + hr.extract() + + # Change

to

- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + + # Change to + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + + # Synthesize a section header + dsk = soup.find('meta', attrs={'name':'dsk'}) + if dsk and dsk.has_key('content'): + hTag = Tag(soup,'h3') + hTag['class'] = 'section' + hTag.insert(0,NavigableString(dsk['content'])) + articleTag = soup.find(True, attrs={'id':'article'}) + if articleTag: + articleTag.insert(0,hTag) + + # Add class="articleBody" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'articleBody'}) + if divTag: + divTag['class'] = divTag['id'] + + # Add class="authorId" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + + return soup + + def postprocess_book(self, oeb, opts, log) : + + def extract_byline(href) : + # + byline = soup.find('div', attrs={'class':'byline'}) + if byline: + author = byline.renderContents() + else: + print "couldn't find byline in %s" % href + print soup.prettify() + return None + # Kill commas - Kindle switches to '&' + return re.sub(',','',author) + + def extract_description(href) : + soup = BeautifulSoup(str(oeb.manifest.hrefs[href])) + description = soup.find('meta',attrs={'name':['description','description ']}) + if description : +# print repr(description['content']) +# print self.massageNCXText(description['content']) + return self.massageNCXText(description['content']) + else: + # Take first paragraph of article + articleBody = soup.find('div',attrs={'id':'articleBody'}) + if not articleBody: + # Try again with class instead of id + articleBody = soup.find('div',attrs={'class':'articleBody'}) + if not articleBody: + print 'postprocess_book.extract_description(): Did not find
:' + print soup.prettify() + return None + paras = articleBody.findAll('p') + for p in paras: + if p.renderContents() > '' : + return self.massageNCXText(self.tag_to_string(p,use_alt=False)) + return None + + # Method entry point here + # Single section toc looks different than multi-section tocs + if oeb.toc.depth() == 2 : + for article in oeb.toc : + if article.author is None : + article.author = extract_byline(article.href) + if article.description is None : + article.description = extract_description(article.href).decode('utf-8') + elif oeb.toc.depth() == 3 : + for section in oeb.toc : + for article in section : + if article.author is None : + article.author = extract_byline(article.href) + if article.description is None : + article.description = extract_description(article.href) + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('utf-8','replace')) + #a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup + diff --git a/resources/recipes/oilprice.recipe b/resources/recipes/oilprice.recipe new file mode 100644 index 0000000000..04505c2eec --- /dev/null +++ b/resources/recipes/oilprice.recipe @@ -0,0 +1,34 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +oilprice.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class OilPrice(BasicNewsRecipe): + title = 'Oil Price' + __author__ = 'Darko Miletic' + description = 'The nr. 1 source for Oil Price Information' + publisher = 'oilprice.com' + category = 'news, oil, politics, world, usa' + oldest_article = 2 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + language = 'en' + country = 'US' + extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + remove_tags = [dict(name='div',attrs={'class':'banner'})] + keep_only_tags = [dict(name='div',attrs={'id':'storyContent'})] + remove_tags_after = dict(attrs={'id':'KonaBody'}) + + feeds = [(u'Articles', u'http://www.oilprice.com/rss.xml')] diff --git a/resources/recipes/the_sun.recipe b/resources/recipes/the_sun.recipe new file mode 100644 index 0000000000..f9905a61dc --- /dev/null +++ b/resources/recipes/the_sun.recipe @@ -0,0 +1,45 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1268409464(BasicNewsRecipe): + title = u'The Sun' + __author__ = 'Chaz Ralph' + description = 'News from The Sun' + oldest_article = 1 + max_articles_per_feed = 100 + language = 'en' + no_stylesheets = True + extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' + encoding= 'iso-8859-1' + remove_javascript = True + + keep_only_tags = [ + dict(name='div', attrs={'class':'medium-centered'}) + ,dict(name='div', attrs={'class':'article'}) + ,dict(name='div', attrs={'class':'clear-left'}) + ,dict(name='div', attrs={'class':'text-center'}) + ] + + remove_tags = [ + dict(name='div', attrs={'class':'slideshow'}) + ,dict(name='div', attrs={'class':'float-left'}) + ,dict(name='div', attrs={'class':'ltbx-slideshow ltbx-btn-ss'}) + ,dict(name='a', attrs={'class':'add_a_comment'}) + ,dict(name='div', attrs={'id':'vxFlashPlayerContent'}) + ,dict(name='div', attrs={'id':'k1006094r1c1t5w380h529'}) + ,dict(name='div', attrs={'id':'tum_login_form_container'}) + ,dict(name='div', attrs={'class':'discHeader'}) + ,dict(name='div', attrs={'class':'margin-bottom-neg-2'}) + ] + + + feeds = [(u'News', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article312900.ece') +,(u'Sport', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247732.ece') +,(u'Football', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247739.ece') +,(u'Gizmo', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247829.ece') +,(u'Bizarre', u'http://www.thesun.co.uk/sol/homepage/feeds/rss/article247767.ece')] + + def print_version(self, url): + return re.sub(r'\?OTC-RSS&ATTR=[-a-zA-Z]+', '?print=yes', url) + + diff --git a/resources/recipes/wash_times.recipe b/resources/recipes/wash_times.recipe new file mode 100644 index 0000000000..fd29fd7396 --- /dev/null +++ b/resources/recipes/wash_times.recipe @@ -0,0 +1,49 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +import re +from calibre.web.feeds.news import BasicNewsRecipe + + +class WashingtonTimes(BasicNewsRecipe): + + title = 'Washington Times' + max_articles_per_feed = 15 + language = 'en' + __author__ = 'Kos Semonski' + + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'.*?' , lambda match : ''), + (r'
.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + ##(r'.*?
', lambda match : ''), + (r'