diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe index be5b98ffe6..c69dd693b2 100644 --- a/resources/recipes/brand_eins.recipe +++ b/resources/recipes/brand_eins.recipe @@ -1,18 +1,22 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +# -*- coding: utf-8 mode: python -*- + +# Find the newest version of this recipe here: +# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe __license__ = 'GPL v3' -__copyright__ = '2010, Constantin Hofstetter ' -__version__ = '0.95' +__copyright__ = '2010, Constantin Hofstetter , Steffen Siebert ' +__version__ = '0.96' ''' http://brandeins.de - Wirtschaftsmagazin ''' import re import string from calibre.web.feeds.recipes import BasicNewsRecipe + class BrandEins(BasicNewsRecipe): - title = u'Brand Eins' + title = u'brand eins' __author__ = 'Constantin Hofstetter' description = u'Wirtschaftsmagazin' publisher ='brandeins.de' @@ -22,11 +26,14 @@ class BrandEins(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' language = 'de' + publication_type = 'magazine' + needs_subscription = True # 2 is the last full magazine (default) # 1 is the newest (but not full) # 3 is one before 2 etc. - which_ausgabe = 2 + # This value can be set via the username field. + default_issue = 2 keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})] @@ -61,17 +68,31 @@ class BrandEins(BasicNewsRecipe): return soup + def get_cover(self, soup): + cover_url = None + cover_item = soup.find('div', attrs = {'class': 'cover_image'}) + if cover_item: + cover_url = 'http://www.brandeins.de/' + cover_item.img['src'] + return cover_url + def parse_index(self): feeds = [] archive = "http://www.brandeins.de/archiv.html" + issue = self.default_issue + if self.username: + try: + issue = int(self.username) + except: + pass + soup = self.index_to_soup(archive) latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0] - pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-self.which_ausgabe] + pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue] url = pre_latest_issue.get('href', False) # Get the title for the magazin - build it out of the title of the cover - take the issue and year; - self.title = "Brand Eins "+ re.search(r"(?P\d\d\/\d\d\d\d+)", pre_latest_issue.find('img').get('title', False)).group('date') + self.title = "brand eins "+ re.search(r"(?P\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date') url = 'http://brandeins.de/'+url # url = "http://www.brandeins.de/archiv/magazin/tierisch.html" @@ -83,6 +104,7 @@ class BrandEins(BasicNewsRecipe): def brand_eins_parse_latest_issue(self, url): soup = self.index_to_soup(url) + self.cover_url = self.get_cover(soup) article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})] titles_and_articles = [] @@ -123,3 +145,4 @@ class BrandEins(BasicNewsRecipe): current_articles.append({'title': title, 'url': url, 'description': description, 'date':''}) titles_and_articles.append([chapter_title, current_articles]) return titles_and_articles + diff --git a/resources/recipes/nikkei_sub.recipe b/resources/recipes/nikkei_sub.recipe new file mode 100644 index 0000000000..95b0017339 --- 
/dev/null +++ b/resources/recipes/nikkei_sub.recipe @@ -0,0 +1,125 @@ +import re +from calibre.web.feeds.recipes import BasicNewsRecipe +import mechanize +from calibre.ptempfile import PersistentTemporaryFile + + +class NikkeiNet_subscription(BasicNewsRecipe): + title = u'\u65e5\u7d4c\u65b0\u805e\u96fb\u5b50\u7248' + __author__ = 'Hiroshi Miura' + description = 'News and current market affairs from Japan' + needs_subscription = True + oldest_article = 2 + max_articles_per_feed = 20 + language = 'ja' + remove_javascript = False + temp_files = [] + + remove_tags_before = {'class':"cmn-section cmn-indent"} + remove_tags = [ + {'class':"JSID_basePageMove JSID_baseAsyncSubmit cmn-form_area JSID_optForm_utoken"}, + {'class':"cmn-article_keyword cmn-clearfix"}, + {'class':"cmn-print_headline cmn-clearfix"}, + ] + remove_tags_after = {'class':"cmn-pr_list"} + + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + + cj = mechanize.LWPCookieJar() + br.set_cookiejar(cj) + + #br.set_debug_http(True) + #br.set_debug_redirects(True) + #br.set_debug_responses(True) + + if self.username is not None and self.password is not None: + #print "----------------------------get login form--------------------------------------------" + # open login form + br.open('https://id.nikkei.com/lounge/nl/base/LA0010.seam') + response = br.response() + #print "----------------------------get login form---------------------------------------------" + #print "----------------------------set login form---------------------------------------------" + # remove disabled input which brings error on mechanize + response.set_data(response.get_data().replace("", " -->")) + br.set_response(response) + br.select_form(name='LA0010Form01') + br['LA0010Form01:LA0010Email'] = self.username + br['LA0010Form01:LA0010Password'] = self.password + br.form.find_control(id='LA0010Form01:LA0010AutoLoginOn',type="checkbox").get(nr=0).selected = True + br.submit() + br.response() + #print "----------------------------send login form---------------------------------------------" + #print "----------------------------open news main page-----------------------------------------" + # open news site + br.open('http://www.nikkei.com/') + br.response() + #print "----------------------------www.nikkei.com BODY --------------------------------------" + #print response2.get_data() + #print "-------------------------^^-got auto redirect form----^^--------------------------------" + # forced redirect in default + br.select_form(nr=0) + br.submit() + response3 = br.response() + # return some cookie which should be set by Javascript + #print response3.geturl() + raw = response3.get_data() + #print "---------------------------response to form --------------------------------------------" + # grab cookie from JS and set it + redirectflag = re.search(r"var checkValue = '(\d+)';", raw, re.M).group(1) + br.select_form(nr=0) + + self.temp_files.append(PersistentTemporaryFile('_fa.html')) + self.temp_files[-1].write("#LWP-Cookies-2.0\n") + + self.temp_files[-1].write("Set-Cookie3: Cookie-dummy=Cookie-value; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].write("Set-Cookie3: redirectFlag="+redirectflag+"; domain=\".nikkei.com\"; path=\"/\"; path_spec; secure; expires=\"2029-12-21 05:07:59Z\"; version=0\n") + self.temp_files[-1].close() + cj.load(self.temp_files[-1].name) + + br.submit() + + #br.set_debug_http(False) + #br.set_debug_redirects(False) + #br.set_debug_responses(False) + 
return br + + + + feeds = [ (u'\u65e5\u7d4c\u4f01\u696d', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sangyo'), + (u'\u65e5\u7d4c\u88fd\u54c1', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=newpro'), + (u'internet', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=internet'), + (u'\u653f\u6cbb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=seiji'), + (u'\u8ca1\u52d9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zaimu'), + (u'\u7d4c\u6e08', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keizai'), + (u'\u56fd\u969b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kaigai'), + (u'\u79d1\u5b66', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kagaku'), + (u'\u30de\u30fc\u30b1\u30c3\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=market'), + (u'\u304f\u3089\u3057', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kurashi'), + (u'\u30b9\u30dd\u30fc\u30c4', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sports'), + (u'\u793e\u4f1a', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shakai'), + (u'\u30a8\u30b3', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=eco'), + (u'\u5065\u5eb7', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kenkou'), + (u'\u96c7\u7528', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=koyou'), + (u'\u6559\u80b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=kyouiku'), + (u'\u304a\u304f\u3084\u307f', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=okuyami'), + (u'\u4eba\u4e8b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=zinzi'), + (u'\u7279\u96c6', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=special'), + (u'\u5730\u57df\u30cb\u30e5\u30fc\u30b9', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=local'), + (u'\u7d71\u8a08\u30fb\u767d\u66f8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=report'), + (u'\u30e9\u30f3\u30ad\u30f3\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=ranking'), + (u'\u4f1a\u898b', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=interview'), + (u'\u793e\u8aac\u30fb\u6625\u79cb', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=shasetsu'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30d7\u30ed\u91ce\u7403', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=baseball'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u5927\u30ea\u30fc\u30b0', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=mlb'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b5\u30c3\u30ab\u30fc', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=soccer'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u30b4\u30eb\u30d5', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=golf'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u76f8\u64b2', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=sumou'), + (u'\u30b9\u30dd\u30fc\u30c4\uff1a\u7af6\u99ac', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=keiba'), + (u'\u8abf\u67fb\u30fb\u30a2\u30f3\u30b1\u30fc\u30c8', u'http://www.zou3.net/php/rss/nikkei2rss.php?head=research') + ] + + + diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index 16ddea9f8c..fbb4641580 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -7,14 +7,22 @@ nytimes.com ''' import re, string, time from calibre import entity_to_unicode, strftime +from datetime import timedelta, date from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup + class NYTimes(BasicNewsRecipe): - # set headlinesOnly to True for the headlines-only version + # set headlinesOnly to True for the headlines-only version. 
If True, webEdition is ignored. headlinesOnly = True + # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the + # number of days old an article can be for inclusion. If oldest_article = 0 all articles + # will be included. Note: oldest_article is ignored if webEdition = False + webEdition = False + oldest_article = 7 + # includeSections: List of sections to include. If empty, all sections found will be included. # Otherwise, only the sections named will be included. For example, # @@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe): # from an article (if one exists). If one_picture_per_article = True, the image # will be moved to a location between the headline and the byline. # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = True + one_picture_per_article = False # The maximum number of articles that will be downloaded max_articles_per_feed = 100 + # Whether to omit duplicates of articles (typically arsing when articles are indexed in + # more than one section). If True, only the first occurance will be downloaded. + filterDuplicates = True + + # Sections to collect for the Web edition. + # Delete any you don't want, or use includeSections or excludeSections + web_sections = [(u'World',u'world'), + (u'U.S.',u'national'), + (u'Politics',u'politics'), + (u'New York',u'nyregion'), + (u'Business','business'), + (u'Technology',u'technology'), + (u'Sports',u'sports'), + (u'Science',u'science'), + (u'Health',u'health'), + (u'Opinion',u'opinion'), + (u'Arts',u'arts'), + (u'Books',u'books'), + (u'Movies',u'movies'), + (u'Music',u'arts/music'), + (u'Television',u'arts/television'), + (u'Style',u'style'), + (u'Dining & Wine',u'dining'), + (u'Fashion & Style',u'fashion'), + (u'Home & Garden',u'garden'), + (u'Travel',u'travel'), + ('Education',u'education'), + ('Multimedia',u'multimedia'), + (u'Obituaries',u'obituaries'), + (u'Sunday Magazine',u'magazine'), + (u'Week in Review',u'weekinreview')] + if headlinesOnly: title='New York Times Headlines' description = 'Headlines from the New York Times' + needs_subscription = False + elif webEdition: + title='New York Times (Web)' + description = 'New York Times on the Web' + needs_subscription = True else: title='New York Times' description = 'Today\'s New York Times' + needs_subscription = True + + + month_list = ['january','february','march','april','may','june','july','august','september','october','november','december'] + + def decode_us_date(self,datestr): + udate = datestr.strip().lower().split() + try: + m = self.month_list.index(udate[0])+1 + except: + return date.today() + d = int(udate[1]) + y = int(udate[2]) + try: + d = date(y,m,d) + except: + d = date.today + return d + + earliest_date = date.today() - timedelta(days=oldest_article) __author__ = 'GRiker/Kovid Goyal/Nick Redding' language = 'en' @@ -136,6 +200,12 @@ class NYTimes(BasicNewsRecipe): .image {text-align: center;} .source {text-align: left; }''' + + articles = {} + key = None + ans = [] + url_list = [] + def filter_ans(self, ans) : total_article_count = 0 idx = 0 @@ -164,6 +234,29 @@ class NYTimes(BasicNewsRecipe): self.log( "Queued %d articles" % total_article_count ) return ans + def exclude_url(self,url): + if not url.startswith("http"): + return True + if not url.endswith(".html"): + return True + if 'nytimes.com' not in url: + return True + if 'podcast' in url: + return True + if '/video/' in url: + return True + if '/slideshow/' in url: + 
return True + if '/magazine/index' in url: + return True + if '/interactive/' in url: + return True + if '/reference/' in url: + return True + if '/premium/' in url: + return True + return False + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -249,7 +342,6 @@ class NYTimes(BasicNewsRecipe): return BeautifulSoup(_raw, markupMassage=massage) # Entry point - print "index_to_soup()" soup = get_the_soup( self.encoding, url_or_raw ) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] @@ -273,83 +365,110 @@ class NYTimes(BasicNewsRecipe): else: return description - def parse_todays_index(self): + def feed_title(self,div): + return ''.join(div.findAll(text=True, recursive=True)).strip() - def feed_title(div): - return ''.join(div.findAll(text=True, recursive=True)).strip() - - articles = {} - key = None - ans = [] - url_list = [] - - def handle_article(div): - a = div.find('a', href=True) - if not a: + def handle_article(self,div): + thumbnail = div.find('div','thumbnail') + if thumbnail: + thumbnail.extract() + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if self.exclude_url(url): + return + url += '?pagewanted=all' + if self.filterDuplicates: + if url in self.url_list: return - url = re.sub(r'\?.*', '', a['href']) - if not url.startswith("http"): - return - if not url.endswith(".html"): - return - if 'podcast' in url: - return - if '/video/' in url: - return - url += '?pagewanted=all' - if url in url_list: - return - url_list.append(url) - title = self.tag_to_string(a, use_alt=True).strip() - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class':'summary'}) - if summary: - description = self.tag_to_string(summary, use_alt=False) - author = '' + self.url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class':'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - feed = key if key is not None else 'Uncategorized' - if not articles.has_key(feed): - ans.append(feed) - articles[feed] = [] - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) + feed = self.key if self.key is not None else 'Uncategorized' + if not self.articles.has_key(feed): + self.ans.append(feed) + self.articles[feed] = [] + self.articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) + def parse_web_edition(self): + + for (sec_title,index_url) in self.web_sections: + if self.includeSections != []: + if sec_title not in self.includeSections: + print "SECTION NOT INCLUDED: ",sec_title + continue + if sec_title in self.excludeSections: + print "SECTION EXCLUDED: ",sec_title + continue + print 'Index URL: 
'+'http://www.nytimes.com/pages/'+index_url+'/index.html' + soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + self.key = sec_title + # Find each article + for div in soup.findAll(True, + attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + if div['class'] in ['story', 'story headline'] : + self.handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + self.handle_article(lidiv) + + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) + + + def parse_todays_index(self): + soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - + skipping = False # Find each article for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): if div['class'] in ['section-headline','sectionHeader']: - key = string.capwords(feed_title(div)) - key = key.replace('Op-ed','Op-Ed') - key = key.replace('U.s.','U.S.') + self.key = string.capwords(self.feed_title(div)) + self.key = self.key.replace('Op-ed','Op-Ed') + self.key = self.key.replace('U.s.','U.S.') + self.key = self.key.replace('N.y.','N.Y.') + skipping = False + if self.includeSections != []: + if self.key not in self.includeSections: + print "SECTION NOT INCLUDED: ",self.key + skipping = True + if self.key in self.excludeSections: + print "SECTION EXCLUDED: ",self.key + skipping = True + elif div['class'] in ['story', 'story headline'] : - handle_article(div) + if not skipping: + self.handle_article(div) elif div['class'] == 'headlinesOnly multiline flush': for lidiv in div.findAll('li'): - handle_article(lidiv) + if not skipping: + self.handle_article(lidiv) - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return self.filter_ans(ans) + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) def parse_headline_index(self): - articles = {} - ans = [] - url_list = [] - soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') # Fetch the content table @@ -363,15 +482,24 @@ class NYTimes(BasicNewsRecipe): for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): for div_sec in td_col.findAll('div',recursive=False): for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): + section_name = self.tag_to_string(h6_sec_name,use_alt=False) section_name = re.sub(r'^ *$','',section_name) + if section_name == '': continue + if self.includeSections != []: + if section_name not in self.includeSections: + print "SECTION NOT INCLUDED: ",section_name + continue + if section_name in self.excludeSections: + print "SECTION EXCLUDED: ",section_name + continue + section_name=string.capwords(section_name) - if section_name == 'U.s.': - section_name = 'U.S.' 
- elif section_name == 'Op-ed': - section_name = 'Op-Ed' + section_name = section_name.replace('Op-ed','Op-Ed') + section_name = section_name.replace('U.s.','U.S.') + section_name = section_name.replace('N.y.','N.Y.') pubdate = strftime('%a, %d %b') search_div = div_sec @@ -392,37 +520,32 @@ class NYTimes(BasicNewsRecipe): if not a: continue url = re.sub(r'\?.*', '', a['href']) - if not url.startswith("http"): - continue - if not url.endswith(".html"): - continue - if 'podcast' in url: - continue - if 'video' in url: + if self.exclude_url(url): continue url += '?pagewanted=all' - if url in url_list: - continue - url_list.append(url) - self.log("URL %s" % url) + if self.filterDuplicates: + if url in self.url_list: + continue + self.url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() desc = h3_item.find('p') if desc is not None: description = self.tag_to_string(desc,use_alt=False) else: description = '' - if not articles.has_key(section_name): - ans.append(section_name) - articles[section_name] = [] - articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + if not self.articles.has_key(section_name): + self.ans.append(section_name) + self.articles[section_name] = [] + self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return self.filter_ans(ans) + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) def parse_index(self): if self.headlinesOnly: return self.parse_headline_index() + elif self.webEdition: + return self.parse_web_edition() else: return self.parse_todays_index() @@ -438,6 +561,21 @@ class NYTimes(BasicNewsRecipe): def preprocess_html(self, soup): + if self.webEdition & (self.oldest_article>0): + date_tag = soup.find(True,attrs={'class': ['dateline','date']}) + if date_tag: + date_str = self.tag_to_string(date_tag,use_alt=False) + date_str = date_str.replace('Published:','') + date_items = date_str.split(',') + try: + datestring = date_items[0]+' '+date_items[1] + article_date = self.decode_us_date(datestring) + except: + article_date = date.today() + if article_date < self.earliest_date: + self.log("Skipping article dated %s" % date_str) + return None + kicker_tag = soup.find(attrs={'class':'kicker'}) if kicker_tag: # remove Op_Ed author head shots tagline = self.tag_to_string(kicker_tag) @@ -462,7 +600,6 @@ class NYTimes(BasicNewsRecipe): for inlineImg in inlineImgs[1:]: inlineImg.extract() # Move firstImg before article body - #article_body = soup.find(True, {'id':'articleBody'}) cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) if cgFirst: # Strip all sibling NavigableStrings: noise @@ -548,4 +685,3 @@ class NYTimes(BasicNewsRecipe): divTag.replaceWith(tag) return soup - diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index ed1ba75f0f..ad98b466e1 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -7,14 +7,22 @@ nytimes.com ''' import re, string, time from calibre import entity_to_unicode, strftime +from datetime import timedelta, date from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup + class NYTimes(BasicNewsRecipe): - # set headlinesOnly to True for the headlines-only version + # set 
headlinesOnly to True for the headlines-only version. If True, webEdition is ignored. headlinesOnly = False + # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the + # number of days old an article can be for inclusion. If oldest_article = 0 all articles + # will be included. Note: oldest_article is ignored if webEdition = False + webEdition = False + oldest_article = 7 + # includeSections: List of sections to include. If empty, all sections found will be included. # Otherwise, only the sections named will be included. For example, # @@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe): # from an article (if one exists). If one_picture_per_article = True, the image # will be moved to a location between the headline and the byline. # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = True + one_picture_per_article = False # The maximum number of articles that will be downloaded max_articles_per_feed = 100 + # Whether to omit duplicates of articles (typically arsing when articles are indexed in + # more than one section). If True, only the first occurance will be downloaded. + filterDuplicates = True + + # Sections to collect for the Web edition. + # Delete any you don't want, or use includeSections or excludeSections + web_sections = [(u'World',u'world'), + (u'U.S.',u'national'), + (u'Politics',u'politics'), + (u'New York',u'nyregion'), + (u'Business','business'), + (u'Technology',u'technology'), + (u'Sports',u'sports'), + (u'Science',u'science'), + (u'Health',u'health'), + (u'Opinion',u'opinion'), + (u'Arts',u'arts'), + (u'Books',u'books'), + (u'Movies',u'movies'), + (u'Music',u'arts/music'), + (u'Television',u'arts/television'), + (u'Style',u'style'), + (u'Dining & Wine',u'dining'), + (u'Fashion & Style',u'fashion'), + (u'Home & Garden',u'garden'), + (u'Travel',u'travel'), + ('Education',u'education'), + ('Multimedia',u'multimedia'), + (u'Obituaries',u'obituaries'), + (u'Sunday Magazine',u'magazine'), + (u'Week in Review',u'weekinreview')] + if headlinesOnly: title='New York Times Headlines' description = 'Headlines from the New York Times' + needs_subscription = False + elif webEdition: + title='New York Times (Web)' + description = 'New York Times on the Web' + needs_subscription = True else: title='New York Times' description = 'Today\'s New York Times' + needs_subscription = True + + + month_list = ['january','february','march','april','may','june','july','august','september','october','november','december'] + + def decode_us_date(self,datestr): + udate = datestr.strip().lower().split() + try: + m = self.month_list.index(udate[0])+1 + except: + return date.today() + d = int(udate[1]) + y = int(udate[2]) + try: + d = date(y,m,d) + except: + d = date.today + return d + + earliest_date = date.today() - timedelta(days=oldest_article) __author__ = 'GRiker/Kovid Goyal/Nick Redding' language = 'en' @@ -60,7 +124,6 @@ class NYTimes(BasicNewsRecipe): timefmt = '' - needs_subscription = True masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' cover_margins = (18,18,'grey99') @@ -137,6 +200,12 @@ class NYTimes(BasicNewsRecipe): .image {text-align: center;} .source {text-align: left; }''' + + articles = {} + key = None + ans = [] + url_list = [] + def filter_ans(self, ans) : total_article_count = 0 idx = 0 @@ -165,6 +234,29 @@ class NYTimes(BasicNewsRecipe): self.log( "Queued %d articles" % total_article_count ) return ans + def 
exclude_url(self,url): + if not url.startswith("http"): + return True + if not url.endswith(".html"): + return True + if 'nytimes.com' not in url: + return True + if 'podcast' in url: + return True + if '/video/' in url: + return True + if '/slideshow/' in url: + return True + if '/magazine/index' in url: + return True + if '/interactive/' in url: + return True + if '/reference/' in url: + return True + if '/premium/' in url: + return True + return False + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -250,7 +342,6 @@ class NYTimes(BasicNewsRecipe): return BeautifulSoup(_raw, markupMassage=massage) # Entry point - print "index_to_soup()" soup = get_the_soup( self.encoding, url_or_raw ) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] @@ -274,83 +365,110 @@ class NYTimes(BasicNewsRecipe): else: return description - def parse_todays_index(self): + def feed_title(self,div): + return ''.join(div.findAll(text=True, recursive=True)).strip() - def feed_title(div): - return ''.join(div.findAll(text=True, recursive=True)).strip() - - articles = {} - key = None - ans = [] - url_list = [] - - def handle_article(div): - a = div.find('a', href=True) - if not a: + def handle_article(self,div): + thumbnail = div.find('div','thumbnail') + if thumbnail: + thumbnail.extract() + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if self.exclude_url(url): + return + url += '?pagewanted=all' + if self.filterDuplicates: + if url in self.url_list: return - url = re.sub(r'\?.*', '', a['href']) - if not url.startswith("http"): - return - if not url.endswith(".html"): - return - if 'podcast' in url: - return - if '/video/' in url: - return - url += '?pagewanted=all' - if url in url_list: - return - url_list.append(url) - title = self.tag_to_string(a, use_alt=True).strip() - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class':'summary'}) - if summary: - description = self.tag_to_string(summary, use_alt=False) - author = '' + self.url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class':'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - feed = key if key is not None else 'Uncategorized' - if not articles.has_key(feed): - ans.append(feed) - articles[feed] = [] - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) + feed = self.key if self.key is not None else 'Uncategorized' + if not self.articles.has_key(feed): + self.ans.append(feed) + self.articles[feed] = [] + self.articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) + def parse_web_edition(self): + + for (sec_title,index_url) in self.web_sections: + if 
self.includeSections != []: + if sec_title not in self.includeSections: + print "SECTION NOT INCLUDED: ",sec_title + continue + if sec_title in self.excludeSections: + print "SECTION EXCLUDED: ",sec_title + continue + print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html' + soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html') + self.key = sec_title + # Find each article + for div in soup.findAll(True, + attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + if div['class'] in ['story', 'story headline'] : + self.handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + self.handle_article(lidiv) + + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) + + + def parse_todays_index(self): + soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - + skipping = False # Find each article for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): if div['class'] in ['section-headline','sectionHeader']: - key = string.capwords(feed_title(div)) - key = key.replace('Op-ed','Op-Ed') - key = key.replace('U.s.','U.S.') + self.key = string.capwords(self.feed_title(div)) + self.key = self.key.replace('Op-ed','Op-Ed') + self.key = self.key.replace('U.s.','U.S.') + self.key = self.key.replace('N.y.','N.Y.') + skipping = False + if self.includeSections != []: + if self.key not in self.includeSections: + print "SECTION NOT INCLUDED: ",self.key + skipping = True + if self.key in self.excludeSections: + print "SECTION EXCLUDED: ",self.key + skipping = True + elif div['class'] in ['story', 'story headline'] : - handle_article(div) + if not skipping: + self.handle_article(div) elif div['class'] == 'headlinesOnly multiline flush': for lidiv in div.findAll('li'): - handle_article(lidiv) + if not skipping: + self.handle_article(lidiv) - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return self.filter_ans(ans) + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) def parse_headline_index(self): - articles = {} - ans = [] - url_list = [] - soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') # Fetch the content table @@ -364,15 +482,24 @@ class NYTimes(BasicNewsRecipe): for td_col in content_table.findAll('td', {'id' : re.compile('Column')}): for div_sec in td_col.findAll('div',recursive=False): for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}): + section_name = self.tag_to_string(h6_sec_name,use_alt=False) section_name = re.sub(r'^ *$','',section_name) + if section_name == '': continue + if self.includeSections != []: + if section_name not in self.includeSections: + print "SECTION NOT INCLUDED: ",section_name + continue + if section_name in self.excludeSections: + print "SECTION EXCLUDED: ",section_name + continue + section_name=string.capwords(section_name) - if section_name == 'U.s.': - section_name = 'U.S.' 
- elif section_name == 'Op-ed': - section_name = 'Op-Ed' + section_name = section_name.replace('Op-ed','Op-Ed') + section_name = section_name.replace('U.s.','U.S.') + section_name = section_name.replace('N.y.','N.Y.') pubdate = strftime('%a, %d %b') search_div = div_sec @@ -393,37 +520,32 @@ class NYTimes(BasicNewsRecipe): if not a: continue url = re.sub(r'\?.*', '', a['href']) - if not url.startswith("http"): - continue - if not url.endswith(".html"): - continue - if 'podcast' in url: - continue - if 'video' in url: + if self.exclude_url(url): continue url += '?pagewanted=all' - if url in url_list: - continue - url_list.append(url) - self.log("URL %s" % url) + if self.filterDuplicates: + if url in self.url_list: + continue + self.url_list.append(url) title = self.tag_to_string(a, use_alt=True).strip() desc = h3_item.find('p') if desc is not None: description = self.tag_to_string(desc,use_alt=False) else: description = '' - if not articles.has_key(section_name): - ans.append(section_name) - articles[section_name] = [] - articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) + if not self.articles.has_key(section_name): + self.ans.append(section_name) + self.articles[section_name] = [] + self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) - - ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - return self.filter_ans(ans) + self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + return self.filter_ans(self.ans) def parse_index(self): if self.headlinesOnly: return self.parse_headline_index() + elif self.webEdition: + return self.parse_web_edition() else: return self.parse_todays_index() @@ -439,6 +561,21 @@ class NYTimes(BasicNewsRecipe): def preprocess_html(self, soup): + if self.webEdition & (self.oldest_article>0): + date_tag = soup.find(True,attrs={'class': ['dateline','date']}) + if date_tag: + date_str = self.tag_to_string(date_tag,use_alt=False) + date_str = date_str.replace('Published:','') + date_items = date_str.split(',') + try: + datestring = date_items[0]+' '+date_items[1] + article_date = self.decode_us_date(datestring) + except: + article_date = date.today() + if article_date < self.earliest_date: + self.log("Skipping article dated %s" % date_str) + return None + kicker_tag = soup.find(attrs={'class':'kicker'}) if kicker_tag: # remove Op_Ed author head shots tagline = self.tag_to_string(kicker_tag) @@ -463,7 +600,6 @@ class NYTimes(BasicNewsRecipe): for inlineImg in inlineImgs[1:]: inlineImg.extract() # Move firstImg before article body - #article_body = soup.find(True, {'id':'articleBody'}) cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) if cgFirst: # Strip all sibling NavigableStrings: noise diff --git a/resources/recipes/spiegelde.recipe b/resources/recipes/spiegelde.recipe index 705ffd0f7a..4fed3818b0 100644 --- a/resources/recipes/spiegelde.recipe +++ b/resources/recipes/spiegelde.recipe @@ -6,6 +6,7 @@ __copyright__ = '2009, Darko Miletic ' spiegel.de ''' +from time import strftime from calibre.web.feeds.news import BasicNewsRecipe class Spiegel_ger(BasicNewsRecipe): @@ -44,3 +45,6 @@ class Spiegel_ger(BasicNewsRecipe): rmain, rsep, rrest = main.rpartition(',') purl = rmain + ',druck-' + rrest + ',' + rest return purl + + def get_cover_url(self): + return 'http://wissen.spiegel.de/wissen/titel/SP/' + strftime("%Y/%W/%j/titel.jpg") diff --git 
a/resources/recipes/tsn.recipe b/resources/recipes/tsn.recipe new file mode 100644 index 0000000000..e822ebc633 --- /dev/null +++ b/resources/recipes/tsn.recipe @@ -0,0 +1,34 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1289990851(BasicNewsRecipe): + title = u'TSN' + oldest_article = 7 + max_articles_per_feed = 50 + language = 'en_CA' + __author__ = 'Nexus' + no_stylesheets = True + INDEX = 'http://tsn.ca/nhl/story/?id=nhl' + keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}), + dict(name='div', attrs={'id':['tsnStory']})] + remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}), + dict(name='div', attrs={'class':'textSize'})] + + def parse_index(self): + feeds = [] + soup = self.index_to_soup(self.INDEX) + feed_parts = soup.findAll('div', attrs={'class': 'feature'}) + for feed_part in feed_parts: + articles = [] + if not feed_part.h2: + continue + feed_title = feed_part.h2.string + article_parts = feed_part.findAll('a') + for article_part in article_parts: + article_title = article_part.string + article_date = '' + article_url = 'http://tsn.ca/' + article_part['href'] + articles.append({'title': article_title, 'url': article_url, 'description':'', 'date':article_date}) + if articles: + feeds.append((feed_title, articles)) + return feeds + diff --git a/resources/recipes/zeitde_sub.recipe b/resources/recipes/zeitde_sub.recipe new file mode 100644 index 0000000000..5014837c5b --- /dev/null +++ b/resources/recipes/zeitde_sub.recipe @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# -*- coding: utf-8 mode: python -*- + +__license__ = 'GPL v3' +__copyright__ = '2010, Steffen Siebert ' +__docformat__ = 'restructuredtext de' +__version__ = '1.1' + +""" +Die Zeit EPUB +""" + +import os, urllib2, zipfile, re +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ptempfile import PersistentTemporaryFile + +class ZeitEPUBAbo(BasicNewsRecipe): + + title = u'Zeit Online Premium' + description = u'Das EPUB Abo der Zeit (needs subscription)' + language = 'de' + lang = 'de-DE' + + __author__ = 'Steffen Siebert' + needs_subscription = True + + conversion_options = { + 'no_default_epub_cover' : True + } + + def build_index(self): + domain = "http://premium.zeit.de" + url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok" + + browser = self.get_browser() + browser.add_password("http://premium.zeit.de", self.username, self.password) + + try: + browser.open(url) + except urllib2.HTTPError: + self.report_progress(0,_("Can't login to download issue")) + raise ValueError('Failed to login, check your username and password') + + response = browser.follow_link(text="DIE ZEIT als E-Paper") + response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*')) + + tmp = PersistentTemporaryFile(suffix='.epub') + self.report_progress(0,_('downloading epub')) + tmp.write(response.read()) + tmp.close() + + zfile = zipfile.ZipFile(tmp.name, 'r') + self.report_progress(0,_('extracting epub')) + + zfile.extractall(self.output_dir) + + tmp.close() + index = os.path.join(self.output_dir, 'content.opf') + + self.report_progress(1,_('epub downloaded and extracted')) + + return index + diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 681d953c9b..87946706cf 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,6 +483,7 @@ from calibre.devices.kobo.driver import 
KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX @@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] + LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, + NiceBooksCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 36e2b9bdd2..2318c6724e 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -678,6 +678,15 @@ class NookOutput(OutputProfile): fbase = 16 fsizes = [12, 12, 14, 16, 18, 20, 22, 24] +class NookColorOutput(NookOutput): + name = 'Nook Color' + short_name = 'nook_color' + description = _('This profile is intended for the B&N Nook Color.') + + screen_size = (600, 980) + comic_screen_size = (584, 980) + dpi = 169 + class BambookOutput(OutputProfile): author = 'Li Fanxi' @@ -698,6 +707,6 @@ output_profiles = [OutputProfile, SonyReaderOutput, SonyReader300Output, iPadOutput, KoboReaderOutput, TabletOutput, SonyReaderLandscapeOutput, KindleDXOutput, IlliadOutput, IRexDR1000Output, IRexDR800Output, JetBook5Output, NookOutput, - BambookOutput, ] + BambookOutput, NookColorOutput] output_profiles.sort(cmp=lambda x,y:cmp(x.name.lower(), y.name.lower())) diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 844269e453..c360122842 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Douban Books', 'Douban.com covers', + 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers' ]) def is_disabled(plugin): diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index b2cc99fdb8..934dc0879e 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -93,7 +93,7 @@ class KOBO(USBMS): lpath = path.partition(self.normalize_path(prefix))[2] if lpath.startswith(os.sep): lpath = lpath[len(os.sep):] - lpath = lpath.replace('\\', '/') + lpath = lpath.replace('\\', '/') # debug_print("LPATH: ", lpath, " - Title: " , title) playlist_map = {} @@ -354,7 +354,7 @@ class KOBO(USBMS): ContentID = ContentID.replace(self._main_prefix, '') else: ContentID = path - ContentID = ContentID.replace(self._main_prefix + '.kobo/kepub/', '') + ContentID = ContentID.replace(self._main_prefix + self.normalize_path('.kobo/kepub/'), '') if self._card_a_prefix is not None: ContentID = ContentID.replace(self._card_a_prefix, '') @@ -507,7 +507,10 @@ class KOBO(USBMS): t = (ContentID,) cursor.execute('select DateLastRead from Content where BookID is Null and ContentID = ?', t) result = cursor.fetchone() - datelastread = result[0] if result[0] is not None else '1970-01-01T00:00:00' + if result is None: + datelastread = '1970-01-01T00:00:00' + else: + datelastread = 
result[0] if result[0] is not None else '1970-01-01T00:00:00' + t = (datelastread,ContentID,) diff --git a/src/calibre/ebooks/conversion/utils.py index cca3679d14..11979b933c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -107,7 +107,7 @@ class PreProcessor(object): # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*</p>", "</p>\n", html) html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html) - + ###### Check Markup ###### # # some lit files don't have any <p>
tags or equivalent (generally just plain text between @@ -192,12 +192,12 @@ class PreProcessor(object): n_lookahead_close = ")" default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)" - + min_chapters = 10 heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") - + chapter_types = [ [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"], [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters @@ -219,9 +219,9 @@ class PreProcessor(object): else: chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - + html = chapdetect.sub(self.chapter_head, html) - + ###### Unwrap lines ###### # @@ -259,7 +259,7 @@ class PreProcessor(object): html = dehyphenator(html,'html', length) self.log("Done dehyphenating") # Unwrap lines using punctation and line length - unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) + #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py new file mode 100644 index 0000000000..4d19e9611b --- /dev/null +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -0,0 +1,424 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re, traceback, socket +from urllib import urlencode +from math import ceil +from copy import deepcopy + +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata.covers import CoverDownload +from calibre.utils.config import OptionParser + +class NiceBooks(MetadataSource): + + name = 'Nicebooks' + description = _('Downloads metadata from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class NiceBooksCovers(CoverDownload): + + name = 'Nicebooks covers' + description = _('Downloads covers from french 
Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + type = _('Cover download') + version = (1, 0, 0) + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + if Covers(mi.isbn)(entry).check_cover(): + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout) + if not ext: + ext = 'jpg' + result_queue.put((True, cover_data, ext, self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL = 'http://fr.nicebooks.com/' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + if isbn is not None: + q = isbn + else: + q = ' '.join([i for i in (title, author, publisher, keywords) \ + if i is not None]) + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = 'search?' 
+ urlencode({'q':q,'s':'Rechercher'}) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + #nb of page to call + try: + nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) + except: + #direct hit + return [feed] + + nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10)) + pages =[feed] + if nbpagetoquery > 1: + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&p=' + str(i) + raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.find_class('title')[0].get('href') \ + for i in x.xpath("//ul[@id='results']/li")]) + return results[:self.max_results] + +class ResultList(list): + + BASE_URL = 'http://fr.nicebooks.com' + + def __init__(self): + self.repub = re.compile(u'\s*.diteur\s*', re.I) + self.reauteur = re.compile(u'\s*auteur.*', re.I) + self.reautclean = re.compile(u'\s*\(.*\)\s*') + + def get_title(self, entry): + # title = deepcopy(entry.find("div[@id='book-info']")) + title = deepcopy(entry) + title.remove(title.find("dl[@title='Informations sur le livre']")) + title = ' '.join([i.text_content() for i in title.iterchildren()]) + return unicode(title.replace('\n', '')) + + def get_authors(self, entry): + # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + author = entry.find("dl[@title='Informations sur le livre']") + authortext = [] + for x in author.getiterator('dt'): + if self.reauteur.match(x.text): + elt = x.getnext() + while elt.tag == 'dd': + authortext.append(unicode(elt.text_content())) + elt = elt.getnext() + break + if len(authortext) == 1: + authortext = [self.reautclean.sub('', authortext[0])] + return authortext + + def get_description(self, entry, verbose): + try: + return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text) + except: + report(verbose) + return None + + def get_book_info(self, entry, mi, verbose): + entry = entry.find("dl[@title='Informations sur le livre']") + for x in entry.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content().replace('-', '') + if check_isbn(isbntext): + mi.isbn = unicode(isbntext) + elif self.repub.match(x.text): + mi.publisher = unicode(x.getnext().text_content()) + elif x.text == 'Langue': + mi.language = unicode(x.getnext().text_content()) + elif x.text == 'Date de parution': + d = x.getnext().text_content() + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + mi.pubdate = d + except: + report(verbose) + return mi + + def fill_MI(self, entry, title, authors, verbose): + mi = MetaInformation(title, authors) + mi.author_sort = authors_to_sort_string(authors) + mi.comments = self.get_description(entry, verbose) + # entry = entry.find("dl[@title='Informations 
sur le livre']") + # mi.publisher = self.get_publisher(entry) + # mi.pubdate = self.get_date(entry, verbose) + # mi.isbn = self.get_ISBN(entry) + # mi.language = self.get_language(entry) + return self.get_book_info(entry, mi, verbose) + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//div[@id='container']")[0] + + def populate(self, entries, browser, verbose=False): + #single entry + if len(entries) == 1 and not isinstance(entries[0], str): + try: + entry = entries[0].xpath("//div[@id='container']")[0] + entry = entry.find("div[@id='book-info']") + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + entry = entry.find("div[@id='book-info']") + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) + + +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + +class Covers(object): + + def __init__(self, isbn = None): + assert isbn is not None + self.urlimg = '' + self.isbn = isbn + self.isbnf = False + + def __call__(self, entry = None): + try: + self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href') + except: + return self + isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") + for x in isbno.getiterator('dt'): + if x.text == 'ISBN' and check_isbn(x.getnext().text_content()): + self.isbnf = True + break + return self + + def check_cover(self): + return True if self.urlimg else False + + def get_cover(self, browser, timeout = 5.): + try: + cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + self.urlimg.rpartition('.')[-1] + return cover, ext if ext else 'jpg' + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = NiceBooksError(_('Nicebooks timed out. 
Try again later.'))
+ raise err
+ if not len(self.urlimg):
+ if not self.isbnf:
+ raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
+ raise NiceBooksError(_('An error occurred with the Nicebooks cover fetcher'))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+ max_results=5, verbose=False, keywords=None):
+ br = browser()
+ entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
+ keywords=keywords, max_results=max_results)(br, verbose)
+
+ if entries is None or len(entries) == 0:
+ return
+
+ # List of entries
+ ans = ResultList()
+ ans.populate(entries, br, verbose)
+ return ans
+
+def check_for_cover(isbn):
+ br = browser()
+ entry = Query(isbn=isbn, max_results=1)(br, False)[0]
+ return Covers(isbn)(entry).check_cover()
+
+def cover_from_isbn(isbn, timeout = 5.):
+ br = browser()
+ entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0]
+ return Covers(isbn)(entry).get_cover(br, timeout)
+
+
+def option_parser():
+ parser = OptionParser(textwrap.dedent(\
+ '''\
+ %prog [options]
+
+ Fetch book metadata from Nicebooks. You must specify one of title, author,
+ ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
+ so you should make your query as specific as possible.
+ It can also download covers if the covers option is enabled.
+ '''
+ ))
+ parser.add_option('-t', '--title', help='Book title')
+ parser.add_option('-a', '--author', help='Book author(s)')
+ parser.add_option('-p', '--publisher', help='Book publisher')
+ parser.add_option('-i', '--isbn', help='Book ISBN')
+ parser.add_option('-k', '--keywords', help='Keywords')
+ parser.add_option('-c', '--covers', default=0,
+ help='Covers: 1-Check/ 2-Download')
+ parser.add_option('-P', '--coverspath', default='',
+ help='Directory to save downloaded covers to')
+ parser.add_option('-m', '--max-results', default=20, type='int',
+ help='Maximum number of results to fetch')
+ parser.add_option('-v', '--verbose', default=0, action='count',
+ help='Be more verbose about errors')
+ return parser
+
+def main(args=sys.argv):
+ import os
+ parser = option_parser()
+ opts, args = parser.parse_args(args)
+ try:
+ results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
+ keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
+ except AssertionError:
+ report(True)
+ parser.print_help()
+ return 1
+ if results is None or len(results) == 0:
+ print 'No result found for this search!'
+ return 0
+ for result in results:
+ print unicode(result).encode(preferred_encoding, 'replace')
+ covact = int(opts.covers)
+ if covact == 1:
+ textcover = 'No cover found!'
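+ # check_for_cover() runs a second Nicebooks query for this ISBN and reports whether a cover image exists; nothing is downloaded in this branch.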
+ if check_for_cover(result.isbn): + textcover = 'A cover was found for this book' + print textcover + elif covact == 2: + cover_data, ext = cover_from_isbn(result.isbn) + cpath = result.isbn + if len(opts.coverspath): + cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) + oname = os.path.abspath(cpath+'.'+ext) + open(oname, 'wb').write(cover_data) + print 'Cover saved to file ', oname + print + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index bcd5e14a58..98e7b6023c 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -470,4 +470,4 @@ class MobiMLizer(object): bstate.vpadding += vpadding if bstate.nested and bstate.nested[-1].tag == elem.tag: bstate.nested.pop() - istates.pop() \ No newline at end of file + istates.pop() diff --git a/src/calibre/gui2/actions/preferences.py b/src/calibre/gui2/actions/preferences.py index d9957bd70d..be536ca4e4 100644 --- a/src/calibre/gui2/actions/preferences.py +++ b/src/calibre/gui2/actions/preferences.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -from PyQt4.Qt import QIcon, QMenu +from PyQt4.Qt import QIcon, QMenu, Qt from calibre.gui2.actions import InterfaceAction from calibre.gui2.preferences.main import Preferences @@ -41,5 +41,7 @@ class PreferencesAction(InterfaceAction): return d = Preferences(self.gui, initial_plugin=initial_plugin) d.show() + d.run_wizard_requested.connect(self.gui.run_wizard, + type=Qt.QueuedConnection) diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py index 8cc2965171..e193fe10b2 100644 --- a/src/calibre/gui2/book_details.py +++ b/src/calibre/gui2/book_details.py @@ -221,8 +221,6 @@ class BookInfo(QWebView): <style type="text/css"> body, td {background-color: %s; font-size: %dpx; color: %s } a { text-decoration: none; color: blue } - p { margin-top: .2em } - h3 { margin-bottom: .2em } </style> </head> <body> diff --git a/src/calibre/gui2/preferences/main.py b/src/calibre/gui2/preferences/main.py index c82ddcc022..fc01a33cf6 100644 --- a/src/calibre/gui2/preferences/main.py +++ b/src/calibre/gui2/preferences/main.py @@ -155,6 +155,8 @@ class Browser(QScrollArea): # {{{ class Preferences(QMainWindow): + run_wizard_requested = pyqtSignal() + def __init__(self, gui, initial_plugin=None): QMainWindow.__init__(self, gui) self.gui = gui @@ -195,6 +197,11 @@ class Preferences(QMainWindow): self.cw.setLayout(QVBoxLayout()) self.cw.layout().addWidget(self.stack) self.bb = QDialogButtonBox(QDialogButtonBox.Close) + self.wizard_button = self.bb.addButton(_('Run welcome wizard'), + self.bb.DestructiveRole) + self.wizard_button.setIcon(QIcon(I('wizard.png'))) + self.wizard_button.clicked.connect(self.run_wizard, + type=Qt.QueuedConnection) self.cw.layout().addWidget(self.bb) self.bb.rejected.connect(self.close, type=Qt.QueuedConnection) self.setCentralWidget(self.cw) @@ -240,6 +247,9 @@ class Preferences(QMainWindow): if plugin is not None: self.show_plugin(plugin) + def run_wizard(self): + self.close() + self.run_wizard_requested.emit() def show_plugin(self, plugin): self.showing_widget = plugin.create_widget(self.scroll_area) diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index 2ac0908ea9..4f418d34d5 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -38,6 +38,7 @@ class Device(object): name = 
'Default' manufacturer = 'Default' id = 'default' + supports_color = False @classmethod def set_output_profile(cls): @@ -56,6 +57,12 @@ class Device(object): def commit(cls): cls.set_output_profile() cls.set_output_format() + if cls.supports_color: + from calibre.ebooks.conversion.config import load_defaults, save_defaults + recs = load_defaults('comic_input') + recs['dont_grayscale'] = True + save_defaults('comic_input', recs) + class Kindle(Device): @@ -138,6 +145,12 @@ class Nook(Sony505): manufacturer = 'Barnes & Noble' output_profile = 'nook' +class NookColor(Nook): + id = 'nook_color' + name = 'Nook Color' + output_profile = 'nook_color' + supports_color = True + class CybookG3(Device): name = 'Cybook Gen 3' @@ -178,6 +191,7 @@ class iPhone(Device): output_format = 'EPUB' manufacturer = 'Apple' id = 'iphone' + supports_color = True class Android(Device): @@ -185,6 +199,7 @@ class Android(Device): output_format = 'EPUB' manufacturer = 'Android' id = 'android' + supports_color = True class HanlinV3(Device): @@ -354,11 +369,6 @@ class StanzaPage(QWizardPage, StanzaUI): return FinishPage.ID def commit(self): - from calibre.ebooks.conversion.config import load_defaults, save_defaults - recs = load_defaults('comic_input') - recs['dont_grayscale'] = True - save_defaults('comic_input', recs) - p = self.set_port() if p is not None: from calibre.library.server import server_config @@ -605,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI): self.emit(SIGNAL('retranslate()')) self.init_languages() try: - if prefs['language'].lower().startswith('zh'): - from calibre.customize.ui import enable_plugin - for name in ('Douban Books', 'Douban.com covers'): - enable_plugin(name) + lang = prefs['language'].lower()[:2] + metadata_plugins = { + 'zh' : ('Douban Books', 'Douban.com covers'), + 'fr' : ('Nicebooks', 'Nicebooks covers'), + }.get(lang, []) + from calibre.customize.ui import enable_plugin + for name in metadata_plugins: + enable_plugin(name) except: pass diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 6d18a2d663..8e7002097a 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -771,7 +771,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): except: # Can happen if path has not yet been set return False - return os.access(path, os.R_OK) + return os.path.exists(path) def remove_cover(self, id, notify=True): path = os.path.join(self.library_path, self.path(id, index_is_id=True), 'cover.jpg') diff --git a/src/calibre/trac/bzr_commit_plugin.py b/src/calibre/trac/bzr_commit_plugin.py index f2a40e6266..df6bf699d1 100644 --- a/src/calibre/trac/bzr_commit_plugin.py +++ b/src/calibre/trac/bzr_commit_plugin.py @@ -56,6 +56,7 @@ class cmd_commit(_cmd_commit): summary = self.get_trac_summary(bug, url) if summary: msg = msg.replace('#%s'%bug, '#%s (%s)'%(bug, summary)) + msg = msg.replace('Fixesed', 'Fixed') return msg, bug, url, action diff --git a/src/calibre/translations/nl.po b/src/calibre/translations/nl.po index f6dd336ba0..b4c0d2bba9 100644 --- a/src/calibre/translations/nl.po +++ b/src/calibre/translations/nl.po @@ -4345,7 +4345,7 @@ msgid "" "changed.<br><br>Please confirm you want to proceed." msgstr "" "Boekformaten en metagegevens van de selectie zullen toegevoegd worden aan " -"het <b>eerst geselecteerde boek</b>. ISBN zal <i>niet</i> samengevoegd " +"het <b>eerst geselecteerde boek</b> (%s). 
ISBN zal <i>niet</i> samengevoegd " "worden.<br><br>De geselecteerde boeken zullen niet verwijderd of aangepast " "worden.<br><br>Bevestig als je wilt doorgaan." @@ -4360,7 +4360,7 @@ msgid "" "you <b>sure</b> you want to proceed?" msgstr "" "Boekformaten en metagegevens van de selectie zullen toegevoegd worden aan " -"het <b>eerst geselecteerde boek</b>. ISBN zal <i>niet</i> samengevoegd " +"het <b>eerst geselecteerde boek</b> (%s). ISBN zal <i>niet</i> samengevoegd " "worden.<br><br>Na samenvoeging zullen de geselecteerde boeken van je " "computer <b>verwijderd</b> worden.<br><br>Weet je zeker dat je door wilt " "gaan?" diff --git a/src/calibre/utils/smtp.py b/src/calibre/utils/smtp.py index 8af31b5d38..4b7ec3f0a3 100644 --- a/src/calibre/utils/smtp.py +++ b/src/calibre/utils/smtp.py @@ -105,7 +105,10 @@ def sendmail(msg, from_, to, localhost=None, verbose=0, timeout=30, try: s.sendmail(from_, to, msg) finally: - ret = s.quit() + try: + ret = s.quit() + except: + pass # Ignore so as to not hide original error return ret def option_parser(): diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py index 012e24a799..5b34ddab0b 100644 --- a/src/calibre/web/feeds/recipes/collection.py +++ b/src/calibre/web/feeds/recipes/collection.py @@ -61,8 +61,10 @@ def serialize_recipe(urn, recipe_class): def serialize_collection(mapping_of_recipe_classes): collection = E.recipe_collection() - for urn, recipe_class in mapping_of_recipe_classes.items(): - recipe = serialize_recipe(urn, recipe_class) + for urn in sorted(mapping_of_recipe_classes.keys(), + key=lambda key: getattr(mapping_of_recipe_classes[key], 'title', + 'zzz')): + recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn]) collection.append(recipe) collection.set('count', str(len(collection))) return etree.tostring(collection, encoding='utf-8', xml_declaration=True,
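
The serialize_collection() change above orders recipes by each class's title attribute, with 'zzz' as the fallback key so classes that lack a title sort last; the comparison is plain case-sensitive string ordering, so uppercase titles sort ahead of lowercase ones. A minimal standalone sketch of that ordering follows, using hypothetical toy recipe classes and URNs rather than calibre's real ones.

# Standalone sketch of the ordering used by the new serialize_collection();
# the classes and URNs below are hypothetical stand-ins, not calibre code.
class BrandEins(object):
    title = 'brand eins'

class Nicebooks(object):
    title = 'Nicebooks'

class Untitled(object):
    pass  # no title attribute, so the 'zzz' fallback key applies

mapping_of_recipe_classes = {
    'custom:1': BrandEins,
    'custom:2': Nicebooks,
    'custom:3': Untitled,
}

ordered = sorted(mapping_of_recipe_classes.keys(),
                 key=lambda key: getattr(mapping_of_recipe_classes[key],
                                         'title', 'zzz'))
# Prints ['custom:2', 'custom:1', 'custom:3']: 'Nicebooks' sorts before
# 'brand eins' because the comparison is case-sensitive, and the title-less
# class comes last via the 'zzz' fallback.
print(ordered)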