From 2f28c8b31dc501027d9aa6acf496c5941736312b Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Wed, 29 May 2013 12:15:07 -0700 Subject: [PATCH 01/13] net/9p: add privport option to 9p tcp transport If the privport option is specified, the tcp transport binds local address to a reserved port before connecting to the 9p server. In some cases when 9P AUTH cannot be implemented, this is better than nothing. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- include/net/9p/transport.h | 3 +++ net/9p/trans_fd.c | 40 +++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index adcbb20f6511..9a36d9297114 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -26,6 +26,9 @@ #ifndef NET_9P_TRANSPORT_H #define NET_9P_TRANSPORT_H +#define P9_DEF_MIN_RESVPORT (665U) +#define P9_DEF_MAX_RESVPORT (1023U) + /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 02efb25c2957..3ffda1b3799b 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -63,6 +63,7 @@ struct p9_fd_opts { int rfd; int wfd; u16 port; + int privport; }; /** @@ -87,12 +88,15 @@ struct p9_trans_fd { enum { /* Options that take integer arguments */ Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, + /* Options that take no arguments */ + Opt_privport, }; static const match_table_t tokens = { {Opt_port, "port=%u"}, {Opt_rfdno, "rfdno=%u"}, {Opt_wfdno, "wfdno=%u"}, + {Opt_privport, "privport"}, {Opt_err, NULL}, }; @@ -161,6 +165,9 @@ static DEFINE_SPINLOCK(p9_poll_lock); static LIST_HEAD(p9_poll_pending_list); static DECLARE_WORK(p9_poll_work, p9_poll_workfn); +static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT; +static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT; + static void p9_mux_poll_stop(struct p9_conn *m) { unsigned long flags; @@ -741,7 +748,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) if (!*p) continue; token = match_token(p, tokens, args); - if (token != Opt_err) { + if ((token != Opt_err) && (token != Opt_privport)) { r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, @@ -759,6 +766,9 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) case Opt_wfdno: opts->wfd = option; break; + case Opt_privport: + opts->privport = 1; + break; default: continue; } @@ -898,6 +908,24 @@ static inline int valid_ipaddr4(const char *buf) return 0; } +static int p9_bind_privport(struct socket *sock) +{ + struct sockaddr_in cl; + int port, err = -EINVAL; + + memset(&cl, 0, sizeof(cl)); + cl.sin_family = AF_INET; + cl.sin_addr.s_addr = INADDR_ANY; + for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) { + cl.sin_port = htons((ushort)port); + err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl)); + if (err != -EADDRINUSE) + break; + } + return err; +} + + static int p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) { @@ -926,6 +954,16 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) return err; } + if (opts.privport) { + err = p9_bind_privport(csocket); + if (err < 0) { + pr_err("%s (%d): problem binding to privport\n", + __func__, task_pid_nr(current)); + sock_release(csocket); + return err; + } + } + err = csocket->ops->connect(csocket, (struct sockaddr *)&sin_server, sizeof(struct sockaddr_in), 0); From d9a738597faf7cd2edeec82ce8fd81969fed8390 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Wed, 29 May 2013 12:09:39 -0700 Subject: [PATCH 02/13] fs/9p: xattr: add trusted and security namespaces Allow requests for security.* and trusted.* xattr name spaces to pass through to server. The new files are 99% cut and paste from fs/9p/xattr_user.c with the namespaces changed. It has the intended effect in superficial testing. I do not know much detail about how these namespaces are used, but passing them through to the server, which can decide whether to handle them or not, seems reasonable. I want to support a use case where an ext4 file system is mounted via 9P, then re-exported via samba to windows clients in a cluster. Windows wants to store xattrs such as security.NTACL. This works when ext4 directly backs samba, but not when 9P is inserted. This use case is documented here: http://code.google.com/p/diod/issues/detail?id=95 Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- fs/9p/Kconfig | 13 +++++++ fs/9p/Makefile | 4 ++- fs/9p/xattr.c | 4 +++ fs/9p/xattr.h | 2 ++ fs/9p/xattr_security.c | 80 ++++++++++++++++++++++++++++++++++++++++++ fs/9p/xattr_trusted.c | 80 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 fs/9p/xattr_security.c create mode 100644 fs/9p/xattr_trusted.c diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 55abfd62654a..6489e1fc1afd 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL If you don't know what Access Control Lists are, say N endif + + +config 9P_FS_SECURITY + bool "9P Security Labels" + depends on 9P_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the 9P filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/9p/Makefile b/fs/9p/Makefile index ab8c12780634..ff7be98f84f2 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o v9fs.o \ fid.o \ xattr.o \ - xattr_user.o + xattr_user.o \ + xattr_trusted.o 9p-$(CONFIG_9P_FSCACHE) += cache.o 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o +9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index c45e016b190f..3c28cdfb8c47 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, + &v9fs_xattr_trusted_handler, #ifdef CONFIG_9P_FS_POSIX_ACL &v9fs_xattr_acl_access_handler, &v9fs_xattr_acl_default_handler, +#endif +#ifdef CONFIG_9P_FS_SECURITY + &v9fs_xattr_security_handler, #endif NULL }; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index eec348a3df71..d3e2ea3840be 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -20,6 +20,8 @@ extern const struct xattr_handler *v9fs_xattr_handlers[]; extern struct xattr_handler v9fs_xattr_user_handler; +extern struct xattr_handler v9fs_xattr_trusted_handler; +extern struct xattr_handler v9fs_xattr_security_handler; extern const struct xattr_handler v9fs_xattr_acl_access_handler; extern const struct xattr_handler v9fs_xattr_acl_default_handler; diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c new file mode 100644 index 000000000000..cb247a142a6e --- /dev/null +++ b/fs/9p/xattr_security.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = v9fs_xattr_security_get, + .set = v9fs_xattr_security_set, +}; diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c new file mode 100644 index 000000000000..e30d33b8a3fb --- /dev/null +++ b/fs/9p/xattr_trusted.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include +#include +#include +#include +#include "xattr.h" + +static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = v9fs_xattr_trusted_get, + .set = v9fs_xattr_trusted_set, +}; From ea071aa1365eaf8a79b33bd8699cb0811dcddf34 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:34 +0200 Subject: [PATCH 03/13] 9P: Fix fcall allocation for rdma The current code assumes that when a request in the request array does have a tc, it also has a rc. This is normally true, but not always : when using RDMA, req->rc will temporarily be set to NULL after the request has been sent. That is usually OK though, as when the reply arrives, req->rc will be reassigned to a sane value before the request is recycled. But there is a catch : if the request is flushed, the reply will never arrive, and req->rc will be NULL, but not req->tc. This patch fixes p9_tag_alloc to take this into account. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 01f1779eba80..5828769d1f3d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -258,27 +258,25 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) req = &c->reqs[row][col]; if (!req->tc) { req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); - if (!req->wq) { - pr_err("Couldn't grow tag array\n"); - return ERR_PTR(-ENOMEM); - } + if (!req->wq) + goto grow_failed; + init_waitqueue_head(req->wq); req->tc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); + if (!req->tc) + goto grow_failed; + + req->tc->capacity = alloc_msize; + req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall); + } + if (!req->rc) { req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); - if ((!req->tc) || (!req->rc)) { - pr_err("Couldn't grow tag array\n"); - kfree(req->tc); - kfree(req->rc); - kfree(req->wq); - req->tc = req->rc = NULL; - req->wq = NULL; - return ERR_PTR(-ENOMEM); - } - req->tc->capacity = alloc_msize; + if (!req->rc) + goto grow_failed; + req->rc->capacity = alloc_msize; - req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall); req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); } @@ -288,7 +286,16 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) req->tc->tag = tag-1; req->status = REQ_STATUS_ALLOC; - return &c->reqs[row][col]; + return req; + +grow_failed: + pr_err("Couldn't grow tag array\n"); + kfree(req->tc); + kfree(req->rc); + kfree(req->wq); + req->tc = req->rc = NULL; + req->wq = NULL; + return ERR_PTR(-ENOMEM); } /** From 17b6fd9d6dfa0faed3a25a6045f7456821ea140a Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:35 +0200 Subject: [PATCH 04/13] 9P/RDMA: rdma_request() needs not allocate req->rc p9_tag_alloc() takes care of that. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 2c69ddd691a1..b1dfdf2078ff 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -427,26 +427,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = -ENOMEM; goto err_close; } - - /* - * If the request has a buffer, steal it, otherwise - * allocate a new one. Typically, requests should already - * have receive buffers allocated and just swap them around - */ - if (!req->rc) { - req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, - GFP_NOFS); - if (req->rc) { - req->rc->sdata = (char *) req->rc + - sizeof(struct p9_fcall); - req->rc->capacity = client->msize; - } - } rpl_context->rc = req->rc; - if (!rpl_context->rc) { - err = -ENOMEM; - goto err_free2; - } /* * Post a receive buffer for this request. We need to ensure From 5387320d4814aa1e40b50529d960a8f2b3340535 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:36 +0200 Subject: [PATCH 05/13] 9pnet: refactor struct p9_fcall alloc code Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 5828769d1f3d..db5bf2480a33 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -204,6 +204,17 @@ free_and_return: return ret; } +struct p9_fcall *p9_fcall_alloc(int alloc_msize) +{ + struct p9_fcall *fc; + fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); + if (!fc) + return NULL; + fc->capacity = alloc_msize; + fc->sdata = (char *) fc + sizeof(struct p9_fcall); + return fc; +} + /** * p9_tag_alloc - lookup/allocate a request by tag * @c: client session to lookup tag within @@ -256,29 +267,19 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) col = tag % P9_ROW_MAXTAG; req = &c->reqs[row][col]; - if (!req->tc) { + if (!req->wq) { req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) goto grow_failed; - init_waitqueue_head(req->wq); - req->tc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, - GFP_NOFS); - if (!req->tc) - goto grow_failed; - - req->tc->capacity = alloc_msize; - req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall); } - if (!req->rc) { - req->rc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, - GFP_NOFS); - if (!req->rc) - goto grow_failed; - req->rc->capacity = alloc_msize; - req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); - } + if (!req->tc) + req->tc = p9_fcall_alloc(alloc_msize); + if (!req->rc) + req->rc = p9_fcall_alloc(alloc_msize); + if (!req->tc || !req->rc) + goto grow_failed; p9pdu_reset(req->tc); p9pdu_reset(req->rc); From 3fcc62f4e8620fd5f85f957a5e708e69a20adb51 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:37 +0200 Subject: [PATCH 06/13] 9P/RDMA: increase P9_RDMA_MAXSIZE to 1MB The current value is too low to get good performance. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b1dfdf2078ff..b8b66d38f5b0 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -57,9 +57,7 @@ #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ -#define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can - * safely advertise a maxsize - * of 64k */ +#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** * struct p9_trans_rdma - RDMA transport instance From 47229ff85e5a0b0613df2288d212938aeb9687da Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:38 +0200 Subject: [PATCH 07/13] 9P/RDMA: Protect against duplicate replies A well-behaved server would not send twice the reply to a request. But if it ever happens... This additional check prevents the kernel from leaking memory and possibly more nasty consequences in that unlikely event. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b8b66d38f5b0..274a9c1d3c3d 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -294,6 +294,13 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, if (!req) goto err_out; + /* Check that we have not yet received a reply for this request. + */ + if (unlikely(req->rc)) { + pr_err("Duplicate reply for request %d", tag); + goto err_out; + } + req->rc = c->rc; req->status = REQ_STATUS_RCVD; p9_client_cb(client, req); From fd453d0ed6c1dacef8eff466df473d62d63db1e9 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:39 +0200 Subject: [PATCH 08/13] 9P/RDMA: Use a semaphore to protect the RQ The current code keeps track of the number of buffers posted in the RQ, and will prevent it from overflowing. But it does so by simply dropping post requests (And leaking memory in the process). When this happens there will actually be too few buffers posted, and soon the 9P server will complain about 'RNR retry counter exceeded' errors. Instead, use a semaphore, and block until the RQ is ready for another buffer to be posted. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 274a9c1d3c3d..ad8dc331574b 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -73,7 +73,7 @@ * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. - * @rq_count: Count of requests in the Receive Queue. + * @rq_sem: Semaphore for the RQ * @addr: The remote peer's address * @req_lock: Protects the active request list * @cm_done: Completion event for connection management tracking @@ -98,7 +98,7 @@ struct p9_trans_rdma { int sq_depth; struct semaphore sq_sem; int rq_depth; - atomic_t rq_count; + struct semaphore rq_sem; struct sockaddr_in addr; spinlock_t req_lock; @@ -341,8 +341,8 @@ static void cq_comp_handler(struct ib_cq *cq, void *cq_context) switch (c->wc_op) { case IB_WC_RECV: - atomic_dec(&rdma->rq_count); handle_recv(client, rdma, c, wc.status, wc.byte_len); + up(&rdma->rq_sem); break; case IB_WC_SEND: @@ -441,12 +441,14 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) * outstanding request, so we must keep a count to avoid * overflowing the RQ. */ - if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { - err = post_recv(client, rpl_context); - if (err) - goto err_free1; - } else - atomic_dec(&rdma->rq_count); + if (down_interruptible(&rdma->rq_sem)) + goto error; /* FIXME : -EINTR instead */ + + err = post_recv(client, rpl_context); + if (err) { + p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); + goto err_free1; + } /* remove posted receive buffer from request structure */ req->rc = NULL; @@ -537,7 +539,7 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) spin_lock_init(&rdma->req_lock); init_completion(&rdma->cm_done); sema_init(&rdma->sq_sem, rdma->sq_depth); - atomic_set(&rdma->rq_count, 0); + sema_init(&rdma->rq_sem, rdma->rq_depth); return rdma; } From b530e252e291c27fdcb1b73c72ad17f75c8bdba6 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:40 +0200 Subject: [PATCH 09/13] 9P/RDMA: Do not free req->rc in error handling in rdma_request() rdma_request() should never be in charge of freeing rc. When an error occurs: * Either the rc buffer has been recv_post()'ed. then kfree()'ing it certainly is a bad idea. * Or is has not, and in that case req->rc still points to it, hence it needs not be freed. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index ad8dc331574b..1bd4c7150114 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -447,7 +447,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = post_recv(client, rpl_context); if (err) { p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); - goto err_free1; + goto err_free; } /* remove posted receive buffer from request structure */ @@ -457,7 +457,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; - goto err_free1; + goto err_free; } c->req = req; @@ -486,13 +486,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) error: kfree(c); - kfree(rpl_context->rc); kfree(rpl_context); p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; - err_free1: - kfree(rpl_context->rc); - err_free2: + err_free: kfree(rpl_context); err_close: spin_lock_irqsave(&rdma->req_lock, flags); From 2f52d07cb75d96fcbb5b9ab72938590fa9ffb19d Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:41 +0200 Subject: [PATCH 10/13] 9P/RDMA: Improve error handling in rdma_request Most importantly: - do not free the recv context (rpl_context) after a successful post_recv() - but do free the send context (c) after a failed send. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_rdma.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 1bd4c7150114..926e72d00e57 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -430,7 +430,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; - goto err_close; + goto recv_error; } rpl_context->rc = req->rc; @@ -441,13 +441,15 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) * outstanding request, so we must keep a count to avoid * overflowing the RQ. */ - if (down_interruptible(&rdma->rq_sem)) - goto error; /* FIXME : -EINTR instead */ + if (down_interruptible(&rdma->rq_sem)) { + err = -EINTR; + goto recv_error; + } err = post_recv(client, rpl_context); if (err) { p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); - goto err_free; + goto recv_error; } /* remove posted receive buffer from request structure */ @@ -457,15 +459,17 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; - goto err_free; + goto send_error; } c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, c->req->tc->sdata, c->req->tc->size, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) - goto error; + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { + err = -EIO; + goto send_error; + } sge.addr = c->busa; sge.length = c->req->tc->size; @@ -479,19 +483,27 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) wr.sg_list = &sge; wr.num_sge = 1; - if (down_interruptible(&rdma->sq_sem)) - goto error; + if (down_interruptible(&rdma->sq_sem)) { + err = -EINTR; + goto send_error; + } - return ib_post_send(rdma->qp, &wr, &bad_wr); + err = ib_post_send(rdma->qp, &wr, &bad_wr); + if (err) + goto send_error; - error: + /* Success */ + return 0; + + /* Handle errors that happened during or while preparing the send: */ + send_error: kfree(c); + p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); + return err; + + /* Handle errors that happened during or while preparing post_recv(): */ + recv_error: kfree(rpl_context); - p9_debug(P9_DEBUG_ERROR, "EIO\n"); - return -EIO; - err_free: - kfree(rpl_context); - err_close: spin_lock_irqsave(&rdma->req_lock, flags); if (rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; From 1cff33069a4a1ac9ed080756113ecd17ad408282 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:42 +0200 Subject: [PATCH 11/13] 9P/RDMA: count posted buffers without a pending request In rdma_request(): If an error occurs between posting the recv and the send, there will be a reply context posted without a pending request. Since there is no way to "un-post" it, we remember it and skip post_recv() for the next request. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 6 ++++-- net/9p/trans_rdma.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index db5bf2480a33..d18a0b22f62c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -656,8 +656,10 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) return PTR_ERR(req); - /* if we haven't received a response for oldreq, - remove it from the list. */ + /* + * if we haven't received a response for oldreq, + * remove it from the list. + */ spin_lock(&c->lock); if (oldreq->status == REQ_STATUS_FLSH) list_del(&oldreq->req_list); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 926e72d00e57..8f68df5d2973 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -74,6 +74,8 @@ * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. * @rq_sem: Semaphore for the RQ + * @excess_rc : Amount of posted Receive Contexts without a pending request. + * See rdma_request() * @addr: The remote peer's address * @req_lock: Protects the active request list * @cm_done: Completion event for connection management tracking @@ -99,6 +101,7 @@ struct p9_trans_rdma { struct semaphore sq_sem; int rq_depth; struct semaphore rq_sem; + atomic_t excess_rc; struct sockaddr_in addr; spinlock_t req_lock; @@ -426,6 +429,26 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) struct p9_rdma_context *c = NULL; struct p9_rdma_context *rpl_context = NULL; + /* When an error occurs between posting the recv and the send, + * there will be a receive context posted without a pending request. + * Since there is no way to "un-post" it, we remember it and skip + * post_recv() for the next request. + * So here, + * see if we are this `next request' and need to absorb an excess rc. + * If yes, then drop and free our own, and do not recv_post(). + **/ + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one ! */ + kfree(req->rc); + req->rc = NULL; + goto dont_need_post_recv; + } else { + /* We raced and lost. */ + atomic_inc(&rdma->excess_rc); + } + } + /* Allocate an fcall for the reply */ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { @@ -451,10 +474,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); goto recv_error; } - /* remove posted receive buffer from request structure */ req->rc = NULL; +dont_need_post_recv: /* Post the request */ c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { @@ -499,6 +522,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) send_error: kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); + + /* Ach. + * We did recv_post(), but not send. We have one recv_post in excess. + */ + atomic_inc(&rdma->excess_rc); return err; /* Handle errors that happened during or while preparing post_recv(): */ @@ -549,6 +577,7 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) init_completion(&rdma->cm_done); sema_init(&rdma->sq_sem, rdma->sq_depth); sema_init(&rdma->rq_sem, rdma->rq_depth); + atomic_set(&rdma->excess_rc, 0); return rdma; } From 80b45261a0b263536b043c5ccfc4ba4fc27c2acc Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 21 Jun 2013 15:32:43 +0200 Subject: [PATCH 12/13] 9P: Add cancelled() to the transport functions. RDMA needs to post a buffer for each incoming reply. Hence it needs to keep count of these and needs to be aware of whether a flushed request has received a reply or not. This patch adds the cancelled() callback to the transport modules. It is called when RFLUSH has been received and that the corresponding request will never receive a reply. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- include/net/9p/transport.h | 3 +++ net/9p/client.c | 12 +++++++++--- net/9p/trans_rdma.c | 11 +++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 9a36d9297114..d9fa68f26c41 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -40,6 +40,8 @@ * @close: member function to discard a connection on this transport * @request: member function to issue a request to the transport * @cancel: member function to cancel a request (if it hasn't been sent) + * @cancelled: member function to notify that a cancelled request will not + * not receive a reply * * This is the basic API for a transport module which is registered by the * transport module with the 9P core network module and used by the client @@ -58,6 +60,7 @@ struct p9_trans_module { void (*close) (struct p9_client *); int (*request) (struct p9_client *, struct p9_req_t *req); int (*cancel) (struct p9_client *, struct p9_req_t *req); + int (*cancelled)(struct p9_client *, struct p9_req_t *req); int (*zc_request)(struct p9_client *, struct p9_req_t *, char *, char *, int , int, int, int); }; diff --git a/net/9p/client.c b/net/9p/client.c index d18a0b22f62c..8b93cae2d11d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -658,12 +658,18 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) /* * if we haven't received a response for oldreq, - * remove it from the list. + * remove it from the list, and notify the transport + * layer that the reply will never arrive. */ spin_lock(&c->lock); - if (oldreq->status == REQ_STATUS_FLSH) + if (oldreq->status == REQ_STATUS_FLSH) { list_del(&oldreq->req_list); - spin_unlock(&c->lock); + spin_unlock(&c->lock); + if (c->trans_mod->cancelled) + c->trans_mod->cancelled(c, req); + } else { + spin_unlock(&c->lock); + } p9_free_req(c, req); return 0; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 8f68df5d2973..928f2bb9bf8d 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -588,6 +588,17 @@ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } +/* A request has been fully flushed without a reply. + * That means we have posted one buffer in excess. + */ +static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + + atomic_inc(&rdma->excess_rc); + return 0; +} + /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance From f2692ea8d5b535277bc06b315eabd32ef4e7a11c Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 28 Jun 2013 12:44:13 -0500 Subject: [PATCH 13/13] fs/9p: Remove the unused variable "err" in v9fs_vfs_getattr() Delete the unused variable "err" in v9fs_vfs_getattr() Signed-off-by: Gu Zheng Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d86edc8d3fd0..25b018efb8ab 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1054,13 +1054,11 @@ static int v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_wstat *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(dentry->d_inode, stat);