diff options
Diffstat (limited to 'drivers/staging/lustre')
190 files changed, 10666 insertions, 8358 deletions
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h index 3f6447c65042..3b92d38d37e2 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -138,8 +138,8 @@ struct lnet_debugfs_symlink_def { void lustre_insert_debugfs(struct ctl_table *table, const struct lnet_debugfs_symlink_def *symlinks); int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, - loff_t pos, void __user *buffer, int len)); + void __user *buffer, size_t *lenp, + int (*handler)(void *data, int write, loff_t pos, + void __user *buffer, int len)); #endif /* _LIBCFS_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h index 25adab19fd86..b7bd6e8ab33f 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h @@ -247,19 +247,19 @@ do { \ #define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, - const char *format1, ...) + const char *format1, ...) __printf(2, 3); int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, - const char *format1, - va_list args, const char *format2, ...) + const char *format1, + va_list args, const char *format2, ...) __printf(4, 5); /* other external symbols that tracefile provides: */ int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, - const char __user *usr_buffer, int usr_buffer_nob); + const char __user *usr_buffer, int usr_buffer_nob); int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, - const char *knl_buffer, char *append); + const char *knl_buffer, char *append); #define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h index d3f9a6020ee3..bdbbe934584c 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h @@ -143,6 +143,9 @@ static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) #define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_ORSET) +#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_RESET) + #define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index 4daa3823f60a..e0e1a5d0949d 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -310,13 +310,13 @@ do { \ #define MKSTR(ptr) ((ptr)) ? (ptr) : "" -static inline int cfs_size_round4(int val) +static inline size_t cfs_size_round4(int val) { return (val + 3) & (~0x3); } #ifndef HAVE_CFS_SIZE_ROUND -static inline int cfs_size_round(int val) +static inline size_t cfs_size_round(int val) { return (val + 7) & (~0x7); } @@ -324,17 +324,17 @@ static inline int cfs_size_round(int val) #define HAVE_CFS_SIZE_ROUND #endif -static inline int cfs_size_round16(int val) +static inline size_t cfs_size_round16(int val) { return (val + 0xf) & (~0xf); } -static inline int cfs_size_round32(int val) +static inline size_t cfs_size_round32(int val) { return (val + 0x1f) & (~0x1f); } -static inline int cfs_size_round0(int val) +static inline size_t cfs_size_round0(int val) { if (!val) return 0; @@ -343,7 +343,7 @@ static inline int cfs_size_round0(int val) static inline size_t cfs_round_strlen(char *fset) { - return (size_t)cfs_size_round((int)strlen(fset) + 1); + return cfs_size_round((int)strlen(fset) + 1); } #define LOGL(var, len, ptr) \ @@ -360,13 +360,4 @@ do { \ ptr += cfs_size_round(len); \ } while (0) -#define LOGL0(var, len, ptr) \ -do { \ - if (!len) \ - break; \ - memcpy((char *)ptr, (const char *)var, len); \ - *((char *)(ptr) + len) = 0; \ - ptr += cfs_size_round(len + 1); \ -} while (0) - #endif diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h index 513a8225f888..a59c5e99cbd3 100644 --- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h @@ -605,73 +605,20 @@ void lnet_counters_reset(void); unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); int lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, + int src_niov, const struct kvec *src, unsigned int offset, unsigned int len); unsigned int lnet_kiov_nob(unsigned int niov, lnet_kiov_t *iov); int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, + int src_niov, const lnet_kiov_t *src, unsigned int offset, unsigned int len); -void lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, - unsigned int doffset, - unsigned int nsiov, struct kvec *siov, +void lnet_copy_iov2iter(struct iov_iter *to, + unsigned int nsiov, const struct kvec *siov, unsigned int soffset, unsigned int nob); -void lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, - unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, +void lnet_copy_kiov2iter(struct iov_iter *to, + unsigned int nkiov, const lnet_kiov_t *kiov, unsigned int kiovoffset, unsigned int nob); -void lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, - unsigned int kiovoffset, - unsigned int niov, struct kvec *iov, - unsigned int iovoffset, unsigned int nob); -void lnet_copy_kiov2kiov(unsigned int ndkiov, lnet_kiov_t *dkiov, - unsigned int doffset, - unsigned int nskiov, lnet_kiov_t *skiov, - unsigned int soffset, unsigned int nob); - -static inline void -lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, - unsigned int nsiov, struct kvec *siov, unsigned int soffset, - unsigned int nob) -{ - struct kvec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen}; - - lnet_copy_iov2iov(1, &diov, doffset, - nsiov, siov, soffset, nob); -} - -static inline void -lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *skiov, - unsigned int soffset, unsigned int nob) -{ - struct kvec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen}; - - lnet_copy_kiov2iov(1, &diov, doffset, - nsiov, skiov, soffset, nob); -} - -static inline void -lnet_copy_flat2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, - int slen, void *src, unsigned int soffset, unsigned int nob) -{ - struct kvec siov = {/*.iov_base = */ src, /*.iov_len = */slen}; - - lnet_copy_iov2iov(ndiov, diov, doffset, - 1, &siov, soffset, nob); -} - -static inline void -lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, - unsigned int doffset, int slen, void *src, - unsigned int soffset, unsigned int nob) -{ - struct kvec siov = {/* .iov_base = */ src, /* .iov_len = */ slen}; - - lnet_copy_iov2kiov(ndiov, dkiov, doffset, - 1, &siov, soffset, nob); -} void lnet_me_unlink(lnet_me_t *me); diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h index 7967b013cbae..b84a5bb9186c 100644 --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -220,10 +220,7 @@ typedef struct lnet_lnd { * credit if the LND does flow control. */ int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, - int delayed, unsigned int niov, - struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, - unsigned int rlen); + int delayed, struct iov_iter *to, unsigned int rlen); /* * lnet_parse() has had to delay processing of this message @@ -278,6 +275,8 @@ typedef struct lnet_ni { struct lnet_ioctl_config_lnd_tunables *ni_lnd_tunables; /* equivalent interfaces to use */ char *ni_interfaces[LNET_MAX_INTERFACES]; + /* original net namespace */ + struct net *ni_net_ns; } lnet_ni_t; #define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h index e098b6c086e1..f8be0e2f7bf7 100644 --- a/drivers/staging/lustre/include/linux/lnet/types.h +++ b/drivers/staging/lustre/include/linux/lnet/types.h @@ -503,21 +503,7 @@ typedef struct { /* NB lustre portals uses struct iovec internally! */ typedef struct iovec lnet_md_iovec_t; -/** - * A page-based fragment of a MD. - */ -typedef struct { - /** Pointer to the page where the fragment resides */ - struct page *kiov_page; - /** Length in bytes of the fragment */ - unsigned int kiov_len; - /** - * Starting offset of the fragment within the page. Note that the - * end of the fragment must not pass the end of the page; i.e., - * kiov_len + kiov_offset <= PAGE_SIZE. - */ - unsigned int kiov_offset; -} lnet_kiov_t; +typedef struct bio_vec lnet_kiov_t; /** @} lnet_md */ /** \addtogroup lnet_eq diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c index 4f5978b3767b..c7a5d49e487f 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -128,6 +128,7 @@ static int kiblnd_msgtype2size(int type) static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) { struct kib_rdma_desc *rd; + int msg_size; int nob; int n; int i; @@ -146,12 +147,6 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) n = rd->rd_nfrags; - if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { - CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", - n, IBLND_MAX_RDMA_FRAGS); - return 1; - } - nob = offsetof(struct kib_msg, ibm_u) + kiblnd_rd_msg_size(rd, msg->ibm_type, n); @@ -161,6 +156,13 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) return 1; } + msg_size = kiblnd_rd_size(rd); + if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) { + CERROR("Bad msg_size: %d, should be 0 < n <= %d\n", + msg_size, LNET_MAX_PAYLOAD); + return 1; + } + if (!flip) return 0; @@ -618,7 +620,7 @@ static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) } struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid, - int state, int version) + int state, int version) { /* * CAVEAT EMPTOR: diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h index 078a0c3e8845..14576977200f 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h @@ -113,8 +113,9 @@ extern struct kib_tunables kiblnd_tunables; #define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) #define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) -#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */ -#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ +#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */ +#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */ /************************/ /* derived constants... */ @@ -133,8 +134,8 @@ extern struct kib_tunables kiblnd_tunables; /* WRs and CQEs (per connection) */ #define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) #define IBLND_SEND_WRS(c) \ - ((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \ - c->ibc_peer->ibp_ni)) + (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \ + kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni)) #define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) struct kib_hca_dev; @@ -582,6 +583,8 @@ struct kib_peer { unsigned short ibp_connecting; /* reconnect this peer later */ unsigned short ibp_reconnecting:1; + /* counter of how many times we triggered a conn race */ + unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; /* errno on closing this peer */ @@ -607,14 +610,14 @@ kiblnd_cfg_rdma_frags(struct lnet_ni *ni) tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; mod = tunables->lnd_map_on_demand; - return mod ? mod : IBLND_MAX_RDMA_FRAGS; + return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT; } static inline int kiblnd_rdma_frags(int version, struct lnet_ni *ni) { return version == IBLND_MSG_VERSION_1 ? - IBLND_MAX_RDMA_FRAGS : + (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) : kiblnd_cfg_rdma_frags(ni); } @@ -1034,5 +1037,4 @@ int kiblnd_post_rx(struct kib_rx *rx, int credit); int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); + struct iov_iter *to, unsigned int rlen); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 596a697b9d39..b27de8888149 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -36,16 +36,19 @@ #include "o2iblnd.h" +#define MAX_CONN_RACES_BEFORE_ABORT 20 + static void kiblnd_peer_alive(struct kib_peer *peer); static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error); -static void kiblnd_check_sends(struct kib_conn *conn); static void kiblnd_init_tx_msg(lnet_ni_t *ni, struct kib_tx *tx, - int type, int body_nob); + int type, int body_nob); static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie); + int resid, struct kib_rdma_desc *dstrd, + __u64 dstcookie); static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); static void kiblnd_unmap_tx(lnet_ni_t *ni, struct kib_tx *tx); +static void kiblnd_check_sends_locked(struct kib_conn *conn); static void kiblnd_tx_done(lnet_ni_t *ni, struct kib_tx *tx) @@ -211,9 +214,9 @@ kiblnd_post_rx(struct kib_rx *rx, int credit) conn->ibc_outstanding_credits++; else conn->ibc_reserved_credits++; + kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); - kiblnd_check_sends(conn); out: kiblnd_conn_decref(conn); return rc; @@ -344,8 +347,8 @@ kiblnd_handle_rx(struct kib_rx *rx) !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ conn->ibc_outstanding_credits++; + kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); - kiblnd_check_sends(conn); } switch (msg->ibm_type) { @@ -648,7 +651,7 @@ static int kiblnd_map_tx(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc static int kiblnd_setup_rd_iov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, - unsigned int niov, struct kvec *iov, int offset, int nob) + unsigned int niov, const struct kvec *iov, int offset, int nob) { struct kib_net *net = ni->ni_data; struct page *page; @@ -705,7 +708,7 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, static int kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) + int nkiov, const lnet_kiov_t *kiov, int offset, int nob) { struct kib_net *net = ni->ni_data; struct scatterlist *sg; @@ -717,8 +720,8 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, LASSERT(nkiov > 0); LASSERT(net); - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; nkiov--; kiov++; LASSERT(nkiov > 0); @@ -728,10 +731,10 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, do { LASSERT(nkiov > 0); - fragnob = min((int)(kiov->kiov_len - offset), nob); + fragnob = min((int)(kiov->bv_len - offset), nob); - sg_set_page(sg, kiov->kiov_page, fragnob, - kiov->kiov_offset + offset); + sg_set_page(sg, kiov->bv_page, fragnob, + kiov->bv_offset + offset); sg = sg_next(sg); if (!sg) { CERROR("lacking enough sg entries to map tx\n"); @@ -761,7 +764,6 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) LASSERT(tx->tx_queued); /* We rely on this for QP sizing */ LASSERT(tx->tx_nwrq > 0); - LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); LASSERT(!credit || credit == 1); LASSERT(conn->ibc_outstanding_credits >= 0); @@ -800,7 +802,7 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { /* * OK to drop when posted enough NOOPs, since - * kiblnd_check_sends will queue NOOP again when + * kiblnd_check_sends_locked will queue NOOP again when * posted NOOPs complete */ spin_unlock(&conn->ibc_lock); @@ -905,7 +907,7 @@ kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) } static void -kiblnd_check_sends(struct kib_conn *conn) +kiblnd_check_sends_locked(struct kib_conn *conn) { int ver = conn->ibc_version; lnet_ni_t *ni = conn->ibc_peer->ibp_ni; @@ -918,8 +920,6 @@ kiblnd_check_sends(struct kib_conn *conn) return; } - spin_lock(&conn->ibc_lock); - LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni)); LASSERT(!IBLND_OOB_CAPABLE(ver) || conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); @@ -969,8 +969,6 @@ kiblnd_check_sends(struct kib_conn *conn) if (kiblnd_post_tx_locked(conn, tx, credit)) break; } - - spin_unlock(&conn->ibc_lock); } static void @@ -1016,16 +1014,11 @@ kiblnd_tx_complete(struct kib_tx *tx, int status) if (idle) list_del(&tx->tx_list); - kiblnd_conn_addref(conn); /* 1 ref for me.... */ - + kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); if (idle) kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); - - kiblnd_check_sends(conn); - - kiblnd_conn_decref(conn); /* ...until here */ } static void @@ -1078,6 +1071,15 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE); + if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) { + CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags << PAGE_SHIFT, + kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd)); + rc = -EMSGSIZE; + goto too_big; + } + while (resid > 0) { if (srcidx >= srcrd->rd_nfrags) { CERROR("Src buffer exhausted: %d frags\n", srcidx); @@ -1091,10 +1093,10 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, break; } - if (tx->tx_nwrq >= conn->ibc_max_frags) { + if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) { CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_max_frags, + IBLND_MAX_RDMA_FRAGS, srcidx, srcrd->rd_nfrags, dstidx, dstrd->rd_nfrags); rc = -EMSGSIZE; @@ -1132,7 +1134,7 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, wrq++; sge++; } - +too_big: if (rc < 0) /* no RDMA if completing with failure */ tx->tx_nwrq = 0; @@ -1204,9 +1206,8 @@ kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) { spin_lock(&conn->ibc_lock); kiblnd_queue_tx_locked(tx, conn); + kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); - - kiblnd_check_sends(conn); } static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, @@ -1499,6 +1500,7 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; + struct iov_iter from; struct kib_msg *ibmsg; struct kib_rdma_desc *rd; struct kib_tx *tx; @@ -1518,6 +1520,17 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) /* payload is either all vaddrs or all pages */ LASSERT(!(payload_kiov && payload_iov)); + if (payload_kiov) + iov_iter_bvec(&from, ITER_BVEC | WRITE, + payload_kiov, payload_niov, + payload_nob + payload_offset); + else + iov_iter_kvec(&from, ITER_KVEC | WRITE, + payload_iov, payload_niov, + payload_nob + payload_offset); + + iov_iter_advance(&from, payload_offset); + switch (type) { default: LBUG(); @@ -1637,17 +1650,8 @@ kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) ibmsg = tx->tx_msg; ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - if (payload_kiov) - lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - payload_niov, payload_iov, - payload_offset, payload_nob); - + copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, IBLND_MSG_SIZE, + &from); nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); @@ -1719,8 +1723,7 @@ kiblnd_reply(lnet_ni_t *ni, struct kib_rx *rx, lnet_msg_t *lntmsg) int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) + struct iov_iter *to, unsigned int rlen) { struct kib_rx *rx = private; struct kib_msg *rxmsg = rx->rx_msg; @@ -1730,10 +1733,9 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, int post_credit = IBLND_POSTRX_PEER_CREDIT; int rc = 0; - LASSERT(mlen <= rlen); + LASSERT(iov_iter_count(to) <= rlen); LASSERT(!in_interrupt()); /* Either all pages or all vaddrs */ - LASSERT(!(kiov && iov)); switch (rxmsg->ibm_type) { default: @@ -1749,16 +1751,8 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, break; } - if (kiov) - lnet_copy_flat2kiov(niov, kiov, offset, - IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - mlen); - else - lnet_copy_flat2iov(niov, iov, offset, - IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - mlen); + copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, + IBLND_MSG_SIZE, to); lnet_finalize(ni, lntmsg, 0); break; @@ -1766,7 +1760,7 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, struct kib_msg *txmsg; struct kib_rdma_desc *rd; - if (!mlen) { + if (!iov_iter_count(to)) { lnet_finalize(ni, lntmsg, 0); kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, rxmsg->ibm_u.putreq.ibprm_cookie); @@ -1784,12 +1778,16 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, txmsg = tx->tx_msg; rd = &txmsg->ibm_u.putack.ibpam_rd; - if (!kiov) + if (!(to->type & ITER_BVEC)) rc = kiblnd_setup_rd_iov(ni, tx, rd, - niov, iov, offset, mlen); + to->nr_segs, to->kvec, + to->iov_offset, + iov_iter_count(to)); else rc = kiblnd_setup_rd_kiov(ni, tx, rd, - niov, kiov, offset, mlen); + to->nr_segs, to->bvec, + to->iov_offset, + iov_iter_count(to)); if (rc) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); @@ -2183,14 +2181,11 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) return; } - /** - * refcount taken by cmid is not reliable after I released the glock - * because this connection is visible to other threads now, another - * thread can find and close this connection right after I released - * the glock, if kiblnd_cm_callback for RDMA_CM_EVENT_DISCONNECTED is - * called, it can release the connection refcount taken by cmid. - * It means the connection could be destroyed before I finish my - * operations on it. + /* + * +1 ref for myself, this connection is visible to other threads + * now, refcount of peer:ibp_conns can be released by connection + * close from either a different thread, or the calling of + * kiblnd_check_sends_locked() below. See bz21911 for details. */ kiblnd_conn_addref(conn); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -2202,10 +2197,9 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) kiblnd_queue_tx_locked(tx, conn); } + kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); - kiblnd_check_sends(conn); - /* schedule blocked rxs */ kiblnd_handle_early_rxs(conn); @@ -2240,6 +2234,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) struct kib_rej rej; int version = IBLND_MSG_VERSION; unsigned long flags; + int max_frags; int rc; struct sockaddr_in *peer_addr; @@ -2346,22 +2341,20 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (reqmsg->ibm_u.connparams.ibcp_max_frags > - kiblnd_rdma_frags(version, ni)) { - CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n", - libcfs_nid2str(nid), version, - reqmsg->ibm_u.connparams.ibcp_max_frags, + max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; + if (max_frags > kiblnd_rdma_frags(version, ni)) { + CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n", + libcfs_nid2str(nid), version, max_frags, kiblnd_rdma_frags(version, ni)); if (version >= IBLND_MSG_VERSION) rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; goto failed; - } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < - kiblnd_rdma_frags(version, ni) && !net->ibn_fmr_ps) { - CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n", - libcfs_nid2str(nid), version, - reqmsg->ibm_u.connparams.ibcp_max_frags, + } else if (max_frags < kiblnd_rdma_frags(version, ni) && + !net->ibn_fmr_ps) { + CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n", + libcfs_nid2str(nid), version, max_frags, kiblnd_rdma_frags(version, ni)); if (version == IBLND_MSG_VERSION) @@ -2387,7 +2380,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } /* We have validated the peer's parameters so use those */ - peer->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags; + peer->ibp_max_frags = max_frags; peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; write_lock_irqsave(g_lock, flags); @@ -2419,23 +2412,37 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - /* tie-break connection race in favour of the higher NID */ + /* + * Tie-break connection race in favour of the higher NID. + * If we keep running into a race condition multiple times, + * we have to assume that the connection attempt with the + * higher NID is stuck in a connecting state and will never + * recover. As such, we pass through this if-block and let + * the lower NID connection win so we can move forward. + */ if (peer2->ibp_connecting && - nid < ni->ni_nid) { + nid < ni->ni_nid && peer2->ibp_races < + MAX_CONN_RACES_BEFORE_ABORT) { + peer2->ibp_races++; write_unlock_irqrestore(g_lock, flags); - CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid)); + CDEBUG(D_NET, "Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); kiblnd_peer_decref(peer); rej.ibr_why = IBLND_REJECT_CONN_RACE; goto failed; } - + if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) + CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", + libcfs_nid2str(peer2->ibp_nid), + MAX_CONN_RACES_BEFORE_ABORT); /** * passive connection is allowed even this peer is waiting for * reconnection. */ peer2->ibp_reconnecting = 0; + peer2->ibp_races = 0; peer2->ibp_accepting++; kiblnd_peer_addref(peer2); @@ -2494,7 +2501,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, sizeof(ackmsg->ibm_u.connparams)); ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); @@ -2526,9 +2533,9 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) failed: if (ni) { - lnet_ni_decref(ni); rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni); rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); + lnet_ni_decref(ni); } rej.ibr_version = version; @@ -2556,7 +2563,7 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, if (cp) { msg_size = cp->ibcp_max_msg_size; - frag_num = cp->ibcp_max_frags; + frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT; queue_dep = cp->ibcp_queue_depth; } @@ -2821,11 +2828,11 @@ kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) goto failed; } - if (msg->ibm_u.connparams.ibcp_max_frags > + if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) > conn->ibc_max_frags) { CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_max_frags, + msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT, conn->ibc_max_frags); rc = -EPROTO; goto failed; @@ -2859,7 +2866,7 @@ kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); @@ -2916,7 +2923,7 @@ kiblnd_active_connect(struct rdma_cm_id *cmid) memset(msg, 0, sizeof(*msg)); kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(peer->ibp_ni, msg, version, @@ -3233,7 +3240,11 @@ kiblnd_check_conns(int idx) */ list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) { list_del(&conn->ibc_connd_list); - kiblnd_check_sends(conn); + + spin_lock(&conn->ibc_lock); + kiblnd_check_sends_locked(conn); + spin_unlock(&conn->ibc_lock); + kiblnd_conn_decref(conn); } } @@ -3419,6 +3430,12 @@ kiblnd_qp_event(struct ib_event *event, void *arg) case IB_EVENT_COMM_EST: CDEBUG(D_NET, "%s established\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* + * We received a packet but connection isn't established + * probably handshake packet was lost, so free to + * force make connection established + */ + rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); return; default: diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c index 07ec540946cd..cbc9a9c5385f 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c @@ -1468,11 +1468,6 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error) conn->ksnc_route = NULL; -#if 0 /* irrelevant with only eager routes */ - /* make route least favourite */ - list_del(&route->ksnr_list); - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); -#endif ksocknal_route_decref(route); /* drop conn's ref on route */ } diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h index a56632b4ee37..e6ca0cf52691 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h @@ -86,8 +86,6 @@ struct ksock_sched { /* per scheduler state */ int kss_nconns; /* # connections assigned to * this scheduler */ struct ksock_sched_info *kss_info; /* owner of it */ - struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; - struct kvec kss_scratch_iov[LNET_MAX_IOV]; }; struct ksock_sched_info { @@ -616,9 +614,7 @@ void ksocknal_shutdown(lnet_ni_t *ni); int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); + int delayed, struct iov_iter *to, unsigned int rlen); int ksocknal_accept(lnet_ni_t *ni, struct socket *sock); int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); @@ -635,7 +631,7 @@ int ksocknal_close_peer_conns_locked(struct ksock_peer *peer, int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr); struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer, - struct ksock_tx *tx, int nonblk); + struct ksock_tx *tx, int nonblk); int ksocknal_launch_packet(lnet_ni_t *ni, struct ksock_tx *tx, lnet_process_id_t id); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 303576d815c6..c1c6f604e6ad 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -35,8 +35,8 @@ ksocknal_alloc_tx(int type, int size) spin_lock(&ksocknal_data.ksnd_tx_lock); if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ - next, struct ksock_tx, tx_list); + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, + struct ksock_tx, tx_list); LASSERT(tx->tx_desc_size == size); list_del(&tx->tx_list); } @@ -164,13 +164,13 @@ ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) do { LASSERT(tx->tx_nkiov > 0); - if (nob < (int)kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; + if (nob < (int)kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; return rc; } - nob -= (int)kiov->kiov_len; + nob -= (int)kiov->bv_len; tx->tx_kiov = ++kiov; tx->tx_nkiov--; } while (nob); @@ -326,13 +326,13 @@ ksocknal_recv_kiov(struct ksock_conn *conn) do { LASSERT(conn->ksnc_rx_nkiov > 0); - if (nob < (int)kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; + if (nob < (int)kiov->bv_len) { + kiov->bv_offset += nob; + kiov->bv_len -= nob; return -EAGAIN; } - nob -= kiov->kiov_len; + nob -= kiov->bv_len; conn->ksnc_rx_kiov = ++kiov; conn->ksnc_rx_nkiov--; } while (nob); @@ -1325,39 +1325,36 @@ ksocknal_process_receive(struct ksock_conn *conn) int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, - unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) + struct iov_iter *to, unsigned int rlen) { struct ksock_conn *conn = private; struct ksock_sched *sched = conn->ksnc_scheduler; - LASSERT(mlen <= rlen); - LASSERT(niov <= LNET_MAX_IOV); + LASSERT(iov_iter_count(to) <= rlen); + LASSERT(to->nr_segs <= LNET_MAX_IOV); conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_wanted = iov_iter_count(to); conn->ksnc_rx_nob_left = rlen; - if (!mlen || iov) { + if (to->type & ITER_KVEC) { conn->ksnc_rx_nkiov = 0; conn->ksnc_rx_kiov = NULL; conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; conn->ksnc_rx_niov = lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, - niov, iov, offset, mlen); + to->nr_segs, to->kvec, + to->iov_offset, iov_iter_count(to)); } else { conn->ksnc_rx_niov = 0; conn->ksnc_rx_iov = NULL; conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; conn->ksnc_rx_nkiov = lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, - niov, kiov, offset, mlen); + to->nr_segs, to->bvec, + to->iov_offset, iov_iter_count(to)); } - LASSERT(mlen == - lnet_iov_nob(conn->ksnc_rx_niov, conn->ksnc_rx_iov) + - lnet_kiov_nob(conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - LASSERT(conn->ksnc_rx_scheduled); spin_lock_bh(&sched->kss_lock); @@ -2008,13 +2005,6 @@ ksocknal_connect(struct ksock_route *route) list_splice_init(&peer->ksnp_tx_queue, &zombies); } -#if 0 /* irrelevant with only eager routes */ - if (!route->ksnr_deleted) { - /* make this route least-favourite for re-selection */ - list_del(&route->ksnr_list); - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - } -#endif write_unlock_bh(&ksocknal_data.ksnd_global_lock); ksocknal_peer_failed(peer); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c index 6a17757fce1e..6c95e989ca12 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c @@ -73,9 +73,9 @@ ksocknal_lib_zc_capable(struct ksock_conn *conn) int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) { + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; struct socket *sock = conn->ksnc_sock; - int nob; - int rc; + int nob, i; if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ @@ -83,34 +83,16 @@ ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) !tx->tx_msg.ksm_csum) /* not checksummed */ ksocknal_lib_csum_tx(tx); - /* - * NB we can't trust socket ops to either consume our iovs - * or leave them alone. - */ - { -#if SOCKNAL_SINGLE_FRAG_TX - struct kvec scratch; - struct kvec *scratchiov = &scratch; - unsigned int niov = 1; -#else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; - unsigned int niov = tx->tx_niov; -#endif - struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; - int i; + for (nob = i = 0; i < tx->tx_niov; i++) + nob += tx->tx_iov[i].iov_len; - for (nob = i = 0; i < niov; i++) { - scratchiov[i] = tx->tx_iov[i]; - nob += scratchiov[i].iov_len; - } + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; - if (!list_empty(&conn->ksnc_tx_queue) || - nob < tx->tx_resid) - msg.msg_flags |= MSG_MORE; - - rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob); - } - return rc; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, + tx->tx_iov, tx->tx_niov, nob); + return sock_sendmsg(sock, &msg); } int @@ -124,20 +106,16 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) /* Not NOOP message */ LASSERT(tx->tx_lnetmsg); - /* - * NB we can't trust socket ops to either consume our iovs - * or leave them alone. - */ if (tx->tx_msg.ksm_zc_cookies[0]) { /* Zero copy is enabled */ struct sock *sk = sock->sk; - struct page *page = kiov->kiov_page; - int offset = kiov->kiov_offset; - int fragsize = kiov->kiov_len; + struct page *page = kiov->bv_page; + int offset = kiov->bv_offset; + int fragsize = kiov->bv_len; int msgflg = MSG_DONTWAIT; CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->kiov_len); + page, offset, kiov->bv_len); if (!list_empty(&conn->ksnc_tx_queue) || fragsize < tx->tx_resid) @@ -150,34 +128,19 @@ ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) rc = tcp_sendpage(sk, page, offset, fragsize, msgflg); } } else { -#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct kvec scratch; - struct kvec *scratchiov = &scratch; - unsigned int niov = 1; -#else -#ifdef CONFIG_HIGHMEM -#warning "XXX risk of kmap deadlock on multiple frags..." -#endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; - unsigned int niov = tx->tx_nkiov; -#endif struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; int i; - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + - kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; - } + for (nob = i = 0; i < tx->tx_nkiov; i++) + nob += kiov[i].bv_len; if (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) msg.msg_flags |= MSG_MORE; - rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); - - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); + iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, + kiov, tx->tx_nkiov, nob); + rc = sock_sendmsg(sock, &msg); } return rc; } @@ -201,14 +164,7 @@ ksocknal_lib_eager_ack(struct ksock_conn *conn) int ksocknal_lib_recv_iov(struct ksock_conn *conn) { -#if SOCKNAL_SINGLE_FRAG_RX - struct kvec scratch; - struct kvec *scratchiov = &scratch; - unsigned int niov = 1; -#else - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; -#endif struct kvec *iov = conn->ksnc_rx_iov; struct msghdr msg = { .msg_flags = 0 @@ -220,20 +176,15 @@ ksocknal_lib_recv_iov(struct ksock_conn *conn) int sum; __u32 saved_csum; - /* - * NB we can't trust socket ops to either consume our iovs - * or leave them alone. - */ LASSERT(niov > 0); - for (nob = i = 0; i < niov; i++) { - scratchiov[i] = iov[i]; - nob += scratchiov[i].iov_len; - } + for (nob = i = 0; i < niov; i++) + nob += iov[i].iov_len; + LASSERT(nob <= conn->ksnc_rx_nob_wanted); - rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob, - MSG_DONTWAIT); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, niov, nob); + rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT); saved_csum = 0; if (conn->ksnc_proto == &ksocknal_protocol_v2x) { @@ -259,67 +210,10 @@ ksocknal_lib_recv_iov(struct ksock_conn *conn) return rc; } -static void -ksocknal_lib_kiov_vunmap(void *addr) -{ - if (!addr) - return; - - vunmap(addr); -} - -static void * -ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, - struct kvec *iov, struct page **pages) -{ - void *addr; - int nob; - int i; - - if (!*ksocknal_tunables.ksnd_zc_recv || !pages) - return NULL; - - LASSERT(niov <= LNET_MAX_IOV); - - if (niov < 2 || - niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) - return NULL; - - for (nob = i = 0; i < niov; i++) { - if ((kiov[i].kiov_offset && i > 0) || - (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_SIZE && i < niov - 1)) - return NULL; - - pages[i] = kiov[i].kiov_page; - nob += kiov[i].kiov_len; - } - - addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); - if (!addr) - return NULL; - - iov->iov_base = addr + kiov[0].kiov_offset; - iov->iov_len = nob; - - return addr; -} - int ksocknal_lib_recv_kiov(struct ksock_conn *conn) { -#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct kvec scratch; - struct kvec *scratchiov = &scratch; - struct page **pages = NULL; - unsigned int niov = 1; -#else -#ifdef CONFIG_HIGHMEM -#warning "XXX risk of kmap deadlock on multiple frags..." -#endif - struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; - struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; unsigned int niov = conn->ksnc_rx_nkiov; -#endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { .msg_flags = 0 @@ -328,63 +222,32 @@ ksocknal_lib_recv_kiov(struct ksock_conn *conn) int i; int rc; void *base; - void *addr; int sum; int fragnob; - int n; - - /* - * NB we can't trust socket ops to either consume our iovs - * or leave them alone. - */ - addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages); - if (addr) { - nob = scratchiov[0].iov_len; - n = 1; - } else { - for (nob = i = 0; i < niov; i++) { - nob += scratchiov[i].iov_len = kiov[i].kiov_len; - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + - kiov[i].kiov_offset; - } - n = niov; - } + for (nob = i = 0; i < niov; i++) + nob += kiov[i].bv_len; LASSERT(nob <= conn->ksnc_rx_nob_wanted); - rc = kernel_recvmsg(conn->ksnc_sock, &msg, (struct kvec *)scratchiov, - n, nob, MSG_DONTWAIT); + iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, kiov, niov, nob); + rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT); if (conn->ksnc_msg.ksm_csum) { for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { LASSERT(i < niov); - /* - * Dang! have to kmap again because I have nowhere to - * stash the mapped address. But by doing it while the - * page is still mapped, the kernel just bumps the map - * count and returns me the address it stashed. - */ - base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; - fragnob = kiov[i].kiov_len; + base = kmap(kiov[i].bv_page) + kiov[i].bv_offset; + fragnob = kiov[i].bv_len; if (fragnob > sum) fragnob = sum; conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, base, fragnob); - kunmap(kiov[i].kiov_page); + kunmap(kiov[i].bv_page); } } - - if (addr) { - ksocknal_lib_kiov_vunmap(addr); - } else { - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); - } - return rc; } @@ -406,12 +269,12 @@ ksocknal_lib_csum_tx(struct ksock_tx *tx) if (tx->tx_kiov) { for (i = 0; i < tx->tx_nkiov; i++) { - base = kmap(tx->tx_kiov[i].kiov_page) + - tx->tx_kiov[i].kiov_offset; + base = kmap(tx->tx_kiov[i].bv_page) + + tx->tx_kiov[i].bv_offset; - csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].bv_len); - kunmap(tx->tx_kiov[i].kiov_page); + kunmap(tx->tx_kiov[i].bv_page); } } else { for (i = 1; i < tx->tx_niov; i++) diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c index 42b15a769183..23b36b890964 100644 --- a/drivers/staging/lustre/lnet/libcfs/debug.c +++ b/drivers/staging/lustre/lnet/libcfs/debug.c @@ -328,15 +328,20 @@ libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) */ void libcfs_debug_dumplog_internal(void *arg) { + static time64_t last_dump_time; + time64_t current_time; void *journal_info; journal_info = current->journal_info; current->journal_info = NULL; + current_time = ktime_get_real_seconds(); - if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) { + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) && + current_time > last_dump_time) { + last_dump_time = current_time; snprintf(debug_file_name, sizeof(debug_file_name) - 1, "%s.%lld.%ld", libcfs_debug_file_path_arr, - (s64)ktime_get_real_seconds(), (long_ptr_t)arg); + (s64)current_time, (long_ptr_t)arg); pr_alert("LustreError: dumping log to %s\n", debug_file_name); cfs_tracefile_dump_all_pages(debug_file_name); libcfs_run_debug_log_upcall(debug_file_name); diff --git a/drivers/staging/lustre/lnet/libcfs/fail.c b/drivers/staging/lustre/lnet/libcfs/fail.c index 9288ee08d1f7..e4b1a0a86eae 100644 --- a/drivers/staging/lustre/lnet/libcfs/fail.c +++ b/drivers/staging/lustre/lnet/libcfs/fail.c @@ -90,8 +90,10 @@ int __cfs_fail_check_set(__u32 id, __u32 value, int set) } } - if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) && - (value & CFS_FAIL_ONCE)) + /* Take into account the current call for FAIL_ONCE for ORSET only, + * as RESET is a new fail_loc, it does not change the current call + */ + if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); /* Lost race to set CFS_FAILED_BIT. */ if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { diff --git a/drivers/staging/lustre/lnet/libcfs/libcfs_string.c b/drivers/staging/lustre/lnet/libcfs/libcfs_string.c index fc697cdfcdaf..56a614d7713b 100644 --- a/drivers/staging/lustre/lnet/libcfs/libcfs_string.c +++ b/drivers/staging/lustre/lnet/libcfs/libcfs_string.c @@ -229,8 +229,6 @@ cfs_str2num_check(char *str, int nob, unsigned *num, char *endp, cache; int rc; - str = cfs_trimwhite(str); - /** * kstrouint can only handle strings composed * of only numbers. We need to scan the string diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-cpu.c index b52518c54efe..e8b1a61420de 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-cpu.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-cpu.c @@ -74,6 +74,17 @@ struct cfs_cpt_data { static struct cfs_cpt_data cpt_data; +static void +cfs_node_to_cpumask(int node, cpumask_t *mask) +{ + const cpumask_t *tmp = cpumask_of_node(node); + + if (tmp) + cpumask_copy(mask, tmp); + else + cpumask_clear(mask); +} + void cfs_cpt_table_free(struct cfs_cpt_table *cptab) { @@ -403,7 +414,7 @@ cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) mutex_lock(&cpt_data.cpt_mutex); mask = cpt_data.cpt_cpumask; - cpumask_copy(mask, cpumask_of_node(node)); + cfs_node_to_cpumask(node, mask); rc = cfs_cpt_set_cpumask(cptab, cpt, mask); @@ -427,7 +438,7 @@ cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) mutex_lock(&cpt_data.cpt_mutex); mask = cpt_data.cpt_cpumask; - cpumask_copy(mask, cpumask_of_node(node)); + cfs_node_to_cpumask(node, mask); cfs_cpt_unset_cpumask(cptab, cpt, mask); @@ -749,7 +760,7 @@ cfs_cpt_table_create(int ncpt) } for_each_online_node(i) { - cpumask_copy(mask, cpumask_of_node(i)); + cfs_node_to_cpumask(i, mask); while (!cpumask_empty(mask)) { struct cfs_cpu_partition *part; diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-crypto.c index 5c0116ade909..7f56d2c9dd00 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-crypto.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-crypto.c @@ -95,8 +95,8 @@ static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, err = crypto_ahash_setkey(tfm, key, key_len); else if ((*type)->cht_key != 0) err = crypto_ahash_setkey(tfm, - (unsigned char *)&((*type)->cht_key), - (*type)->cht_size); + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); if (err != 0) { ahash_request_free(*req); diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c index 346db892f275..4daf828198c3 100644 --- a/drivers/staging/lustre/lnet/lnet/api-ni.c +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -1286,6 +1286,25 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_ioctl_config_data *conf) sizeof(*ni->ni_lnd_tunables)); } + /* + * If given some LND tunable parameters, parse those now to + * override the values in the NI structure. + */ + if (conf) { + if (conf->cfg_config_u.cfg_net.net_peer_rtr_credits >= 0) + ni->ni_peerrtrcredits = + conf->cfg_config_u.cfg_net.net_peer_rtr_credits; + if (conf->cfg_config_u.cfg_net.net_peer_timeout >= 0) + ni->ni_peertimeout = + conf->cfg_config_u.cfg_net.net_peer_timeout; + if (conf->cfg_config_u.cfg_net.net_peer_tx_credits != -1) + ni->ni_peertxcredits = + conf->cfg_config_u.cfg_net.net_peer_tx_credits; + if (conf->cfg_config_u.cfg_net.net_max_tx_credits >= 0) + ni->ni_maxtxcredits = + conf->cfg_config_u.cfg_net.net_max_tx_credits; + } + rc = lnd->lnd_startup(ni); mutex_unlock(&the_lnet.ln_lnd_mutex); @@ -1299,33 +1318,6 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_ioctl_config_data *conf) goto failed0; } - /* - * If given some LND tunable parameters, parse those now to - * override the values in the NI structure. - */ - if (conf && conf->cfg_config_u.cfg_net.net_peer_rtr_credits >= 0) { - ni->ni_peerrtrcredits = - conf->cfg_config_u.cfg_net.net_peer_rtr_credits; - } - if (conf && conf->cfg_config_u.cfg_net.net_peer_timeout >= 0) { - ni->ni_peertimeout = - conf->cfg_config_u.cfg_net.net_peer_timeout; - } - /* - * TODO - * Note: For now, don't allow the user to change - * peertxcredits as this number is used in the - * IB LND to control queue depth. - * - * if (conf && conf->cfg_config_u.cfg_net.net_peer_tx_credits != -1) - * ni->ni_peertxcredits = - * conf->cfg_config_u.cfg_net.net_peer_tx_credits; - */ - if (conf && conf->cfg_config_u.cfg_net.net_max_tx_credits >= 0) { - ni->ni_maxtxcredits = - conf->cfg_config_u.cfg_net.net_max_tx_credits; - } - LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query); lnet_net_lock(LNET_LOCK_EX); diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c index a72afdf68bb2..9e2183ff847e 100644 --- a/drivers/staging/lustre/lnet/lnet/config.c +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -31,6 +31,8 @@ */ #define DEBUG_SUBSYSTEM S_LNET +#include <linux/nsproxy.h> +#include <net/net_namespace.h> #include "../../include/linux/lnet/lib-lnet.h" struct lnet_text_buf { /* tmp struct for parsing routes */ @@ -110,6 +112,11 @@ lnet_ni_free(struct lnet_ni *ni) LIBCFS_FREE(ni->ni_interfaces[i], strlen(ni->ni_interfaces[i]) + 1); } + + /* release reference to net namespace */ + if (ni->ni_net_ns) + put_net(ni->ni_net_ns); + LIBCFS_FREE(ni, sizeof(*ni)); } @@ -171,6 +178,13 @@ lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) /* LND will fill in the address part of the NID */ ni->ni_nid = LNET_MKNID(net, 0); + + /* Store net namespace in which current ni is being created */ + if (current->nsproxy->net_ns) + ni->ni_net_ns = get_net(current->nsproxy->net_ns); + else + ni->ni_net_ns = NULL; + ni->ni_last_alive = ktime_get_real_seconds(); list_add_tail(&ni->ni_list, nilist); return ni; diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c index 1834bf7a27ef..eab53cd57296 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-md.c +++ b/drivers/staging/lustre/lnet/lnet/lib-md.c @@ -134,11 +134,11 @@ lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) for (i = 0; i < (int)niov; i++) { /* We take the page pointer on trust */ - if (lmd->md_iov.kiov[i].kiov_offset + - lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE) + if (lmd->md_iov.kiov[i].bv_offset + + lmd->md_iov.kiov[i].bv_len > PAGE_SIZE) return -EINVAL; /* invalid length */ - total_length += lmd->md_iov.kiov[i].kiov_len; + total_length += lmd->md_iov.kiov[i].bv_len; } lmd->md_length = total_length; @@ -292,11 +292,12 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, return -ENOMEM; rc = lnet_md_build(md, &umd, unlink); + if (rc) + goto out_free; + cpt = lnet_cpt_of_cookie(meh.cookie); lnet_res_lock(cpt); - if (rc) - goto failed; me = lnet_handle2me(&meh); if (!me) @@ -307,7 +308,7 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, rc = lnet_md_link(md, umd.eq_handle, cpt); if (rc) - goto failed; + goto out_unlock; /* * attach this MD to portal of ME and check if it matches any @@ -324,10 +325,10 @@ LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, return 0; - failed: - lnet_md_free(md); - +out_unlock: lnet_res_unlock(cpt); +out_free: + lnet_md_free(md); return rc; } EXPORT_SYMBOL(LNetMDAttach); @@ -370,24 +371,25 @@ LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) return -ENOMEM; rc = lnet_md_build(md, &umd, unlink); + if (rc) + goto out_free; cpt = lnet_res_lock_current(); - if (rc) - goto failed; rc = lnet_md_link(md, umd.eq_handle, cpt); if (rc) - goto failed; + goto out_unlock; lnet_md2handle(handle, md); lnet_res_unlock(cpt); return 0; - failed: +out_unlock: + lnet_res_unlock(cpt); +out_free: lnet_md_free(md); - lnet_res_unlock(cpt); return rc; } EXPORT_SYMBOL(LNetMDBind); diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c index e6d3b801d87d..48e6f8f2392f 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-move.c +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -37,6 +37,8 @@ #define DEBUG_SUBSYSTEM S_LNET #include "../../include/linux/lnet/lib-lnet.h" +#include <linux/nsproxy.h> +#include <net/net_namespace.h> static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); @@ -166,25 +168,17 @@ lnet_iov_nob(unsigned int niov, struct kvec *iov) EXPORT_SYMBOL(lnet_iov_nob); void -lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, - unsigned int nsiov, struct kvec *siov, unsigned int soffset, - unsigned int nob) +lnet_copy_iov2iter(struct iov_iter *to, + unsigned int nsiov, const struct kvec *siov, + unsigned int soffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; + const char *s; + size_t left; if (!nob) return; - /* skip complete frags before 'doffset' */ - LASSERT(ndiov > 0); - while (doffset >= diov->iov_len) { - doffset -= diov->iov_len; - diov++; - ndiov--; - LASSERT(ndiov > 0); - } - /* skip complete frags before 'soffset' */ LASSERT(nsiov > 0); while (soffset >= siov->iov_len) { @@ -194,39 +188,68 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, LASSERT(nsiov > 0); } + s = (char *)siov->iov_base + soffset; + left = siov->iov_len - soffset; do { - LASSERT(ndiov > 0); + size_t n, copy = left; LASSERT(nsiov > 0); - this_nob = min(diov->iov_len - doffset, - siov->iov_len - soffset); - this_nob = min(this_nob, nob); - memcpy((char *)diov->iov_base + doffset, - (char *)siov->iov_base + soffset, this_nob); - nob -= this_nob; + if (copy > nob) + copy = nob; + n = copy_to_iter(s, copy, to); + if (n != copy) + return; + nob -= n; - if (diov->iov_len > doffset + this_nob) { - doffset += this_nob; - } else { - diov++; - ndiov--; - doffset = 0; - } + siov++; + s = (char *)siov->iov_base; + left = siov->iov_len; + nsiov--; + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iter); - if (siov->iov_len > soffset + this_nob) { - soffset += this_nob; - } else { - siov++; - nsiov--; - soffset = 0; - } +void +lnet_copy_kiov2iter(struct iov_iter *to, + unsigned int nsiov, const lnet_kiov_t *siov, + unsigned int soffset, unsigned int nob) +{ + if (!nob) + return; + + LASSERT(!in_interrupt()); + + LASSERT(nsiov > 0); + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + size_t copy = siov->bv_len - soffset, n; + + LASSERT(nsiov > 0); + + if (copy > nob) + copy = nob; + n = copy_page_to_iter(siov->bv_page, + siov->bv_offset + soffset, + copy, to); + if (n != copy) + return; + nob -= n; + siov++; + nsiov--; + soffset = 0; } while (nob > 0); } -EXPORT_SYMBOL(lnet_copy_iov2iov); +EXPORT_SYMBOL(lnet_copy_kiov2iter); int lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, + int src_niov, const struct kvec *src, unsigned int offset, unsigned int len) { /* @@ -280,238 +303,15 @@ lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) LASSERT(!niov || kiov); while (niov-- > 0) - nob += (kiov++)->kiov_len; + nob += (kiov++)->bv_len; return nob; } EXPORT_SYMBOL(lnet_kiov_nob); -void -lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, - unsigned int nob) -{ - /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; - char *daddr = NULL; - char *saddr = NULL; - - if (!nob) - return; - - LASSERT(!in_interrupt()); - - LASSERT(ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; - diov++; - ndiov--; - LASSERT(ndiov > 0); - } - - LASSERT(nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; - siov++; - nsiov--; - LASSERT(nsiov > 0); - } - - do { - LASSERT(ndiov > 0); - LASSERT(nsiov > 0); - this_nob = min(diov->kiov_len - doffset, - siov->kiov_len - soffset); - this_nob = min(this_nob, nob); - - if (!daddr) - daddr = ((char *)kmap(diov->kiov_page)) + - diov->kiov_offset + doffset; - if (!saddr) - saddr = ((char *)kmap(siov->kiov_page)) + - siov->kiov_offset + soffset; - - /* - * Vanishing risk of kmap deadlock when mapping 2 pages. - * However in practice at least one of the kiovs will be mapped - * kernel pages and the map/unmap will be NOOPs - */ - memcpy(daddr, saddr, this_nob); - nob -= this_nob; - - if (diov->kiov_len > doffset + this_nob) { - daddr += this_nob; - doffset += this_nob; - } else { - kunmap(diov->kiov_page); - daddr = NULL; - diov++; - ndiov--; - doffset = 0; - } - - if (siov->kiov_len > soffset + this_nob) { - saddr += this_nob; - soffset += this_nob; - } else { - kunmap(siov->kiov_page); - saddr = NULL; - siov++; - nsiov--; - soffset = 0; - } - } while (nob > 0); - - if (daddr) - kunmap(diov->kiov_page); - if (saddr) - kunmap(siov->kiov_page); -} -EXPORT_SYMBOL(lnet_copy_kiov2kiov); - -void -lnet_copy_kiov2iov(unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, - unsigned int kiovoffset, unsigned int nob) -{ - /* NB iov, kiov are READ-ONLY */ - unsigned int this_nob; - char *addr = NULL; - - if (!nob) - return; - - LASSERT(!in_interrupt()); - - LASSERT(niov > 0); - while (iovoffset >= iov->iov_len) { - iovoffset -= iov->iov_len; - iov++; - niov--; - LASSERT(niov > 0); - } - - LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; - kiov++; - nkiov--; - LASSERT(nkiov > 0); - } - - do { - LASSERT(niov > 0); - LASSERT(nkiov > 0); - this_nob = min(iov->iov_len - iovoffset, - (__kernel_size_t)kiov->kiov_len - kiovoffset); - this_nob = min(this_nob, nob); - - if (!addr) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; - - memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); - nob -= this_nob; - - if (iov->iov_len > iovoffset + this_nob) { - iovoffset += this_nob; - } else { - iov++; - niov--; - iovoffset = 0; - } - - if (kiov->kiov_len > kiovoffset + this_nob) { - addr += this_nob; - kiovoffset += this_nob; - } else { - kunmap(kiov->kiov_page); - addr = NULL; - kiov++; - nkiov--; - kiovoffset = 0; - } - - } while (nob > 0); - - if (addr) - kunmap(kiov->kiov_page); -} -EXPORT_SYMBOL(lnet_copy_kiov2iov); - -void -lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, - unsigned int kiovoffset, unsigned int niov, - struct kvec *iov, unsigned int iovoffset, - unsigned int nob) -{ - /* NB kiov, iov are READ-ONLY */ - unsigned int this_nob; - char *addr = NULL; - - if (!nob) - return; - - LASSERT(!in_interrupt()); - - LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; - kiov++; - nkiov--; - LASSERT(nkiov > 0); - } - - LASSERT(niov > 0); - while (iovoffset >= iov->iov_len) { - iovoffset -= iov->iov_len; - iov++; - niov--; - LASSERT(niov > 0); - } - - do { - LASSERT(nkiov > 0); - LASSERT(niov > 0); - this_nob = min((__kernel_size_t)kiov->kiov_len - kiovoffset, - iov->iov_len - iovoffset); - this_nob = min(this_nob, nob); - - if (!addr) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; - - memcpy(addr, (char *)iov->iov_base + iovoffset, this_nob); - nob -= this_nob; - - if (kiov->kiov_len > kiovoffset + this_nob) { - addr += this_nob; - kiovoffset += this_nob; - } else { - kunmap(kiov->kiov_page); - addr = NULL; - kiov++; - nkiov--; - kiovoffset = 0; - } - - if (iov->iov_len > iovoffset + this_nob) { - iovoffset += this_nob; - } else { - iov++; - niov--; - iovoffset = 0; - } - } while (nob > 0); - - if (addr) - kunmap(kiov->kiov_page); -} -EXPORT_SYMBOL(lnet_copy_iov2kiov); - int lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, + int src_niov, const lnet_kiov_t *src, unsigned int offset, unsigned int len) { /* @@ -526,8 +326,8 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, return 0; /* no frags */ LASSERT(src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; src_niov--; src++; LASSERT(src_niov > 0); @@ -538,19 +338,19 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, LASSERT(src_niov > 0); LASSERT((int)niov <= dst_niov); - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; if (len <= frag_len) { - dst->kiov_len = len; - LASSERT(dst->kiov_offset + dst->kiov_len + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); return niov; } - dst->kiov_len = frag_len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); len -= frag_len; dst++; @@ -569,6 +369,7 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, unsigned int niov = 0; struct kvec *iov = NULL; lnet_kiov_t *kiov = NULL; + struct iov_iter to; int rc; LASSERT(!in_interrupt()); @@ -594,8 +395,14 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, } } - rc = ni->ni_lnd->lnd_recv(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, rlen); + if (iov) { + iov_iter_kvec(&to, ITER_KVEC | READ, iov, niov, mlen + offset); + iov_iter_advance(&to, offset); + } else { + iov_iter_bvec(&to, ITER_BVEC | READ, kiov, niov, mlen + offset); + iov_iter_advance(&to, offset); + } + rc = ni->ni_lnd->lnd_recv(ni, private, msg, delayed, &to, rlen); if (rc < 0) lnet_finalize(ni, msg, rc); } @@ -2002,6 +1809,9 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), lnet_msgtyp2str(type), rc); lnet_msg_free(msg); + if (rc == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; goto drop; } @@ -2512,6 +2322,15 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) } if (LNET_NIDNET(ni->ni_nid) == dstnet) { + /* + * Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. + */ + if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; + if (srcnidp) *srcnidp = ni->ni_nid; if (orderp) diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c index 910e106e221d..0897e588bd54 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-msg.c +++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c @@ -449,23 +449,7 @@ lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int status) if (!msg) return; -#if 0 - CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", - lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), - msg->msg_target_is_router ? "t" : "", - msg->msg_routing ? "X" : "", - msg->msg_ack ? "A" : "", - msg->msg_sending ? "S" : "", - msg->msg_receiving ? "R" : "", - msg->msg_delayed ? "d" : "", - msg->msg_txcredit ? "C" : "", - msg->msg_peertxcredit ? "c" : "", - msg->msg_rtrcredit ? "F" : "", - msg->msg_peerrtrcredit ? "f" : "", - msg->msg_onactivelist ? "!" : "", - !msg->msg_txpeer ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid), - !msg->msg_rxpeer ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); -#endif + msg->msg_ev.status = status; if (msg->msg_md) { diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index 891fd59401d7..4e6dd5149b4f 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -265,21 +265,17 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); unsigned long then; struct timeval tv; + struct kvec iov = { .iov_base = buffer, .iov_len = nob }; + struct msghdr msg = {NULL,}; LASSERT(nob > 0); /* * Caller may pass a zero timeout if she thinks the socket buffer is * empty enough to take the whole message immediately */ + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, nob); for (;;) { - struct kvec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_flags = !timeout ? MSG_DONTWAIT : 0 - }; - + msg.msg_flags = !timeout ? MSG_DONTWAIT : 0; if (timeout) { /* Set send timeout to remaining time */ jiffies_to_timeval(jiffies_left, &tv); @@ -296,9 +292,6 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); jiffies_left -= jiffies - then; - if (rc == nob) - return 0; - if (rc < 0) return rc; @@ -307,11 +300,11 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) return -ECONNABORTED; } + if (!msg_data_left(&msg)) + break; + if (jiffies_left <= 0) return -EAGAIN; - - buffer = ((char *)buffer) + rc; - nob -= rc; } return 0; } diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c index 08402712a452..cb213b8f51cf 100644 --- a/drivers/staging/lustre/lnet/lnet/lo.c +++ b/drivers/staging/lustre/lnet/lnet/lo.c @@ -42,36 +42,23 @@ lolnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) static int lolnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct kvec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) + int delayed, struct iov_iter *to, unsigned int rlen) { lnet_msg_t *sendmsg = private; if (lntmsg) { /* not discarding */ - if (sendmsg->msg_iov) { - if (iov) - lnet_copy_iov2iov(niov, iov, offset, - sendmsg->msg_niov, - sendmsg->msg_iov, - sendmsg->msg_offset, mlen); - else - lnet_copy_iov2kiov(niov, kiov, offset, - sendmsg->msg_niov, - sendmsg->msg_iov, - sendmsg->msg_offset, mlen); - } else { - if (iov) - lnet_copy_kiov2iov(niov, iov, offset, - sendmsg->msg_niov, - sendmsg->msg_kiov, - sendmsg->msg_offset, mlen); - else - lnet_copy_kiov2kiov(niov, kiov, offset, - sendmsg->msg_niov, - sendmsg->msg_kiov, - sendmsg->msg_offset, mlen); - } + if (sendmsg->msg_iov) + lnet_copy_iov2iter(to, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, + iov_iter_count(to)); + else + lnet_copy_kiov2iter(to, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, + iov_iter_count(to)); lnet_finalize(ni, lntmsg, 0); } diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c index 063543233035..063ad55ec950 100644 --- a/drivers/staging/lustre/lnet/lnet/router.c +++ b/drivers/staging/lustre/lnet/lnet/router.c @@ -1307,7 +1307,7 @@ lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); while (--npages >= 0) - __free_page(rb->rb_kiov[npages].kiov_page); + __free_page(rb->rb_kiov[npages].bv_page); LIBCFS_FREE(rb, sz); } @@ -1333,15 +1333,15 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) GFP_KERNEL | __GFP_ZERO, 0); if (!page) { while (--i >= 0) - __free_page(rb->rb_kiov[i].kiov_page); + __free_page(rb->rb_kiov[i].bv_page); LIBCFS_FREE(rb, sz); return NULL; } - rb->rb_kiov[i].kiov_len = PAGE_SIZE; - rb->rb_kiov[i].kiov_offset = 0; - rb->rb_kiov[i].kiov_page = page; + rb->rb_kiov[i].bv_len = PAGE_SIZE; + rb->rb_kiov[i].bv_offset = 0; + rb->rb_kiov[i].bv_page = page; } return rb; @@ -1693,7 +1693,7 @@ lnet_rtrpools_adjust(int tiny, int small, int large) int lnet_rtrpools_enable(void) { - int rc; + int rc = 0; if (the_lnet.ln_routing) return 0; @@ -1706,9 +1706,9 @@ lnet_rtrpools_enable(void) * if we are just configuring this for the first * time. */ - return lnet_rtrpools_alloc(1); - - rc = lnet_rtrpools_adjust_helper(0, 0, 0); + rc = lnet_rtrpools_alloc(1); + else + rc = lnet_rtrpools_adjust_helper(0, 0, 0); if (rc) return rc; @@ -1718,7 +1718,7 @@ lnet_rtrpools_enable(void) the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; lnet_net_unlock(LNET_LOCK_EX); - return 0; + return rc; } void diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c index 13d0454e7fcb..b20c5d394e3b 100644 --- a/drivers/staging/lustre/lnet/selftest/brw_test.c +++ b/drivers/staging/lustre/lnet/selftest/brw_test.c @@ -226,7 +226,7 @@ brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) struct page *pg; for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; + pg = bk->bk_iovs[i].bv_page; brw_fill_page(pg, pattern, magic); } } @@ -238,7 +238,7 @@ brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) struct page *pg; for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; + pg = bk->bk_iovs[i].bv_page; if (brw_check_page(pg, pattern, magic)) { CERROR("Bulk page %p (%d/%d) is corrupted!\n", pg, i, bk->bk_niov); diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c index 1be3cad727ae..55afb53b0743 100644 --- a/drivers/staging/lustre/lnet/selftest/conrpc.c +++ b/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -152,10 +152,10 @@ lstcon_rpc_put(struct lstcon_rpc *crpc) LASSERT(list_empty(&crpc->crp_link)); for (i = 0; i < bulk->bk_niov; i++) { - if (!bulk->bk_iovs[i].kiov_page) + if (!bulk->bk_iovs[i].bv_page) continue; - __free_page(bulk->bk_iovs[i].kiov_page); + __free_page(bulk->bk_iovs[i].bv_page); } srpc_client_rpc_decref(crpc->crp_rpc); @@ -705,7 +705,7 @@ lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) LASSERT(i < nkiov); - pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page); + pid = (lnet_process_id_packed_t *)page_address(kiov[i].bv_page); return &pid[idx % SFW_ID_PER_PAGE]; } @@ -849,12 +849,11 @@ lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned feats, min_t(int, nob, PAGE_SIZE); nob -= len; - bulk->bk_iovs[i].kiov_offset = 0; - bulk->bk_iovs[i].kiov_len = len; - bulk->bk_iovs[i].kiov_page = - alloc_page(GFP_KERNEL); + bulk->bk_iovs[i].bv_offset = 0; + bulk->bk_iovs[i].bv_len = len; + bulk->bk_iovs[i].bv_page = alloc_page(GFP_KERNEL); - if (!bulk->bk_iovs[i].kiov_page) { + if (!bulk->bk_iovs[i].bv_page) { lstcon_rpc_put(*crpc); return -ENOMEM; } diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c index 4c33621f06da..a0fcbf3bcc95 100644 --- a/drivers/staging/lustre/lnet/selftest/console.c +++ b/drivers/staging/lustre/lnet/selftest/console.c @@ -1993,8 +1993,6 @@ static void lstcon_init_acceptor_service(void) lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; } -extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); - static DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); /* initialize console */ diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h index 78b147732615..78388a611c22 100644 --- a/drivers/staging/lustre/lnet/selftest/console.h +++ b/drivers/staging/lustre/lnet/selftest/console.h @@ -184,6 +184,7 @@ lstcon_id2hash(lnet_process_id_t id, struct list_head *hash) return &hash[idx]; } +int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr); int lstcon_console_init(void); int lstcon_console_fini(void); int lstcon_session_match(lst_sid_t sid); diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c index c2f121f44d33..abbd6287b4bd 100644 --- a/drivers/staging/lustre/lnet/selftest/framework.c +++ b/drivers/staging/lustre/lnet/selftest/framework.c @@ -784,8 +784,8 @@ sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) lnet_process_id_packed_t id; int j; - dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); - LASSERT(dests); /* my pages are within KVM always */ + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].bv_page); + LASSERT(dests); /* my pages are within KVM always */ id = dests[i % SFW_ID_PER_PAGE]; if (msg->msg_magic != SRPC_MSG_MAGIC) sfw_unpack_id(id); diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c index 3b26d6eb4240..f5619d8744ef 100644 --- a/drivers/staging/lustre/lnet/selftest/rpc.c +++ b/drivers/staging/lustre/lnet/selftest/rpc.c @@ -91,9 +91,9 @@ srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int nob) LASSERT(nob > 0); LASSERT(i >= 0 && i < bk->bk_niov); - bk->bk_iovs[i].kiov_offset = 0; - bk->bk_iovs[i].kiov_page = pg; - bk->bk_iovs[i].kiov_len = nob; + bk->bk_iovs[i].bv_offset = 0; + bk->bk_iovs[i].bv_page = pg; + bk->bk_iovs[i].bv_len = nob; return nob; } @@ -106,7 +106,7 @@ srpc_free_bulk(struct srpc_bulk *bk) LASSERT(bk); for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].kiov_page; + pg = bk->bk_iovs[i].bv_page; if (!pg) break; diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c index 99ae7eb6720e..4e49cb356d64 100644 --- a/drivers/staging/lustre/lustre/fid/fid_lib.c +++ b/drivers/staging/lustre/lustre/fid/fid_lib.c @@ -63,14 +63,12 @@ const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { FID_SEQ_NORMAL, (__u64)~0ULL }; -EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE); /* Zero range, used for init and other purposes. */ const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { 0, 0 }; -EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE); /* Lustre Big Fs Lock fid. */ const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c index 454744d25956..edd72b926f81 100644 --- a/drivers/staging/lustre/lustre/fid/fid_request.c +++ b/drivers/staging/lustre/lustre/fid/fid_request.c @@ -125,19 +125,19 @@ static int seq_client_rpc(struct lu_client_seq *seq, if (!range_is_sane(output)) { CERROR("%s: Invalid range received from server: " - DRANGE"\n", seq->lcs_name, PRANGE(output)); + DRANGE "\n", seq->lcs_name, PRANGE(output)); rc = -EINVAL; goto out_req; } if (range_is_exhausted(output)) { CERROR("%s: Range received from server is exhausted: " - DRANGE"]\n", seq->lcs_name, PRANGE(output)); + DRANGE "]\n", seq->lcs_name, PRANGE(output)); rc = -EINVAL; goto out_req; } - CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence " DRANGE "]\n", seq->lcs_name, opcname, PRANGE(output)); out_req: @@ -179,7 +179,7 @@ static int seq_client_alloc_seq(const struct lu_env *env, seq->lcs_name, rc); return rc; } - CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + CDEBUG(D_INFO, "%s: New range - " DRANGE "\n", seq->lcs_name, PRANGE(&seq->lcs_space)); } else { rc = 0; diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c index 81b7ca9ea2fd..3ed32d77f38b 100644 --- a/drivers/staging/lustre/lustre/fid/lproc_fid.c +++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c @@ -105,7 +105,7 @@ ldebugfs_fid_space_seq_write(struct file *file, rc = ldebugfs_fid_write_common(buffer, count, &seq->lcs_space); if (rc == 0) { - CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", seq->lcs_name, PRANGE(&seq->lcs_space)); } diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h index f0efe5b9fbec..08eaec735d6f 100644 --- a/drivers/staging/lustre/lustre/fld/fld_internal.h +++ b/drivers/staging/lustre/lustre/fld/fld_internal.h @@ -31,6 +31,25 @@ * * lustre/fld/fld_internal.h * + * Subsystem Description: + * FLD is FID Location Database, which stores where (IE, on which MDT) + * FIDs are located. + * The database is basically a record file, each record consists of a FID + * sequence range, MDT/OST index, and flags. The FLD for the whole FS + * is only stored on the sequence controller(MDT0) right now, but each target + * also has its local FLD, which only stores the local sequence. + * + * The FLD subsystem usually has two tasks: + * 1. maintain the database, i.e. when the sequence controller allocates + * new sequence ranges to some nodes, it will call the FLD API to insert the + * location information <sequence_range, node_index> in FLDB. + * + * 2. Handle requests from other nodes, i.e. if client needs to know where + * the FID is located, if it can not find the information in the local cache, + * it will send a FLD lookup RPC to the FLD service, and the FLD service will + * look up the FLDB entry and return the location information to client. + * + * * Author: Yury Umanets <umka@clusterfs.com> * Author: Tom WangDi <wangdi@clusterfs.com> */ diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c index e59d626a1548..0de72b717ce5 100644 --- a/drivers/staging/lustre/lustre/fld/fld_request.c +++ b/drivers/staging/lustre/lustre/fld/fld_request.c @@ -53,57 +53,6 @@ #include "../include/lustre_mdc.h" #include "fld_internal.h" -/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c - * It should be common thing. The same about mdc RPC lock - */ -static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) -{ - int rc; - - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&mcw->mcw_entry); - spin_unlock(&cli->cl_loi_list_lock); - return rc; -}; - -static void fld_enter_request(struct client_obd *cli) -{ - struct mdc_cache_waiter mcw; - struct l_wait_info lwi = { 0 }; - - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { - list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); - init_waitqueue_head(&mcw.mcw_waitq); - spin_unlock(&cli->cl_loi_list_lock); - l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi); - } else { - cli->cl_r_in_flight++; - spin_unlock(&cli->cl_loi_list_lock); - } -} - -static void fld_exit_request(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct mdc_cache_waiter *mcw; - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_r_in_flight--; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { - /* No free request slots anymore */ - break; - } - - mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); - list_del_init(&mcw->mcw_entry); - cli->cl_r_in_flight++; - wake_up(&mcw->mcw_waitq); - } - spin_unlock(&cli->cl_loi_list_lock); -} - static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) { LASSERT(fld->lcf_count > 0); @@ -270,7 +219,6 @@ int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) spin_unlock(&fld->lcf_lock); return -ENOENT; } -EXPORT_SYMBOL(fld_client_del_target); static struct dentry *fld_debugfs_dir; @@ -439,9 +387,9 @@ int fld_client_rpc(struct obd_export *exp, req->rq_reply_portal = MDC_REPLY_PORTAL; ptlrpc_at_set_req_timeout(req); - fld_enter_request(&exp->exp_obd->u.cli); + obd_get_request_slot(&exp->exp_obd->u.cli); rc = ptlrpc_queue_wait(req); - fld_exit_request(&exp->exp_obd->u.cli); + obd_put_request_slot(&exp->exp_obd->u.cli); if (rc) goto out_req; @@ -505,7 +453,6 @@ void fld_client_flush(struct lu_client_fld *fld) { fld_cache_flush(fld->lcf_cache); } -EXPORT_SYMBOL(fld_client_flush); static int __init fld_init(void) { diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h index 3cd4a2577d90..89292c93dcd5 100644 --- a/drivers/staging/lustre/lustre/include/cl_object.h +++ b/drivers/staging/lustre/lustre/include/cl_object.h @@ -93,8 +93,8 @@ * super-class definitions. */ #include "lu_object.h" +#include "lustre_compat.h" #include <linux/atomic.h> -#include "linux/lustre_compat25.h" #include <linux/mutex.h> #include <linux/radix-tree.h> #include <linux/spinlock.h> @@ -191,6 +191,9 @@ struct cl_attr { * Group identifier for quota purposes. */ gid_t cat_gid; + + /* nlink of the directory */ + __u64 cat_nlink; }; /** @@ -320,7 +323,7 @@ struct cl_object_operations { * to be used instead of newly created. */ int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); + struct cl_page *page, pgoff_t index); /** * Initialize lock slice for this layer. Called top-to-bottom through * every object layer when a new cl_lock is instantiated. Layer @@ -366,8 +369,8 @@ struct cl_object_operations { * \return the same convention as for * cl_object_operations::coo_attr_get() is used. */ - int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid); + int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid); /** * Update object configuration. Called top-to-bottom to modify object * configuration. @@ -392,6 +395,11 @@ struct cl_object_operations { * mainly pages and locks. */ int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); + /** + * Object getstripe method. + */ + int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum); }; /** @@ -687,17 +695,6 @@ enum cl_page_type { }; /** - * Flags maintained for every cl_page. - */ -enum cl_page_flags { - /** - * Set when pagein completes. Used for debugging (read completes at - * most once for a page). - */ - CPF_READ_COMPLETED = 1 << 0 -}; - -/** * Fields are protected by the lock on struct page, except for atomics and * immutables. * @@ -711,24 +708,19 @@ struct cl_page { atomic_t cp_ref; /** An object this page is a part of. Immutable after creation. */ struct cl_object *cp_obj; - /** List of slices. Immutable after creation. */ - struct list_head cp_layers; /** vmpage */ struct page *cp_vmpage; + /** Linkage of pages within group. Pages must be owned */ + struct list_head cp_batch; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** Linkage of pages within cl_req. */ + struct list_head cp_flight; /** * Page state. This field is const to avoid accidental update, it is * modified only internally within cl_page.c. Protected by a VM lock. */ const enum cl_page_state cp_state; - /** Linkage of pages within group. Protected by cl_page::cp_mutex. */ - struct list_head cp_batch; - /** Mutex serializing membership of a page in a batch. */ - struct mutex cp_mutex; - /** Linkage of pages within cl_req. */ - struct list_head cp_flight; - /** Transfer error. */ - int cp_error; - /** * Page type. Only CPT_TRANSIENT is used so far. Immutable after * creation. @@ -741,10 +733,6 @@ struct cl_page { */ struct cl_io *cp_owner; /** - * Debug information, the task is owning the page. - */ - struct task_struct *cp_task; - /** * Owning IO request in cl_page_state::CPS_PAGEOUT and * cl_page_state::CPS_PAGEIN states. This field is maintained only in * the top-level pages. Protected by a VM lock. @@ -756,8 +744,6 @@ struct cl_page { struct lu_ref_link cp_obj_ref; /** Link to a queue, for debugging. */ struct lu_ref_link cp_queue_ref; - /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */ - unsigned cp_flags; /** Assigned if doing a sync_io */ struct cl_sync_io *cp_sync_io; }; @@ -1056,23 +1042,32 @@ do { \ } \ } while (0) -static inline int __page_in_use(const struct cl_page *page, int refc) -{ - if (page->cp_type == CPT_CACHEABLE) - ++refc; - LASSERT(atomic_read(&page->cp_ref) > 0); - return (atomic_read(&page->cp_ref) > refc); -} - -#define cl_page_in_use(pg) __page_in_use(pg, 1) -#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) - static inline struct page *cl_page_vmpage(struct cl_page *page) { LASSERT(page->cp_vmpage); return page->cp_vmpage; } +/** + * Check if a cl_page is in use. + * + * Client cache holds a refcount, this refcount will be dropped when + * the page is taken out of cache, see vvp_page_delete(). + */ +static inline bool __page_in_use(const struct cl_page *page, int refc) +{ + return (atomic_read(&page->cp_ref) > refc + 1); +} + +/** + * Caller itself holds a refcount of cl_page. + */ +#define cl_page_in_use(pg) __page_in_use(pg, 1) +/** + * Caller doesn't hold a refcount. + */ +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + /** @} cl_page */ /** \addtogroup cl_lock cl_lock @@ -1771,12 +1766,14 @@ struct cl_io { struct cl_setattr_io { struct ost_lvb sa_attr; unsigned int sa_valid; + int sa_stripe_index; + struct lu_fid *sa_parent_fid; } ci_setattr; struct cl_fault_io { /** page index within file. */ pgoff_t ft_index; /** bytes valid byte on a faulted page. */ - int ft_nob; + size_t ft_nob; /** writable page? for nopage() only */ int ft_writable; /** page of an executable? */ @@ -1909,7 +1906,7 @@ struct cl_req_attr { /** Generic attributes for the server consumption. */ struct obdo *cra_oa; /** Jobid */ - char cra_jobid[JOBSTATS_JOBID_SIZE]; + char cra_jobid[LUSTRE_JOBID_SIZE]; }; /** @@ -2176,14 +2173,16 @@ void cl_object_attr_lock(struct cl_object *o); void cl_object_attr_unlock(struct cl_object *o); int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, struct cl_attr *attr); -int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid); +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid); int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, struct ost_lvb *lvb); int cl_conf_set(const struct lu_env *env, struct cl_object *obj, const struct cl_object_conf *conf); int cl_object_prune(const struct lu_env *env, struct cl_object *obj); void cl_object_kill(const struct lu_env *env, struct cl_object *obj); +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum); /** * Returns true, iff \a o0 and \a o1 are slices of the same object. @@ -2197,6 +2196,7 @@ static inline void cl_object_page_init(struct cl_object *clob, int size) { clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); + WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); } static inline void *cl_object_page_slice(struct cl_object *clob, @@ -2263,6 +2263,8 @@ void cl_page_unassume(const struct lu_env *env, struct cl_io *io, struct cl_page *pg); void cl_page_disown(const struct lu_env *env, struct cl_io *io, struct cl_page *page); +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io); /** @} ownership */ @@ -2304,7 +2306,7 @@ int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, struct cl_page *page, pgoff_t *max_index); loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); pgoff_t cl_index(const struct cl_object *obj, loff_t offset); -int cl_page_size(const struct cl_object *obj); +size_t cl_page_size(const struct cl_object *obj); int cl_pages_prune(const struct lu_env *env, struct cl_object *obj); void cl_lock_print(const struct lu_env *env, void *cookie, @@ -2333,7 +2335,7 @@ struct cl_client_cache { /** * # of LRU entries available */ - atomic_t ccc_lru_left; + atomic_long_t ccc_lru_left; /** * List of entities(OSCs) for this LRU cache */ @@ -2347,14 +2349,18 @@ struct cl_client_cache { */ spinlock_t ccc_lru_lock; /** + * Set if unstable check is enabled + */ + unsigned int ccc_unstable_check:1; + /** * # of unstable pages for this mount point */ - atomic_t ccc_unstable_nr; + atomic_long_t ccc_unstable_nr; /** * Waitq for awaiting unstable pages to reach zero. * Used at umounting time and signaled on BRW commit */ - wait_queue_head_t ccc_unstable_waitq; + wait_queue_head_t ccc_unstable_waitq; }; diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h index 4a15228b5570..5d387d372547 100644 --- a/drivers/staging/lustre/lustre/include/interval_tree.h +++ b/drivers/staging/lustre/lustre/include/interval_tree.h @@ -63,6 +63,11 @@ static inline int interval_is_intree(struct interval_node *node) return node->in_intree == 1; } +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + static inline __u64 interval_high(struct interval_node *node) { return node->in_extent.end; @@ -77,8 +82,29 @@ static inline void interval_set(struct interval_node *node, node->in_max_high = end; } +/* + * Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + struct interval_node *interval_insert(struct interval_node *node, struct interval_node **root); void interval_erase(struct interval_node *node, struct interval_node **root); +/* + * Search the extents in the tree and call @func for each overlapped + * extents. + */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + #endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h deleted file mode 100644 index d18e8a76bb25..000000000000 --- a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LINUX_LL_H -#define _LINUX_LL_H - -#ifndef _LL_H -#error Do not #include this file directly. #include <lustre_lite.h> instead -#endif - -#include <linux/statfs.h> - -#include <linux/fs.h> -#include <linux/dcache.h> - -#include "../obd_class.h" -#include "../lustre_net.h" -#include "../lustre_ha.h" - -#include <linux/rbtree.h> -#include "../../include/linux/lustre_compat25.h" -#include <linux/pagemap.h> - -/* lprocfs.c */ -enum { - LPROC_LL_DIRTY_HITS = 0, - LPROC_LL_DIRTY_MISSES, - LPROC_LL_READ_BYTES, - LPROC_LL_WRITE_BYTES, - LPROC_LL_BRW_READ, - LPROC_LL_BRW_WRITE, - LPROC_LL_OSC_READ, - LPROC_LL_OSC_WRITE, - LPROC_LL_IOCTL, - LPROC_LL_OPEN, - LPROC_LL_RELEASE, - LPROC_LL_MAP, - LPROC_LL_LLSEEK, - LPROC_LL_FSYNC, - LPROC_LL_READDIR, - LPROC_LL_SETATTR, - LPROC_LL_TRUNC, - LPROC_LL_FLOCK, - LPROC_LL_GETATTR, - LPROC_LL_CREATE, - LPROC_LL_LINK, - LPROC_LL_UNLINK, - LPROC_LL_SYMLINK, - LPROC_LL_MKDIR, - LPROC_LL_RMDIR, - LPROC_LL_MKNOD, - LPROC_LL_RENAME, - LPROC_LL_STAFS, - LPROC_LL_ALLOC_INODE, - LPROC_LL_SETXATTR, - LPROC_LL_GETXATTR, - LPROC_LL_GETXATTR_HITS, - LPROC_LL_LISTXATTR, - LPROC_LL_REMOVEXATTR, - LPROC_LL_INODE_PERM, - LPROC_LL_FILE_OPCODES -}; - -#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h deleted file mode 100644 index e967950e8536..000000000000 --- a/drivers/staging/lustre/lustre/include/linux/lustre_user.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/linux/lustre_user.h - * - * Lustre public user-space interface definitions. - */ - -#ifndef _LINUX_LUSTRE_USER_H -#define _LINUX_LUSTRE_USER_H - -# include <linux/quota.h> - -/* - * asm-x86_64/processor.h on some SLES 9 distros seems to use - * kernel-only typedefs. fortunately skipping it altogether is ok - * (for now). - */ -#define __ASM_X86_64_PROCESSOR_H - -#include <linux/string.h> - -/* - * We need to always use 64bit version because the structure - * is shared across entire cluster where 32bit and 64bit machines - * are co-existing. - */ -#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) -typedef struct stat64 lstat_t; -#define lstat_f lstat64 -#else -typedef struct stat lstat_t; -#define lstat_f lstat -#endif - -#define HAVE_LOV_USER_MDS_DATA - -#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h index d68e60e7fef7..cc0713ef8ae5 100644 --- a/drivers/staging/lustre/lustre/include/lprocfs_status.h +++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h @@ -165,8 +165,10 @@ struct lprocfs_percpu { struct lprocfs_counter lp_cntr[0]; }; -#define LPROCFS_GET_NUM_CPU 0x0001 -#define LPROCFS_GET_SMP_ID 0x0002 +enum lprocfs_stats_lock_ops { + LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ + LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ +}; enum lprocfs_stats_flags { LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ @@ -363,82 +365,99 @@ static inline void s2dhms(struct dhms *ts, time64_t secs64) #define JOBSTATS_PROCNAME_UID "procname_uid" #define JOBSTATS_NODELOCAL "nodelocal" +/* obd_config.c */ +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + int lprocfs_write_frac_helper(const char __user *buffer, unsigned long count, int *val, int mult); int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, int mult); int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid); -/* - * \return value - * < 0 : on error (only possible for opc as LPROCFS_GET_SMP_ID) + +/** + * Lock statistics structure for access, possibly only on this CPU. + * + * The statistics struct may be allocated with per-CPU structures for + * efficient concurrent update (usually only on server-wide stats), or + * as a single global struct (e.g. for per-client or per-job statistics), + * so the required locking depends on the type of structure allocated. + * + * For per-CPU statistics, pin the thread to the current cpuid so that + * will only access the statistics for that CPU. If the stats structure + * for the current CPU has not been allocated (or previously freed), + * allocate it now. The per-CPU statistics do not need locking since + * the thread is pinned to the CPU during update. + * + * For global statistics, lock the stats structure to prevent concurrent update. + * + * \param[in] stats statistics structure to lock + * \param[in] opc type of operation: + * LPROCFS_GET_SMP_ID: "lock" and return current CPU index + * for incrementing statistics for that CPU + * LPROCFS_GET_NUM_CPU: "lock" and return number of used + * CPU indices to iterate over all indices + * \param[out] flags CPU interrupt saved state for IRQ-safe locking + * + * \retval cpuid of current thread or number of allocated structs + * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) */ -static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc, +static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, unsigned long *flags) { - int rc = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return opc == LPROCFS_GET_NUM_CPU ? 1 : 0; + } switch (opc) { - default: - LBUG(); + case LPROCFS_GET_SMP_ID: { + unsigned int cpuid = get_cpu(); - case LPROCFS_GET_SMP_ID: - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_lock_irqsave(&stats->ls_lock, *flags); - else - spin_lock(&stats->ls_lock); - return 0; - } else { - unsigned int cpuid = get_cpu(); - - if (unlikely(!stats->ls_percpu[cpuid])) { - rc = lprocfs_stats_alloc_one(stats, cpuid); - if (rc < 0) { - put_cpu(); - return rc; - } + if (unlikely(!stats->ls_percpu[cpuid])) { + int rc = lprocfs_stats_alloc_one(stats, cpuid); + + if (rc < 0) { + put_cpu(); + return rc; } - return cpuid; } - + return cpuid; + } case LPROCFS_GET_NUM_CPU: - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_lock_irqsave(&stats->ls_lock, *flags); - else - spin_lock(&stats->ls_lock); - return 1; - } return stats->ls_biggest_alloc_num; + default: + LBUG(); } } -static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc, +/** + * Unlock statistics structure after access. + * + * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, + * or unpin this thread from the current cpuid for per-CPU statistics. + * + * This function must be called using the same arguments as used when calling + * lprocfs_stats_lock() so that the correct operation can be performed. + * + * \param[in] stats statistics structure to unlock + * \param[in] opc type of operation (current cpuid or number of structs) + * \param[in] flags CPU interrupt saved state for IRQ-safe locking + */ +static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, + enum lprocfs_stats_lock_ops opc, unsigned long *flags) { - switch (opc) { - default: - LBUG(); - - case LPROCFS_GET_SMP_ID: - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_unlock_irqrestore(&stats->ls_lock, *flags); - else - spin_unlock(&stats->ls_lock); - } else { - put_cpu(); - } - return; - - case LPROCFS_GET_NUM_CPU: - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_unlock_irqrestore(&stats->ls_lock, *flags); - else - spin_unlock(&stats->ls_lock); - } - return; + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_unlock_irqrestore(&stats->ls_lock, *flags); + else + spin_unlock(&stats->ls_lock); + } else if (opc == LPROCFS_GET_SMP_ID) { + put_cpu(); } } @@ -496,7 +515,7 @@ static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, enum lprocfs_fields_flags field) { - int i; + unsigned int i; unsigned int num_cpu; unsigned long flags = 0; __u64 ret = 0; @@ -681,6 +700,12 @@ static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) extern const struct sysfs_ops lustre_sysfs_ops; +struct root_squash_info; +int lprocfs_wr_root_squash(const char *buffer, unsigned long count, + struct root_squash_info *squash, char *name); +int lprocfs_wr_nosquash_nids(const char *buffer, unsigned long count, + struct root_squash_info *squash, char *name); + /* all quota proc functions */ int lprocfs_quota_rd_bunit(char *page, char **start, loff_t off, int count, diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h index 6e25c1bb6aa3..260643ee0d48 100644 --- a/drivers/staging/lustre/lustre/include/lu_object.h +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -327,7 +327,7 @@ struct lu_device_type { /** * Number of existing device type instances. */ - unsigned ldt_device_nr; + atomic_t ldt_device_nr; /** * Linkage into a global list of all device types. * @@ -602,7 +602,7 @@ struct lu_site { /** * index of bucket on hash table while purging */ - int ls_purge_start; + unsigned int ls_purge_start; /** * Top-level device for this stack. */ @@ -623,6 +623,11 @@ struct lu_site { spinlock_t ls_ld_lock; /** + * Lock to serialize site purge. + */ + struct mutex ls_purge_mutex; + + /** * lu_site stats */ struct lprocfs_stats *ls_stats; @@ -673,7 +678,6 @@ void lu_object_add(struct lu_object *before, struct lu_object *o); int lu_device_type_init(struct lu_device_type *ldt); void lu_device_type_fini(struct lu_device_type *ldt); -void lu_types_stop(void); /** @} ctors */ @@ -1025,7 +1029,8 @@ enum lu_context_tag { /** * Contexts usable in cache shrinker thread. */ - LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF + LCT_SHRINKER = LCT_MD_THREAD | LCT_DT_THREAD | LCT_CL_THREAD | + LCT_NOREF }; /** @@ -1264,12 +1269,28 @@ struct lu_name { }; /** + * Validate names (path components) + * + * To be valid \a name must be non-empty, '\0' terminated of length \a + * name_len, and not contain '/'. The maximum length of a name (before + * say -ENAMETOOLONG will be returned) is really controlled by llite + * and the server. We only check for something insane coming from bad + * integer handling here. + */ +static inline bool lu_name_is_valid_2(const char *name, size_t name_len) +{ + return name && name_len > 0 && name_len < INT_MAX && + name[name_len] == '\0' && strlen(name) == name_len && + !memchr(name, '/', name_len); +} + +/** * Common buffer structure to be passed around for various xattr_{s,g}et() * methods. */ struct lu_buf { void *lb_buf; - ssize_t lb_len; + size_t lb_len; }; #define DLUBUF "(%p %zu)" @@ -1298,5 +1319,12 @@ struct lu_kmem_descr { int lu_kmem_init(struct lu_kmem_descr *caches); void lu_kmem_fini(struct lu_kmem_descr *caches); +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, size_t size); +void lu_buf_realloc(struct lu_buf *buf, size_t size); + +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len); + /** @} lu */ #endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h index 051864c23b5b..72eaee95c6b8 100644 --- a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h @@ -93,6 +93,7 @@ /* Defn's shared with user-space. */ #include "lustre_user.h" #include "lustre_errno.h" +#include "../lustre_ver.h" /* * GENERAL STUFF @@ -196,12 +197,12 @@ static inline unsigned fld_range_type(const struct lu_seq_range *range) return range->lsr_flags & LU_SEQ_RANGE_MASK; } -static inline int fld_range_is_ost(const struct lu_seq_range *range) +static inline bool fld_range_is_ost(const struct lu_seq_range *range) { return fld_range_type(range) == LU_SEQ_RANGE_OST; } -static inline int fld_range_is_mdt(const struct lu_seq_range *range) +static inline bool fld_range_is_mdt(const struct lu_seq_range *range) { return fld_range_type(range) == LU_SEQ_RANGE_MDT; } @@ -260,23 +261,23 @@ static inline void range_init(struct lu_seq_range *range) * check if given seq id \a s is within given range \a r */ -static inline int range_within(const struct lu_seq_range *range, - __u64 s) +static inline bool range_within(const struct lu_seq_range *range, + __u64 s) { return s >= range->lsr_start && s < range->lsr_end; } -static inline int range_is_sane(const struct lu_seq_range *range) +static inline bool range_is_sane(const struct lu_seq_range *range) { return (range->lsr_end >= range->lsr_start); } -static inline int range_is_zero(const struct lu_seq_range *range) +static inline bool range_is_zero(const struct lu_seq_range *range) { return (range->lsr_start == 0 && range->lsr_end == 0); } -static inline int range_is_exhausted(const struct lu_seq_range *range) +static inline bool range_is_exhausted(const struct lu_seq_range *range) { return range_space(range) == 0; @@ -437,69 +438,69 @@ enum dot_lustre_oid { FID_OID_DOT_LUSTRE_OBF = 2UL, }; -static inline int fid_seq_is_mdt0(__u64 seq) +static inline bool fid_seq_is_mdt0(__u64 seq) { return (seq == FID_SEQ_OST_MDT0); } -static inline int fid_seq_is_mdt(const __u64 seq) +static inline bool fid_seq_is_mdt(__u64 seq) { return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; }; -static inline int fid_seq_is_echo(__u64 seq) +static inline bool fid_seq_is_echo(__u64 seq) { return (seq == FID_SEQ_ECHO); } -static inline int fid_is_echo(const struct lu_fid *fid) +static inline bool fid_is_echo(const struct lu_fid *fid) { return fid_seq_is_echo(fid_seq(fid)); } -static inline int fid_seq_is_llog(__u64 seq) +static inline bool fid_seq_is_llog(__u64 seq) { return (seq == FID_SEQ_LLOG); } -static inline int fid_is_llog(const struct lu_fid *fid) +static inline bool fid_is_llog(const struct lu_fid *fid) { /* file with OID == 0 is not llog but contains last oid */ return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; } -static inline int fid_seq_is_rsvd(const __u64 seq) +static inline bool fid_seq_is_rsvd(__u64 seq) { return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD); }; -static inline int fid_seq_is_special(const __u64 seq) +static inline bool fid_seq_is_special(__u64 seq) { return seq == FID_SEQ_SPECIAL; }; -static inline int fid_seq_is_local_file(const __u64 seq) +static inline bool fid_seq_is_local_file(__u64 seq) { return seq == FID_SEQ_LOCAL_FILE || seq == FID_SEQ_LOCAL_NAME; }; -static inline int fid_seq_is_root(const __u64 seq) +static inline bool fid_seq_is_root(__u64 seq) { return seq == FID_SEQ_ROOT; } -static inline int fid_seq_is_dot(const __u64 seq) +static inline bool fid_seq_is_dot(__u64 seq) { return seq == FID_SEQ_DOT_LUSTRE; } -static inline int fid_seq_is_default(const __u64 seq) +static inline bool fid_seq_is_default(__u64 seq) { return seq == FID_SEQ_LOV_DEFAULT; } -static inline int fid_is_mdt0(const struct lu_fid *fid) +static inline bool fid_is_mdt0(const struct lu_fid *fid) { return fid_seq_is_mdt0(fid_seq(fid)); } @@ -516,12 +517,12 @@ static inline void lu_root_fid(struct lu_fid *fid) * \param fid the fid to be tested. * \return true if the fid is a igif; otherwise false. */ -static inline int fid_seq_is_igif(const __u64 seq) +static inline bool fid_seq_is_igif(__u64 seq) { return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; } -static inline int fid_is_igif(const struct lu_fid *fid) +static inline bool fid_is_igif(const struct lu_fid *fid) { return fid_seq_is_igif(fid_seq(fid)); } @@ -531,27 +532,27 @@ static inline int fid_is_igif(const struct lu_fid *fid) * \param fid the fid to be tested. * \return true if the fid is a idif; otherwise false. */ -static inline int fid_seq_is_idif(const __u64 seq) +static inline bool fid_seq_is_idif(__u64 seq) { return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; } -static inline int fid_is_idif(const struct lu_fid *fid) +static inline bool fid_is_idif(const struct lu_fid *fid) { return fid_seq_is_idif(fid_seq(fid)); } -static inline int fid_is_local_file(const struct lu_fid *fid) +static inline bool fid_is_local_file(const struct lu_fid *fid) { return fid_seq_is_local_file(fid_seq(fid)); } -static inline int fid_seq_is_norm(const __u64 seq) +static inline bool fid_seq_is_norm(__u64 seq) { return (seq >= FID_SEQ_NORMAL); } -static inline int fid_is_norm(const struct lu_fid *fid) +static inline bool fid_is_norm(const struct lu_fid *fid) { return fid_seq_is_norm(fid_seq(fid)); } @@ -658,7 +659,7 @@ static inline void ostid_set_id(struct ost_id *oi, __u64 oid) oi->oi_fid.f_oid = oid; oi->oi_fid.f_ver = oid >> 48; } else { - if (oid > OBIF_MAX_OID) { + if (oid >= OBIF_MAX_OID) { CERROR("Bad %llu to set " DOSTID "\n", oid, POSTID(oi)); return; } @@ -683,7 +684,7 @@ static inline int fid_set_id(struct lu_fid *fid, __u64 oid) fid->f_oid = oid; fid->f_ver = oid >> 48; } else { - if (oid > OBIF_MAX_OID) { + if (oid >= OBIF_MAX_OID) { CERROR("Too large OID %#llx to set REG "DFID"\n", (unsigned long long)oid, PFID(fid)); return -EBADF; @@ -769,7 +770,7 @@ static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) } /* Check whether the fid is for LAST_ID */ -static inline int fid_is_last_id(const struct lu_fid *fid) +static inline bool fid_is_last_id(const struct lu_fid *fid) { return (fid_oid(fid) == 0); } @@ -838,7 +839,7 @@ static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) dst->f_ver = be32_to_cpu(fid_ver(src)); } -static inline int fid_is_sane(const struct lu_fid *fid) +static inline bool fid_is_sane(const struct lu_fid *fid) { return fid && ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) || @@ -846,15 +847,10 @@ static inline int fid_is_sane(const struct lu_fid *fid) fid_seq_is_rsvd(fid_seq(fid))); } -static inline int fid_is_zero(const struct lu_fid *fid) -{ - return fid_seq(fid) == 0 && fid_oid(fid) == 0; -} - void lustre_swab_lu_fid(struct lu_fid *fid); void lustre_swab_lu_seq_range(struct lu_seq_range *range); -static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) { return memcmp(f0, f1, sizeof(*f0)) == 0; } @@ -1017,12 +1013,12 @@ static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) return next; } -static inline int lu_dirent_calc_size(int namelen, __u16 attr) +static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr) { - int size; + size_t size; if (attr & LUDA_TYPE) { - const unsigned align = sizeof(struct luda_type) - 1; + const size_t align = sizeof(struct luda_type) - 1; size = (sizeof(struct lu_dirent) + namelen + align) & ~align; size += sizeof(struct luda_type); @@ -1033,15 +1029,6 @@ static inline int lu_dirent_calc_size(int namelen, __u16 attr) return (size + 7) & ~7; } -static inline int lu_dirent_size(struct lu_dirent *ent) -{ - if (le16_to_cpu(ent->lde_reclen) == 0) { - return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen), - le32_to_cpu(ent->lde_attrs)); - } - return le16_to_cpu(ent->lde_reclen); -} - #define MDS_DIR_END_OFF 0xfffffffffffffffeULL /** @@ -1067,19 +1054,19 @@ struct lustre_handle { #define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL -static inline int lustre_handle_is_used(struct lustre_handle *lh) +static inline bool lustre_handle_is_used(const struct lustre_handle *lh) { return lh->cookie != 0ull; } -static inline int lustre_handle_equal(const struct lustre_handle *lh1, - const struct lustre_handle *lh2) +static inline bool lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) { return lh1->cookie == lh2->cookie; } static inline void lustre_handle_copy(struct lustre_handle *tgt, - struct lustre_handle *src) + const struct lustre_handle *src) { tgt->cookie = src->cookie; } @@ -1105,7 +1092,7 @@ struct lustre_msg_v2 { /* without gss, ptlrpc_body is put at the first buffer. */ #define PTLRPC_NUM_VERSIONS 4 -#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */ + struct ptlrpc_body_v3 { struct lustre_handle pb_handle; __u32 pb_type; @@ -1127,7 +1114,7 @@ struct ptlrpc_body_v3 { __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; /* padding for future needs */ __u64 pb_padding[4]; - char pb_jobid[JOBSTATS_JOBID_SIZE]; + char pb_jobid[LUSTRE_JOBID_SIZE]; }; #define ptlrpc_body ptlrpc_body_v3 @@ -1293,6 +1280,9 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack * name in request */ +#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ +#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ +#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL/* striped DNE dir */ /* XXX README XXX: * Please DO NOT add flag values here before first ensuring that this same @@ -1318,14 +1308,6 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ OBD_CONNECT_FULL20) -#define OBD_OCD_VERSION(major, minor, patch, fix) (((major)<<24) + \ - ((minor)<<16) + \ - ((patch)<<8) + (fix)) -#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) -#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) -#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) -#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) - /* This structure is used for both request and reply. * * If we eventually have separate connect data for different types, which we @@ -1478,10 +1460,23 @@ enum obdo_flags { OBD_FL_LOCAL_MASK = 0xF0000000, }; -#define LOV_MAGIC_V1 0x0BD10BD0 -#define LOV_MAGIC LOV_MAGIC_V1 -#define LOV_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_MAGIC_V3 0x0BD30BD0 +/* + * All LOV EA magics should have the same postfix, if some new version + * Lustre instroduces new LOV EA magic, then when down-grade to an old + * Lustre, even though the old version system does not recognizes such + * new magic, it still can distinguish the corrupted cases by checking + * the magic's postfix. + */ +#define LOV_MAGIC_MAGIC 0x0BD0 +#define LOV_MAGIC_MASK 0xFFFF + +#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC) +/* reserved for specifying OSTs */ +#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC) +#define LOV_MAGIC LOV_MAGIC_V1 /* * magic for fully defined striping @@ -1498,14 +1493,6 @@ enum obdo_flags { #define LOV_MAGIC_V1_DEF 0x0CD10BD0 #define LOV_MAGIC_V3_DEF 0x0CD30BD0 -#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ -#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ -#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */ -#define LOV_PATTERN_CMOBD 0x200 - -#define LOV_PATTERN_F_MASK 0xffff0000 -#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ - #define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) #define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) @@ -1569,25 +1556,25 @@ static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid) oi->oi.oi_id = oid; } -static inline __u64 lmm_oi_id(struct ost_id *oi) +static inline __u64 lmm_oi_id(const struct ost_id *oi) { return oi->oi.oi_id; } -static inline __u64 lmm_oi_seq(struct ost_id *oi) +static inline __u64 lmm_oi_seq(const struct ost_id *oi) { return oi->oi.oi_seq; } static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, - struct ost_id *src_oi) + const struct ost_id *src_oi) { dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); } static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, - struct ost_id *src_oi) + const struct ost_id *src_oi) { dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); @@ -1610,6 +1597,7 @@ static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, #define XATTR_NAME_LOV "trusted.lov" #define XATTR_NAME_LMA "trusted.lma" #define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_DEFAULT_LMV "trusted.dmv" #define XATTR_NAME_LINK "trusted.link" #define XATTR_NAME_FID "trusted.fid" #define XATTR_NAME_VERSION "trusted.version" @@ -1625,7 +1613,7 @@ struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ /* lmm_stripe_count used to be __u32 */ __u16 lmm_stripe_count; /* num stripes in use for this object */ __u16 lmm_layout_gen; /* layout generation number */ - char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ }; @@ -1727,6 +1715,8 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) #define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ #define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */ +#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ + #define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ @@ -1782,7 +1772,7 @@ void lustre_swab_obd_statfs(struct obd_statfs *os); * it to sync quickly */ -#define OBD_OBJECT_EOF 0xffffffffffffffffULL +#define OBD_OBJECT_EOF LUSTRE_EOF #define OST_MIN_PRECREATE 32 #define OST_MAX_PRECREATE 20000 @@ -1806,9 +1796,9 @@ void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); /* multiple of 8 bytes => can array */ struct niobuf_remote { - __u64 offset; - __u32 len; - __u32 flags; + __u64 rnb_offset; + __u32 rnb_len; + __u32 rnb_flags; }; void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); @@ -1878,12 +1868,6 @@ struct obd_quotactl { void lustre_swab_obd_quotactl(struct obd_quotactl *q); -#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ -#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ -#define Q_GETOINFO 0x800102 /* get obd quota info */ -#define Q_GETOQUOTA 0x800103 /* get obd quotas */ -#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ - #define Q_COPY(out, in, member) (out)->member = (in)->member #define QCTL_COPY(out, in) \ @@ -1946,8 +1930,8 @@ enum mds_cmd { MDS_DISCONNECT = 39, MDS_GETSTATUS = 40, MDS_STATFS = 41, - MDS_PIN = 42, - MDS_UNPIN = 43, + MDS_PIN = 42, /* obsolete, never used in a release */ + MDS_UNPIN = 43, /* obsolete, never used in a release */ MDS_SYNC = 44, MDS_DONE_WRITING = 45, MDS_SET_INFO = 46, @@ -1956,7 +1940,7 @@ enum mds_cmd { MDS_GETXATTR = 49, MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ MDS_WRITEPAGE = 51, - MDS_IS_SUBDIR = 52, + MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ MDS_GET_INFO = 53, MDS_HSM_STATE_GET = 54, MDS_HSM_STATE_SET = 55, @@ -1984,7 +1968,7 @@ enum mdt_reint_cmd { REINT_OPEN = 6, REINT_SETXATTR = 7, REINT_RMENTRY = 8, -/* REINT_WRITE = 9, */ + REINT_MIGRATE = 9, REINT_MAX }; @@ -2003,6 +1987,7 @@ void lustre_swab_generic_32s(__u32 *val); #define DISP_OPEN_LOCK 0x02000000 #define DISP_OPEN_LEASE 0x04000000 #define DISP_OPEN_STRIPE 0x08000000 +#define DISP_OPEN_DENY 0x10000000 /* INODE LOCK PARTS */ #define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also @@ -2028,7 +2013,7 @@ void lustre_swab_generic_32s(__u32 *val); #define MDS_INODELOCK_MAXSHIFT 5 /* This FULL lock is useful to take on unlink sort of operations */ -#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) +#define MDS_INODELOCK_FULL ((1 << (MDS_INODELOCK_MAXSHIFT + 1)) - 1) /* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], * but was moved into name[1] along with the OID to avoid consuming the @@ -2108,43 +2093,43 @@ enum md_transient_state { }; struct mdt_body { - struct lu_fid fid1; - struct lu_fid fid2; - struct lustre_handle handle; - __u64 valid; - __u64 size; /* Offset, in the case of MDS_READPAGE */ - __s64 mtime; - __s64 atime; - __s64 ctime; - __u64 blocks; /* XID, in the case of MDS_READPAGE */ - __u64 ioepoch; - __u64 t_state; /* transient file state defined in - * enum md_transient_state - * was "ino" until 2.4.0 - */ - __u32 fsuid; - __u32 fsgid; - __u32 capability; - __u32 mode; - __u32 uid; - __u32 gid; - __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */ - __u32 rdev; - __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 unused2; /* was "generation" until 2.4.0 */ - __u32 suppgid; - __u32 eadatasize; - __u32 aclsize; - __u32 max_mdsize; - __u32 max_cookiesize; - __u32 uid_h; /* high 32-bits of uid, for FUID */ - __u32 gid_h; /* high 32-bits of gid, for FUID */ - __u32 padding_5; /* also fix lustre_swab_mdt_body */ - __u64 padding_6; - __u64 padding_7; - __u64 padding_8; - __u64 padding_9; - __u64 padding_10; + struct lu_fid mbo_fid1; + struct lu_fid mbo_fid2; + struct lustre_handle mbo_handle; + __u64 mbo_valid; + __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ + __s64 mbo_mtime; + __s64 mbo_atime; + __s64 mbo_ctime; + __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ + __u64 mbo_ioepoch; + __u64 mbo_t_state; /* transient file state defined in + * enum md_transient_state + * was "ino" until 2.4.0 + */ + __u32 mbo_fsuid; + __u32 mbo_fsgid; + __u32 mbo_capability; + __u32 mbo_mode; + __u32 mbo_uid; + __u32 mbo_gid; + __u32 mbo_flags; + __u32 mbo_rdev; + __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 mbo_unused2; /* was "generation" until 2.4.0 */ + __u32 mbo_suppgid; + __u32 mbo_eadatasize; + __u32 mbo_aclsize; + __u32 mbo_max_mdsize; + __u32 mbo_max_cookiesize; + __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ + __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ + __u32 mbo_padding_5; /* also fix lustre_swab_mdt_body */ + __u64 mbo_padding_6; + __u64 mbo_padding_7; + __u64 mbo_padding_8; + __u64 mbo_padding_9; + __u64 mbo_padding_10; }; /* 216 */ void lustre_swab_mdt_body(struct mdt_body *b); @@ -2263,6 +2248,11 @@ void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); */ #define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ +#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ + MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ + MDS_OPEN_RELEASE) + enum mds_op_bias { MDS_CHECK_SPLIT = 1 << 0, MDS_CROSS_REF = 1 << 1, @@ -2277,6 +2267,7 @@ enum mds_op_bias { MDS_CREATE_VOLATILE = 1 << 10, MDS_OWNEROVERRIDE = 1 << 11, MDS_HSM_RELEASE = 1 << 12, + MDS_RENAME_MIGRATE = BIT(13), }; /* instance of mdt_reint_rec */ @@ -2472,7 +2463,7 @@ struct lmv_desc { __u32 ld_tgt_count; /* how many MDS's */ __u32 ld_active_tgt_count; /* how many active */ __u32 ld_default_stripe_count; /* how many objects are used */ - __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u32 ld_pattern; /* default hash pattern */ __u64 ld_default_hash_size; __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ @@ -2482,23 +2473,129 @@ struct lmv_desc { struct obd_uuid ld_uuid; }; -/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ -struct lmv_stripe_md { - __u32 mea_magic; - __u32 mea_count; - __u32 mea_master; - __u32 mea_padding; - char mea_pool_name[LOV_MAXPOOLNAME]; - struct lu_fid mea_ids[0]; +/* LMV layout EA, and it will be stored both in master and slave object */ +struct lmv_mds_md_v1 { + __u32 lmv_magic; + __u32 lmv_stripe_count; + __u32 lmv_master_mdt_index; /* On master object, it is master + * MDT index, on slave object, it + * is stripe index of the slave obj + */ + __u32 lmv_hash_type; /* dir stripe policy, i.e. indicate + * which hash function to be used, + * Note: only lower 16 bits is being + * used for now. Higher 16 bits will + * be used to mark the object status, + * for example migrating or dead. + */ + __u32 lmv_layout_version; /* Used for directory restriping */ + __u32 lmv_padding1; + __u64 lmv_padding2; + __u64 lmv_padding3; + char lmv_pool_name[LOV_MAXPOOLNAME + 1];/* pool name */ + struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ }; -#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 -#define MEA_MAGIC_ALL_CHARS 0xb222a11c -#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b +#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ +#define LMV_MAGIC LMV_MAGIC_V1 + +/* #define LMV_USER_MAGIC 0x0CD30CD0 */ +#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ + +/* + *Right now only the lower part(0-16bits) of lmv_hash_type is being used, + * and the higher part will be the flag to indicate the status of object, + * for example the object is being migrated. And the hash function + * might be interpreted differently with different flags. + */ +#define LMV_HASH_TYPE_MASK 0x0000ffff + +#define LMV_HASH_FLAG_MIGRATION 0x80000000 +#define LMV_HASH_FLAG_DEAD 0x40000000 -#define MAX_HASH_SIZE_32 0x7fffffffUL -#define MAX_HASH_SIZE 0x7fffffffffffffffULL -#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL +/** + * The FNV-1a hash algorithm is as follows: + * hash = FNV_offset_basis + * for each octet_of_data to be hashed + * hash = hash XOR octet_of_data + * hash = hash × FNV_prime + * return hash + * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source + * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL + **/ +#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL +#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL +static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) +{ + __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; + const unsigned char *p = buf; + size_t i; + + for (i = 0; i < size; i++) { + hash ^= p[i]; + hash *= LUSTRE_FNV_1A_64_PRIME; + } + + return hash; +} + +union lmv_mds_md { + __u32 lmv_magic; + struct lmv_mds_md_v1 lmv_md_v1; + struct lmv_user_md lmv_user_md; +}; + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); + +static inline ssize_t lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) +{ + ssize_t len = -EINVAL; + + switch (lmm_magic) { + case LMV_MAGIC_V1: { + struct lmv_mds_md_v1 *lmm1; + + len = sizeof(*lmm1); + len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); + break; } + default: + break; + } + return len; +} + +static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) +{ + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + return le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); + case LMV_USER_MAGIC: + return le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); + default: + return -EINVAL; + } +} + +static inline int lmv_mds_md_stripe_count_set(union lmv_mds_md *lmm, + unsigned int stripe_count) +{ + int rc = 0; + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + lmm->lmv_md_v1.lmv_stripe_count = cpu_to_le32(stripe_count); + break; + case LMV_USER_MAGIC: + lmm->lmv_user_md.lum_stripe_count = cpu_to_le32(stripe_count); + break; + default: + rc = -EINVAL; + break; + } + return rc; +} enum fld_rpc_opc { FLD_QUERY = 900, @@ -2582,8 +2679,8 @@ struct ldlm_res_id { #define PLDLMRES(res) (res)->lr_name.name[0], (res)->lr_name.name[1], \ (res)->lr_name.name[2], (res)->lr_name.name[3] -static inline int ldlm_res_eq(const struct ldlm_res_id *res0, - const struct ldlm_res_id *res1) +static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) { return !memcmp(res0, res1, sizeof(*res0)); } @@ -2620,17 +2717,15 @@ struct ldlm_extent { __u64 gid; }; -#define LDLM_GID_ANY ((__u64)-1) - -static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, - struct ldlm_extent *ex2) +static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) { return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); } /* check if @ex1 contains @ex2 */ -static inline int ldlm_extent_contain(struct ldlm_extent *ex1, - struct ldlm_extent *ex2) +static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, + const struct ldlm_extent *ex2) { return (ex1->start <= ex2->start) && (ex1->end >= ex2->end); } @@ -2833,7 +2928,29 @@ enum obd_cmd { }; #define OBD_FIRST_OPC OBD_PING -/* catalog of log objects */ +/** + * llog contexts indices. + * + * There is compatibility problem with indexes below, they are not + * continuous and must keep their numbers for compatibility needs. + * See LU-5218 for details. + */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT = 1, + LLOG_MDS_OST_ORIG_CTXT = 2, + LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ + LLOG_SIZE_ORIG_CTXT = 4, + LLOG_SIZE_REPL_CTXT = 5, + LLOG_TEST_ORIG_CTXT = 8, + LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ + LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ + /* for multiple changelog consumers */ + LLOG_CHANGELOG_USER_ORIG_CTXT = 14, + LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ + LLOG_MAX_CTXTS +}; /** Identifier for a single log object */ struct llog_logid { @@ -2939,7 +3056,7 @@ struct llog_setattr64_rec { __u32 lsr_uid_h; __u32 lsr_gid; __u32 lsr_gid_h; - __u64 lsr_padding; + __u64 lsr_valid; struct llog_rec_tail lsr_tail; } __packed; @@ -2963,15 +3080,9 @@ struct changelog_setinfo { /** changelog record */ struct llog_changelog_rec { - struct llog_rec_hdr cr_hdr; - struct changelog_rec cr; - struct llog_rec_tail cr_tail; /**< for_sizezof_only */ -} __packed; - -struct llog_changelog_ext_rec { - struct llog_rec_hdr cr_hdr; - struct changelog_ext_rec cr; - struct llog_rec_tail cr_tail; /**< for_sizezof_only */ + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; /**< Variable length field */ + struct llog_rec_tail cr_do_not_use; /**< for_sizezof_only */ } __packed; struct llog_changelog_user_rec { @@ -2990,7 +3101,7 @@ enum agent_req_status { ARS_SUCCEED, }; -static inline char *agent_req_status2name(enum agent_req_status ars) +static inline const char *agent_req_status2name(const enum agent_req_status ars) { switch (ars) { case ARS_WAITING: @@ -3056,6 +3167,9 @@ enum llog_flag { LLOG_F_ZAP_WHEN_EMPTY = 0x1, LLOG_F_IS_CAT = 0x2, LLOG_F_IS_PLAIN = 0x4, + LLOG_F_EXT_JOBID = BIT(3), + + LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, }; struct llog_log_hdr { @@ -3068,8 +3182,8 @@ struct llog_log_hdr { __u32 llh_cat_idx; /* for a catalog the first plain slot is next to it */ struct obd_uuid llh_tgtuuid; - __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23]; - __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)]; + __u32 llh_reserved[LLOG_HEADER_SIZE / sizeof(__u32) - 23]; + __u32 llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)]; struct llog_rec_tail llh_tail; } __packed; @@ -3166,7 +3280,7 @@ struct obdo { #define o_cksum o_nlink #define o_grant_used o_data_version -static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd, +static inline void lustre_set_wire_obdo(const struct obd_connect_data *ocd, struct obdo *wobdo, const struct obdo *lobdo) { @@ -3185,7 +3299,7 @@ static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd, } } -static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd, +static inline void lustre_get_wire_obdo(const struct obd_connect_data *ocd, struct obdo *lobdo, const struct obdo *wobdo) { @@ -3284,17 +3398,17 @@ void lustre_swab_lustre_capa(struct lustre_capa *c); /** lustre_capa::lc_opc */ enum { - CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ - CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ - CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ - CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ - CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ - CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ - CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ - CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ - CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ - CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ - CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ + CAPA_OPC_BODY_WRITE = 1 << 0, /**< write object data */ + CAPA_OPC_BODY_READ = 1 << 1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1 << 2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1 << 3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1 << 4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1 << 5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1 << 6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1 << 7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1 << 8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1 << 9, /**< write object meta data */ + CAPA_OPC_META_READ = 1 << 10, /**< read object meta data */ }; #define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) @@ -3346,6 +3460,14 @@ struct getinfo_fid2path { void lustre_swab_fid2path(struct getinfo_fid2path *gf); +/** path2parent request/reply structures */ +struct getparent { + struct lu_fid gp_fid; /**< parent FID */ + __u32 gp_linkno; /**< hardlink number */ + __u32 gp_name_size; /**< size of the name field */ + char gp_name[0]; /**< zero-terminated link name */ +} __packed; + enum { LAYOUT_INTENT_ACCESS = 0, LAYOUT_INTENT_READ = 1, diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_ioctl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_ioctl.h new file mode 100644 index 000000000000..f3d7c94c3b50 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_ioctl.h @@ -0,0 +1,412 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2015, Intel Corporation. + */ +#ifndef LUSTRE_IOCTL_H_ +#define LUSTRE_IOCTL_H_ + +#include <linux/types.h> +#include "../../../include/linux/libcfs/libcfs.h" +#include "lustre_idl.h" + +#ifdef __KERNEL__ +# include <linux/ioctl.h> +# include <linux/string.h> +# include "../obd_support.h" +#else /* __KERNEL__ */ +# include <malloc.h> +# include <string.h> +#include <libcfs/util/ioctl.h> +#endif /* !__KERNEL__ */ + +#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) +# error This file is for Lustre internal use only. +#endif + +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME +#define OBD_DEV_MAJOR 10 +#define OBD_DEV_MINOR 241 + +#define OBD_IOCTL_VERSION 0x00010004 +#define OBD_DEV_BY_DEVNAME 0xffffd0de +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + __u64 ioc_count; + __u64 ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char __user *ioc_pbuf1; + __u32 ioc_plen2; + char __user *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + __u32 len = cfs_size_round(sizeof(*data)); + + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + len += cfs_size_round(data->ioc_inllen3); + len += cfs_size_round(data->ioc_inllen4); + + return len; +} + +static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > (1 << 30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen1 > (1 << 30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen2 > (1 << 30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen3 > (1 << 30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inllen4 > (1 << 30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf3 && !data->ioc_inllen3) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_inlbuf4 && !data->ioc_inllen4) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + + if (!data->ioc_pbuf1 && data->ioc_plen1) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + + if (!data->ioc_pbuf2 && data->ioc_plen2) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + + return 0; +} + +#ifdef __KERNEL__ + +int obd_ioctl_getdata(char **buf, int *len, void __user *arg); +int obd_ioctl_popdata(void __user *arg, void *data, int len); + +static inline void obd_ioctl_freedata(char *buf, size_t len) +{ + kvfree(buf); +} + +#else /* __KERNEL__ */ + +static inline int obd_ioctl_pack(struct obd_ioctl_data *data, char **pbuf, + int max_len) +{ + char *ptr; + struct obd_ioctl_data *overlay; + + data->ioc_len = obd_ioctl_packlen(data); + data->ioc_version = OBD_IOCTL_VERSION; + + if (*pbuf && data->ioc_len > max_len) { + fprintf(stderr, "pbuf = %p, ioc_len = %u, max_len = %d\n", + *pbuf, data->ioc_len, max_len); + return -EINVAL; + } + + if (!*pbuf) + *pbuf = malloc(data->ioc_len); + + if (!*pbuf) + return -ENOMEM; + + overlay = (struct obd_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) + LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + + if (data->ioc_inlbuf2) + LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + + if (data->ioc_inlbuf3) + LOGL(data->ioc_inlbuf3, data->ioc_inllen3, ptr); + + if (data->ioc_inlbuf4) + LOGL(data->ioc_inlbuf4, data->ioc_inllen4, ptr); + + if (obd_ioctl_is_invalid(overlay)) { + fprintf(stderr, "invalid ioctl data: ioc_len = %u, max_len = %d\n", + data->ioc_len, max_len); + return -EINVAL; + } + + return 0; +} + +static inline int +obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf, int max_len) +{ + char *ptr; + struct obd_ioctl_data *overlay; + + if (!pbuf) + return 1; + + overlay = (struct obd_ioctl_data *)pbuf; + + /* Preserve the caller's buffer pointers */ + overlay->ioc_inlbuf1 = data->ioc_inlbuf1; + overlay->ioc_inlbuf2 = data->ioc_inlbuf2; + overlay->ioc_inlbuf3 = data->ioc_inlbuf3; + overlay->ioc_inlbuf4 = data->ioc_inlbuf4; + + memcpy(data, pbuf, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) + LOGU(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + + if (data->ioc_inlbuf2) + LOGU(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + + if (data->ioc_inlbuf3) + LOGU(data->ioc_inlbuf3, data->ioc_inllen3, ptr); + + if (data->ioc_inlbuf4) + LOGU(data->ioc_inlbuf4, data->ioc_inllen4, ptr); + + return 0; +} + +#endif /* !__KERNEL__ */ + +/* + * OBD_IOC_DATA_TYPE is only for compatibility reasons with older + * Linux Lustre user tools. New ioctls should NOT use this macro as + * the ioctl "size". Instead the ioctl should get a "size" argument + * which is the actual data type used by the ioctl, to ensure the + * ioctl interface is versioned correctly. + */ +#define OBD_IOC_DATA_TYPE long + +/* IOC_LDLM_TEST _IOWR('f', 40, long) */ +/* IOC_LDLM_DUMP _IOWR('f', 41, long) */ +/* IOC_LDLM_REGRESS_START _IOWR('f', 42, long) */ +/* IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) */ + +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) +/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) +/* OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) */ + +/* OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW('f', 136, OBD_IOC_DATA_TYPE) + +/* OBD_IOC_DEC_FS_USE_COUNT _IO('f', 139) */ +#define OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) +/* OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) */ +#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) +/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) +/* lustre/lustre_user.h 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ +/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ +/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ +/* lustre/lustre_user.h 157-159 */ +#define OBD_IOC_QUOTACHECK _IOW('f', 160, int) +#define OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* lustre/lustre_user.h 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) +/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ +/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_PARAM _IOW('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ +#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) + +/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ +/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ + +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* lustre/lustre_user.h 212-217 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) +/* lustre/lustre_user.h 240-249 */ +/* LIBCFS_IOC_DEBUG_MASK 250 */ + +#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) + +#endif /* LUSTRE_IOCTL_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h index ef6f38ff359e..6fc985571cba 100644 --- a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -42,8 +42,35 @@ * @{ */ +#ifdef __KERNEL__ +# include <linux/quota.h> +# include <linux/string.h> /* snprintf() */ +# include <linux/version.h> +#else /* !__KERNEL__ */ +# define NEED_QUOTA_DEFS +# include <stdio.h> /* snprintf() */ +# include <string.h> +# include <sys/quota.h> +# include <sys/stat.h> +#endif /* __KERNEL__ */ #include "ll_fiemap.h" -#include "../linux/lustre_user.h" + +/* + * We need to always use 64bit version because the structure + * is shared across entire cluster where 32bit and 64bit machines + * are co-existing. + */ +#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#else +typedef struct stat lstat_t; +#define lstat_f lstat +#endif + +#define HAVE_LOV_USER_MDS_DATA + +#define LUSTRE_EOF 0xffffffffffffffffULL /* for statfs() */ #define LL_SUPER_MAGIC 0x0BD00BD0 @@ -117,6 +144,11 @@ struct lu_fid { __u32 f_ver; }; +static inline bool fid_is_zero(const struct lu_fid *fid) +{ + return !fid->f_seq && !fid->f_oid; +} + struct filter_fid { struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ }; @@ -167,7 +199,7 @@ struct lustre_mdt_attrs { */ struct ost_id { union { - struct ostid { + struct { __u64 oi_id; __u64 oi_seq; } oi; @@ -188,26 +220,20 @@ struct ost_id { * *STRIPE* - set/get lov_user_md * *INFO - set/get lov_user_mds_data */ -/* see <lustre_lib.h> for ioctl numberss 101-150 */ +/* lustre_ioctl.h 101-150 */ #define LL_IOC_GETFLAGS _IOR('f', 151, long) #define LL_IOC_SETFLAGS _IOW('f', 152, long) #define LL_IOC_CLRFLAGS _IOW('f', 153, long) -/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */ #define LL_IOC_LOV_SETSTRIPE _IOW('f', 154, long) -/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */ #define LL_IOC_LOV_GETSTRIPE _IOW('f', 155, long) -/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */ #define LL_IOC_LOV_SETEA _IOW('f', 156, long) -#define LL_IOC_RECREATE_OBJ _IOW('f', 157, long) -#define LL_IOC_RECREATE_FID _IOW('f', 157, struct lu_fid) +/* LL_IOC_RECREATE_OBJ 157 obsolete */ +/* LL_IOC_RECREATE_FID 158 obsolete */ #define LL_IOC_GROUP_LOCK _IOW('f', 158, long) #define LL_IOC_GROUP_UNLOCK _IOW('f', 159, long) -/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */ -#define LL_IOC_QUOTACHECK _IOW('f', 160, int) -/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */ -#define LL_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) -/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */ -#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* #define LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ +/* #define LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ +/* #define LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ #define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) #define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *) #define LL_IOC_FLUSHCTX _IOW('f', 166, long) @@ -221,8 +247,7 @@ struct ost_id { #define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) #define LL_IOC_GET_MDTIDX _IOR('f', 175, int) -/* see <lustre_lib.h> for ioctl numbers 177-210 */ - +/* lustre_ioctl.h 177-210 */ #define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) #define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) #define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) @@ -242,6 +267,17 @@ struct ost_id { #define LL_IOC_SET_LEASE _IOWR('f', 243, long) #define LL_IOC_GET_LEASE _IO('f', 244) #define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) +#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) +#define LL_IOC_MIGRATE _IOR('f', 247, int) +#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) +#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) + +/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ +enum ll_lease_type { + LL_LEASE_RDLCK = 0x1, + LL_LEASE_WRLCK = 0x2, + LL_LEASE_UNLCK = 0x4, +}; #define LL_STATFS_LMV 1 #define LL_STATFS_LOV 2 @@ -253,10 +289,6 @@ struct ost_id { #define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) #define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) -/* Keep these for backward compartability. */ -#define LL_IOC_OBD_STATFS IOC_OBD_STATFS -#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE - #define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ /* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular @@ -273,20 +305,26 @@ struct ost_id { #define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ #define LL_FILE_RMTACL 0x00000020 -#define LOV_USER_MAGIC_V1 0x0BD10BD0 -#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 -#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_USER_MAGIC_V3 0x0BD30BD0 +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 +/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ +#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ + +#define LMV_USER_MAGIC 0x0CD30CD0 /*default lmv magic*/ -#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */ -#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/ +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 +#define LOV_PATTERN_CMOBD 0x200 -#define LOV_PATTERN_RAID0 0x001 -#define LOV_PATTERN_RAID1 0x002 -#define LOV_PATTERN_FIRST 0x100 +#define LOV_PATTERN_F_MASK 0xffff0000 +#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ +#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ -#define LOV_MAXPOOLNAME 16 -#define LOV_POOLNAMEF "%.16s" +#define LOV_MAXPOOLNAME 15 +#define LOV_POOLNAMEF "%.15s" #define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ #define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) @@ -344,18 +382,17 @@ struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ * used when reading */ }; - char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ } __packed; static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) { - if (lmm_magic == LOV_USER_MAGIC_V3) - return sizeof(struct lov_user_md_v3) + - stripes * sizeof(struct lov_user_ost_data_v1); - else + if (lmm_magic == LOV_USER_MAGIC_V1) return sizeof(struct lov_user_md_v1) + stripes * sizeof(struct lov_user_ost_data_v1); + return sizeof(struct lov_user_md_v3) + + stripes * sizeof(struct lov_user_ost_data_v1); } /* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to @@ -374,19 +411,26 @@ struct lov_user_mds_data_v3 { } __packed; #endif -/* keep this to be the same size as lov_user_ost_data_v1 */ struct lmv_user_mds_data { struct lu_fid lum_fid; __u32 lum_padding; __u32 lum_mds; }; -/* lum_type */ -enum { - LMV_STRIPE_TYPE = 0, - LMV_DEFAULT_TYPE = 1, +enum lmv_hash_type { + LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ + LMV_HASH_TYPE_ALL_CHARS = 1, + LMV_HASH_TYPE_FNV_1A_64 = 2, }; +#define LMV_HASH_NAME_ALL_CHARS "all_char" +#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" + +/* + * Got this according to how get LOV_MAX_STRIPE_COUNT, see above, + * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) + */ +#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ #define lmv_user_md lmv_user_md_v1 struct lmv_user_md_v1 { __u32 lum_magic; /* must be the first field */ @@ -397,9 +441,9 @@ struct lmv_user_md_v1 { __u32 lum_padding1; __u32 lum_padding2; __u32 lum_padding3; - char lum_pool_name[LOV_MAXPOOLNAME]; + char lum_pool_name[LOV_MAXPOOLNAME + 1]; struct lmv_user_mds_data lum_objects[0]; -}; +} __packed; static inline int lmv_user_md_size(int stripes, int lmm_magic) { @@ -407,6 +451,8 @@ static inline int lmv_user_md_size(int stripes, int lmm_magic) stripes * sizeof(struct lmv_user_mds_data); } +void lustre_swab_lmv_user_md(struct lmv_user_md *lum); + struct ll_recreate_obj { __u64 lrc_id; __u32 lrc_ost_idx; @@ -498,6 +544,12 @@ static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) /********* Quotas **********/ +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + /* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ #define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ #define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ @@ -648,11 +700,16 @@ static inline const char *changelog_type2str(int type) } /* per-record flags */ -#define CLF_VERSION 0x1000 -#define CLF_EXT_VERSION 0x2000 #define CLF_FLAGSHIFT 12 #define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) #define CLF_VERMASK (~CLF_FLAGMASK) +enum changelog_rec_flags { + CLF_VERSION = 0x1000, + CLF_RENAME = 0x2000, + CLF_JOBID = 0x4000, + CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID +}; + /* Anything under the flagmask may be per-type (if desired) */ /* Flags for unlink */ #define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ @@ -736,12 +793,35 @@ static inline void hsm_set_cl_error(int *flags, int error) *flags |= (error << CLF_HSM_ERR_L); } -#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + \ - sizeof(struct changelog_ext_rec)) +enum changelog_send_flag { + /* Not yet implemented */ + CHANGELOG_FLAG_FOLLOW = BIT(0), + /* + * Blocking IO makes sense in case of slow user parsing of the records, + * but it also prevents us from cleaning up if the records are not + * consumed. + */ + CHANGELOG_FLAG_BLOCK = BIT(1), + /* Pack jobid into the changelog records if available. */ + CHANGELOG_FLAG_JOBID = BIT(2), +}; + +#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ + changelog_rec_offset(CLF_SUPPORTED)) + +/* 31 usable bytes string + null terminator. */ +#define LUSTRE_JOBID_SIZE 32 +/* + * This is the minimal changelog record. It can contain extensions + * such as rename fields or process jobid. Its exact content is described + * by the cr_flags. + * + * Extensions are packed in the same order as their corresponding flags. + */ struct changelog_rec { __u16 cr_namelen; - __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */ + __u16 cr_flags; /**< \a changelog_rec_flags */ __u32 cr_type; /**< \a changelog_rec_type */ __u64 cr_index; /**< changelog record number */ __u64 cr_prev; /**< last index for this target fid */ @@ -751,55 +831,138 @@ struct changelog_rec { __u32 cr_markerflags; /**< CL_MARK flags */ }; struct lu_fid cr_pfid; /**< parent fid */ - char cr_name[0]; /**< last element */ } __packed; -/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save - * space, only rename uses changelog_ext_rec, while others use changelog_rec to - * store records. - */ -struct changelog_ext_rec { - __u16 cr_namelen; - __u16 cr_flags; /**< (flags & CLF_FLAGMASK) | - * CLF_EXT_VERSION - */ - __u32 cr_type; /**< \a changelog_rec_type */ - __u64 cr_index; /**< changelog record number */ - __u64 cr_prev; /**< last index for this target fid */ - __u64 cr_time; - union { - struct lu_fid cr_tfid; /**< target fid */ - __u32 cr_markerflags; /**< CL_MARK flags */ - }; - struct lu_fid cr_pfid; /**< target parent fid */ - struct lu_fid cr_sfid; /**< source fid, or zero */ - struct lu_fid cr_spfid; /**< source parent fid, or zero */ - char cr_name[0]; /**< last element */ -} __packed; +/* Changelog extension for RENAME. */ +struct changelog_ext_rename { + struct lu_fid cr_sfid; /**< source fid, or zero */ + struct lu_fid cr_spfid; /**< source parent fid, or zero */ +}; -#define CHANGELOG_REC_EXTENDED(rec) \ - (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION) +/* Changelog extension to include JOBID. */ +struct changelog_ext_jobid { + char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. */ +}; + +static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) +{ + size_t size = sizeof(struct changelog_rec); + + if (crf & CLF_RENAME) + size += sizeof(struct changelog_ext_rename); + + if (crf & CLF_JOBID) + size += sizeof(struct changelog_ext_jobid); -static inline int changelog_rec_size(struct changelog_rec *rec) + return size; +} + +static inline size_t changelog_rec_size(struct changelog_rec *rec) { - return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec) : - sizeof(*rec); + return changelog_rec_offset(rec->cr_flags); +} + +static inline size_t changelog_rec_varsize(struct changelog_rec *rec) +{ + return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; +} + +static inline +struct changelog_ext_rename *changelog_rec_rename(struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; + + return (struct changelog_ext_rename *)((char *)rec + + changelog_rec_offset(crf)); +} + +/* The jobid follows the rename extension, if present */ +static inline +struct changelog_ext_jobid *changelog_rec_jobid(struct changelog_rec *rec) +{ + enum changelog_rec_flags crf = rec->cr_flags & + (CLF_VERSION | CLF_RENAME); + + return (struct changelog_ext_jobid *)((char *)rec + + changelog_rec_offset(crf)); } +/* The name follows the rename and jobid extensions, if present */ static inline char *changelog_rec_name(struct changelog_rec *rec) { - return CHANGELOG_REC_EXTENDED(rec) ? - ((struct changelog_ext_rec *)rec)->cr_name : rec->cr_name; + return (char *)rec + changelog_rec_offset(rec->cr_flags & + CLF_SUPPORTED); } -static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec) +static inline size_t changelog_rec_snamelen(struct changelog_rec *rec) { - return rec->cr_namelen - strlen(rec->cr_name) - 1; + return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; } -static inline char *changelog_rec_sname(struct changelog_ext_rec *rec) +static inline char *changelog_rec_sname(struct changelog_rec *rec) { - return rec->cr_name + strlen(rec->cr_name) + 1; + char *cr_name = changelog_rec_name(rec); + + return cr_name + strlen(cr_name) + 1; +} + +/** + * Remap a record to the desired format as specified by the crf flags. + * The record must be big enough to contain the final remapped version. + * Superfluous extension fields are removed and missing ones are added + * and zeroed. The flags of the record are updated accordingly. + * + * The jobid and rename extensions can be added to a record, to match the + * format an application expects, typically. In this case, the newly added + * fields will be zeroed. + * The Jobid field can be removed, to guarantee compatibility with older + * clients that don't expect this field in the records they process. + * + * The following assumptions are being made: + * - CLF_RENAME will not be removed + * - CLF_JOBID will not be added without CLF_RENAME being added too + * + * @param[in,out] rec The record to remap. + * @param[in] crf_wanted Flags describing the desired extensions. + */ +static inline void changelog_remap_rec(struct changelog_rec *rec, + enum changelog_rec_flags crf_wanted) +{ + char *jid_mov, *rnm_mov; + + crf_wanted &= CLF_SUPPORTED; + + if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) + return; + + /* First move the variable-length name field */ + memmove((char *)rec + changelog_rec_offset(crf_wanted), + changelog_rec_name(rec), rec->cr_namelen); + + /* Locations of jobid and rename extensions in the remapped record */ + jid_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~CLF_JOBID); + rnm_mov = (char *)rec + + changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); + + /* Move the extension fields to the desired positions */ + if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) + memmove(jid_mov, changelog_rec_jobid(rec), + sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) + memmove(rnm_mov, changelog_rec_rename(rec), + sizeof(struct changelog_ext_rename)); + + /* Clear newly added fields */ + if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) + memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); + + if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) + memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); + + /* Update the record's flags accordingly */ + rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; } struct ioc_changelog { @@ -978,7 +1141,7 @@ struct hsm_user_request { /** Return pointer to data field in a hsm user request */ static inline void *hur_data(struct hsm_user_request *hur) { - return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); + return &hur->hur_user_item[hur->hur_request.hr_itemcount]; } /** diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h index 95a0be13c0fb..8eb394e64b25 100644 --- a/drivers/staging/lustre/lustre/include/lustre_cfg.h +++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h @@ -151,13 +151,11 @@ static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *nam lustre_cfg_bufs_set_string(bufs, 0, name); } -static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) { - int i; - int offset; - int bufcount; - - LASSERT(index >= 0); + __u32 i; + size_t offset; + __u32 bufcount; bufcount = lcfg->lcfg_bufcount; if (index >= bufcount) @@ -172,7 +170,7 @@ static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, struct lustre_cfg *lcfg) { - int i; + __u32 i; bufs->lcfg_bufcount = lcfg->lcfg_bufcount; for (i = 0; i < bufs->lcfg_bufcount; i++) { @@ -181,7 +179,7 @@ static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, } } -static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, __u32 index) { char *s; @@ -197,8 +195,8 @@ static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) * of data. Try to use the padding first though. */ if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { - int last = min((int)lcfg->lcfg_buflens[index], - cfs_size_round(lcfg->lcfg_buflens[index]) - 1); + size_t last = min((size_t)lcfg->lcfg_buflens[index], + cfs_size_round(lcfg->lcfg_buflens[index]) - 1); char lost = s[last]; s[last] = '\0'; @@ -210,10 +208,10 @@ static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) return s; } -static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens) +static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) { - int i; - int len; + __u32 i; + __u32 len; len = LCFG_HDR_SIZE(bufcount); for (i = 0; i < bufcount; i++) @@ -254,7 +252,7 @@ static inline void lustre_cfg_free(struct lustre_cfg *lcfg) return; } -static inline int lustre_cfg_sanity_check(void *buf, int len) +static inline int lustre_cfg_sanity_check(void *buf, size_t len) { struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/lustre_compat.h index 1eb64ec4bed4..567c438e93cb 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h +++ b/drivers/staging/lustre/lustre/include/lustre_compat.h @@ -30,8 +30,8 @@ * Lustre is a trademark of Sun Microsystems, Inc. */ -#ifndef _LINUX_COMPAT25_H -#define _LINUX_COMPAT25_H +#ifndef _LUSTRE_COMPAT_H +#define _LUSTRE_COMPAT_H #include <linux/fs_struct.h> #include <linux/namei.h> @@ -74,4 +74,4 @@ # define ext2_find_next_zero_bit find_next_zero_bit_le #endif -#endif /* _COMPAT25_H */ +#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h index 60051a5cfe20..d03534432624 100644 --- a/drivers/staging/lustre/lustre/include/lustre_dlm.h +++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h @@ -573,6 +573,11 @@ enum lvb_type { }; /** + * LDLM_GID_ANY is used to match any group id in ldlm_lock_match(). + */ +#define LDLM_GID_ANY ((__u64)-1) + +/** * LDLM lock structure * * Represents a single LDLM lock and its state in memory. Each lock is @@ -968,6 +973,7 @@ struct ldlm_enqueue_info { void *ei_cb_cp; /** lock completion callback */ void *ei_cb_gl; /** lock glimpse callback */ void *ei_cbdata; /** Data to be passed into callbacks. */ + unsigned int ei_enq_slave:1; /* whether enqueue slave stripes */ }; extern struct obd_ops ldlm_obd_ops; @@ -1281,16 +1287,6 @@ int ldlm_cli_cancel_list(struct list_head *head, int count, int intent_disposition(struct ldlm_reply *rep, int flag); void intent_set_disposition(struct ldlm_reply *rep, int flag); -/* ioctls for trying requests */ -#define IOC_LDLM_TYPE 'f' -#define IOC_LDLM_MIN_NR 40 - -#define IOC_LDLM_TEST _IOWR('f', 40, long) -#define IOC_LDLM_DUMP _IOWR('f', 41, long) -#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long) -#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) -#define IOC_LDLM_MAX_NR 43 - /** * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more * than one lock_res is dead-lock safe. diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h index e7e0c21a9b40..a0f064d237c9 100644 --- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h +++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h @@ -28,21 +28,6 @@ /** l_flags bits marked as "all_flags" bits */ #define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL -/** l_flags bits marked as "ast" bits */ -#define LDLM_FL_AST_MASK 0x0000000080008000ULL - -/** l_flags bits marked as "blocked" bits */ -#define LDLM_FL_BLOCKED_MASK 0x000000000000000EULL - -/** l_flags bits marked as "gone" bits */ -#define LDLM_FL_GONE_MASK 0x0006004000000000ULL - -/** l_flags bits marked as "inherit" bits */ -#define LDLM_FL_INHERIT_MASK 0x0000000000800000ULL - -/** l_flags bits marked as "off_wire" bits */ -#define LDLM_FL_OFF_WIRE_MASK 0x00FFFFFF00000000ULL - /** extent, mode, or resource changed */ #define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL /* bit 0 */ #define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG((_l), 1ULL << 0) @@ -372,6 +357,27 @@ #define ldlm_set_excl(_l) LDLM_SET_FLAG((_l), 1ULL << 55) #define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) +/** l_flags bits marked as "ast" bits */ +#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ + LDLM_FL_AST_DISCARD_DATA) + +/** l_flags bits marked as "blocked" bits */ +#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ + LDLM_FL_BLOCK_CONV |\ + LDLM_FL_BLOCK_WAIT) + +/** l_flags bits marked as "gone" bits */ +#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ + LDLM_FL_FAILED) + +/** l_flags bits marked as "inherit" bits */ +/* Flags inherited from wire on enqueue/reply between client/server. */ +/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ +/* TEST_LOCK flag to not let TEST lock to be granted. */ +#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ + LDLM_FL_NO_TIMEOUT |\ + LDLM_FL_TEST_LOCK) + /** test for ldlm_lock flag bit set */ #define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h index d1039e1ff70d..1e71a8638186 100644 --- a/drivers/staging/lustre/lustre/include/lustre_eacl.h +++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h @@ -46,6 +46,7 @@ #ifdef CONFIG_FS_POSIX_ACL +#include <linux/fs.h> #include <linux/posix_acl_xattr.h> typedef struct { diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h index 743671a547ef..316780693193 100644 --- a/drivers/staging/lustre/lustre/include/lustre_fid.h +++ b/drivers/staging/lustre/lustre/include/lustre_fid.h @@ -229,6 +229,7 @@ enum local_oid { MDD_LOV_OBJ_OSEQ = 4121UL, LFSCK_NAMESPACE_OID = 4122UL, REMOTE_PARENT_DIR_OID = 4123UL, + SLAVE_LLOG_CATALOGS_OID = 4124UL, }; static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) @@ -392,21 +393,19 @@ struct ldlm_namespace; * but was moved into name[1] along with the OID to avoid consuming the * renaming name[2,3] fields that need to be used for the quota identifier. */ -static inline struct ldlm_res_id * +static inline void fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) { memset(res, 0, sizeof(*res)); res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); - - return res; } /* * Return true if resource is for object identified by FID. */ -static inline int fid_res_name_eq(const struct lu_fid *fid, - const struct ldlm_res_id *res) +static inline bool fid_res_name_eq(const struct lu_fid *fid, + const struct ldlm_res_id *res) { return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); @@ -415,29 +414,25 @@ static inline int fid_res_name_eq(const struct lu_fid *fid, /* * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). */ -static inline struct lu_fid * +static inline void fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) { fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); LASSERT(fid_res_name_eq(fid, res)); - - return fid; } /* * Build (DLM) resource identifier from global quota FID and quota ID. */ -static inline struct ldlm_res_id * +static inline void fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, struct ldlm_res_id *res) { fid_build_reg_res_name(glb_fid, res); res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); - - return res; } /* @@ -454,14 +449,12 @@ static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); } -static inline struct ldlm_res_id * +static inline void fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, struct ldlm_res_id *res) { fid_build_reg_res_name(fid, res); res->name[LUSTRE_RES_ID_HSH_OFF] = hash; - - return res; } /** @@ -482,7 +475,7 @@ fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, * res will be built from normal FID directly, i.e. res[0] = f_seq, * res[1] = f_oid + f_ver. */ -static inline void ostid_build_res_name(struct ost_id *oi, +static inline void ostid_build_res_name(const struct ost_id *oi, struct ldlm_res_id *name) { memset(name, 0, sizeof(*name)); @@ -497,8 +490,8 @@ static inline void ostid_build_res_name(struct ost_id *oi, /** * Return true if the resource is for the object identified by this id & group. */ -static inline int ostid_res_name_eq(struct ost_id *oi, - struct ldlm_res_id *name) +static inline int ostid_res_name_eq(const struct ost_id *oi, + const struct ldlm_res_id *name) { /* Note: it is just a trick here to save some effort, probably the * correct way would be turn them into the FID and compare @@ -603,13 +596,14 @@ static inline __u32 fid_flatten32(const struct lu_fid *fid) * (from OID), or up to 128M inodes without collisions for new files. */ ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + - (seq >> (64 - (40-8)) & 0xffffff00) + + (seq >> (64 - (40 - 8)) & 0xffffff00) + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); return ino ? ino : fid_oid(fid); } -static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2) +static inline int lu_fid_diff(const struct lu_fid *fid1, + const struct lu_fid *fid2) { LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", PFID(fid1), PFID(fid2)); diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h index 1a63a6b9e116..e071bac9df57 100644 --- a/drivers/staging/lustre/lustre/include/lustre_handles.h +++ b/drivers/staging/lustre/lustre/include/lustre_handles.h @@ -66,6 +66,7 @@ struct portals_handle_ops { struct portals_handle { struct list_head h_link; __u64 h_cookie; + const void *h_owner; struct portals_handle_ops *h_ops; /* newly added fields to handle the RCU issue. -jxiong */ @@ -75,15 +76,13 @@ struct portals_handle { unsigned int h_in:1; }; -#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu) - /* handles.c */ /* Add a handle to the hash table */ void class_handle_hash(struct portals_handle *, struct portals_handle_ops *ops); void class_handle_unhash(struct portals_handle *); -void *class_handle2object(__u64 cookie); +void *class_handle2object(__u64 cookie, const void *owner); void class_handle_free_cb(struct rcu_head *rcu); int class_handle_init(void); void class_handle_cleanup(void); diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h index 4445be7a59dd..5461ba33d90c 100644 --- a/drivers/staging/lustre/lustre/include/lustre_import.h +++ b/drivers/staging/lustre/lustre/include/lustre_import.h @@ -285,8 +285,10 @@ struct obd_import { imp_resend_replay:1, /* disable normal recovery, for test only. */ imp_no_pinger_recover:1, +#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE /* need IR MNE swab */ imp_need_mne_swab:1, +#endif /* import must be reconnected instead of * chosing new connection */ @@ -305,28 +307,6 @@ struct obd_import { time64_t imp_last_reply_time; /* for health check */ }; -typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, - int event, void *event_arg, void *cb_data); - -/** - * Structure for import observer. - * It is possible to register "observer" on an import and every time - * something happens to an import (like connect/evict/disconnect) - * obderver will get its callback called with event type - */ -struct obd_import_observer { - struct list_head oio_chain; - obd_import_callback oio_cb; - void *oio_cb_data; -}; - -void class_observe_import(struct obd_import *imp, obd_import_callback cb, - void *cb_data); -void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, - void *cb_data); -void class_notify_import_observers(struct obd_import *imp, int event, - void *event_arg); - /* import.c */ static inline unsigned int at_est2timeout(unsigned int val) { diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h index 06958f217fc8..6b231913ba2e 100644 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -51,7 +51,6 @@ #include "lustre_cfg.h" /* target.c */ -struct kstatfs; struct ptlrpc_request; struct obd_export; struct lu_target; @@ -74,325 +73,8 @@ int do_set_info_async(struct obd_import *imp, u32 vallen, void *val, struct ptlrpc_request_set *set); -#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ -#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER - void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); -/* client.c */ - -int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); -struct client_obd *client_conn2cli(struct lustre_handle *conn); - -struct md_open_data; -struct obd_client_handle { - struct lustre_handle och_fh; - struct lu_fid och_fid; - struct md_open_data *och_mod; - struct lustre_handle och_lease_handle; /* open lock for lease */ - __u32 och_magic; - fmode_t och_flags; -}; - -#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed - -/* statfs_pack.c */ -void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); - -/* - * For md echo client - */ -enum md_echo_cmd { - ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ - ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ - ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ - ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ - ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ - ECHO_MD_GETATTR = 6, /* Getattr on MDT */ - ECHO_MD_SETATTR = 7, /* Setattr on MDT */ - ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ -}; - -/* - * OBD IOCTLS - */ -#define OBD_IOCTL_VERSION 0x00010004 - -struct obd_ioctl_data { - __u32 ioc_len; - __u32 ioc_version; - - union { - __u64 ioc_cookie; - __u64 ioc_u64_1; - }; - union { - __u32 ioc_conn1; - __u32 ioc_u32_1; - }; - union { - __u32 ioc_conn2; - __u32 ioc_u32_2; - }; - - struct obdo ioc_obdo1; - struct obdo ioc_obdo2; - - u64 ioc_count; - u64 ioc_offset; - __u32 ioc_dev; - __u32 ioc_command; - - __u64 ioc_nid; - __u32 ioc_nal; - __u32 ioc_type; - - /* buffers the kernel will treat as user pointers */ - __u32 ioc_plen1; - void __user *ioc_pbuf1; - __u32 ioc_plen2; - void __user *ioc_pbuf2; - - /* inline buffers for various arguments */ - __u32 ioc_inllen1; - char *ioc_inlbuf1; - __u32 ioc_inllen2; - char *ioc_inlbuf2; - __u32 ioc_inllen3; - char *ioc_inlbuf3; - __u32 ioc_inllen4; - char *ioc_inlbuf4; - - char ioc_bulk[0]; -}; - -struct obd_ioctl_hdr { - __u32 ioc_len; - __u32 ioc_version; -}; - -static inline int obd_ioctl_packlen(struct obd_ioctl_data *data) -{ - int len = cfs_size_round(sizeof(struct obd_ioctl_data)); - - len += cfs_size_round(data->ioc_inllen1); - len += cfs_size_round(data->ioc_inllen2); - len += cfs_size_round(data->ioc_inllen3); - len += cfs_size_round(data->ioc_inllen4); - return len; -} - -static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) -{ - if (data->ioc_len > OBD_MAX_IOCTL_BUFFER) { - CERROR("OBD ioctl: ioc_len larger than %d\n", - OBD_MAX_IOCTL_BUFFER); - return 1; - } - if (data->ioc_inllen1 > OBD_MAX_IOCTL_BUFFER) { - CERROR("OBD ioctl: ioc_inllen1 larger than ioc_len\n"); - return 1; - } - if (data->ioc_inllen2 > OBD_MAX_IOCTL_BUFFER) { - CERROR("OBD ioctl: ioc_inllen2 larger than ioc_len\n"); - return 1; - } - if (data->ioc_inllen3 > OBD_MAX_IOCTL_BUFFER) { - CERROR("OBD ioctl: ioc_inllen3 larger than ioc_len\n"); - return 1; - } - if (data->ioc_inllen4 > OBD_MAX_IOCTL_BUFFER) { - CERROR("OBD ioctl: ioc_inllen4 larger than ioc_len\n"); - return 1; - } - if (data->ioc_inlbuf1 && !data->ioc_inllen1) { - CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - if (data->ioc_inlbuf2 && !data->ioc_inllen2) { - CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - if (data->ioc_inlbuf3 && !data->ioc_inllen3) { - CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); - return 1; - } - if (data->ioc_inlbuf4 && !data->ioc_inllen4) { - CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); - return 1; - } - if (data->ioc_pbuf1 && !data->ioc_plen1) { - CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - if (data->ioc_pbuf2 && !data->ioc_plen2) { - CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - if (data->ioc_plen1 && !data->ioc_pbuf1) { - CERROR("OBD ioctl: plen1 set but NULL pointer\n"); - return 1; - } - if (data->ioc_plen2 && !data->ioc_pbuf2) { - CERROR("OBD ioctl: plen2 set but NULL pointer\n"); - return 1; - } - if (obd_ioctl_packlen(data) > data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", - obd_ioctl_packlen(data), data->ioc_len); - return 1; - } - return 0; -} - -#include "obd_support.h" - -/* function defined in lustre/obdclass/<platform>/<platform>-module.c */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg); -int obd_ioctl_popdata(void __user *arg, void *data, int len); - -static inline void obd_ioctl_freedata(char *buf, int len) -{ - kvfree(buf); - return; -} - -/* - * BSD ioctl description: - * #define IOC_V1 _IOR(g, n1, long) - * #define IOC_V2 _IOW(g, n2, long) - * - * ioctl(f, IOC_V1, arg); - * arg will be treated as a long value, - * - * ioctl(f, IOC_V2, arg) - * arg will be treated as a pointer, bsd will call - * copyin(buf, arg, sizeof(long)) - * - * To make BSD ioctl handles argument correctly and simplely, - * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data - * for us. Does this change affect Linux? (XXX Liang) - */ -#define OBD_IOC_DATA_TYPE long - -#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) -#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE) -#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) -#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) -#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) -#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) -#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) -#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) -#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) -#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) -#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) -#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) -#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) -#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME - -#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) -#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PING_TARGET _IOW('f', 136, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139) -#define OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) -#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) -#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) - -#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE) -#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE) -/* see also <lustre/lustre_user.h> for ioctls 151-153 */ -/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */ -#define OBD_IOC_LOV_SETSTRIPE _IOW('f', 154, OBD_IOC_DATA_TYPE) -/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */ -#define OBD_IOC_LOV_GETSTRIPE _IOW('f', 155, OBD_IOC_DATA_TYPE) -/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */ -#define OBD_IOC_LOV_SETEA _IOW('f', 156, OBD_IOC_DATA_TYPE) -/* see <lustre/lustre_user.h> for ioctls 157-159 */ -/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */ -#define OBD_IOC_QUOTACHECK _IOW('f', 160, int) -/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */ -#define OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) -/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */ -#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) -/* see also <lustre/lustre_user.h> for ioctls 163-176 */ -#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) -#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) -#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) -#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) -#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) -#define OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) -#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) -#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PARAM _IOW('f', 187, OBD_IOC_DATA_TYPE) -#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) -#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) -/* OBD_IOC_LLOG_CATINFO is deprecated */ -#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) - -/* #define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ -/* #define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ -/* #define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ -/* #define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ - -#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) - -/* <lustre/lustre_user.h> defines ioctl number 218-219 */ -#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) - -#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) -#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) - -#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) -#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) - -/* XXX _IOWR('f', 250, long) has been defined in - * libcfs/include/libcfs/libcfs_private.h for debug, don't use it - */ - -/* Until such time as we get_info the per-stripe maximum from the OST, - * we define this to be 2T - 4k, which is the ext3 maxbytes. - */ -#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL - -/* Special values for remove LOV EA from disk */ -#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \ - offset == (typeof(offset))(-1)) - -/* #define POISON_BULK 0 */ - /* * l_wait_event is a flexible sleeping function, permitting simple caller * configuration of interrupt and timeout sensitivity along with actions to diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h new file mode 100644 index 000000000000..249e8bf4fa22 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h @@ -0,0 +1,79 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2014, Intel Corporation. + * Use is subject to license terms. + * + * Author: di wang <di.wang@intel.com> + */ + +#define DEFAULT_LINKEA_SIZE 4096 + +struct linkea_data { + /** + * Buffer to keep link EA body. + */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its length in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +static inline void linkea_first_entry(struct linkea_data *ldata) +{ + LASSERT(ldata); + LASSERT(ldata->ld_leh); + + if (ldata->ld_leh->leh_reccount == 0) + ldata->ld_lee = NULL; + else + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); +} + +static inline void linkea_next_entry(struct linkea_data *ldata) +{ + LASSERT(ldata); + LASSERT(ldata->ld_leh); + + if (ldata->ld_lee) { + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; + } +} diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h deleted file mode 100644 index b16897702559..000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_lite.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LL_H -#define _LL_H - -/** \defgroup lite lite - * - * @{ - */ - -#include "linux/lustre_lite.h" - -#include "obd_class.h" -#include "lustre_net.h" -#include "lustre_mds.h" -#include "lustre_ha.h" - -/* 4UL * 1024 * 1024 */ -#define LL_MAX_BLKSIZE_BITS (22) -#define LL_MAX_BLKSIZE (1UL<<LL_MAX_BLKSIZE_BITS) - -/* - * This is embedded into llite super-blocks to keep track of - * connect flags (capabilities) supported by all imports given mount is - * connected to. - */ -struct lustre_client_ocd { - /* - * This is conjunction of connect_flags across all imports (LOVs) this - * mount is connected to. This field is updated by cl_ocd_update() - * under ->lco_lock. - */ - __u64 lco_flags; - struct mutex lco_lock; - struct obd_export *lco_md_exp; - struct obd_export *lco_dt_exp; -}; - -/* - * Chain of hash overflow pages. - */ -struct ll_dir_chain { - /* XXX something. Later */ -}; - -static inline void ll_dir_chain_init(struct ll_dir_chain *chain) -{ -} - -static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) -{ -} - -static inline unsigned long hash_x_index(__u64 hash, int hash64) -{ - if (BITS_PER_LONG == 32 && hash64) - hash >>= 32; - /* save hash 0 as index 0 because otherwise we'll save it at - * page index end (~0UL) and it causes truncate_inode_pages_range() - * to loop forever. - */ - return ~0UL - (hash + !hash); -} - -/** @} lite */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_lmv.h b/drivers/staging/lustre/lustre/include/lustre_lmv.h new file mode 100644 index 000000000000..d7f7afa8dfa7 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_lmv.h @@ -0,0 +1,184 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + */ +/* + * lustre/include/lustre_lmv.h + * + * Lustre LMV structures and functions. + * + * Author: Di Wang <di.wang@intel.com> + */ + +#ifndef _LUSTRE_LMV_H +#define _LUSTRE_LMV_H +#include "lustre/lustre_idl.h" + +struct lmv_oinfo { + struct lu_fid lmo_fid; + u32 lmo_mds; + struct inode *lmo_root; +}; + +struct lmv_stripe_md { + __u32 lsm_md_magic; + __u32 lsm_md_stripe_count; + __u32 lsm_md_master_mdt_index; + __u32 lsm_md_hash_type; + __u32 lsm_md_layout_version; + __u32 lsm_md_default_count; + __u32 lsm_md_default_index; + char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; + struct lmv_oinfo lsm_md_oinfo[0]; +}; + +static inline bool +lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) +{ + __u32 idx; + + if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || + lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || + lsm1->lsm_md_master_mdt_index != lsm2->lsm_md_master_mdt_index || + lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || + lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || + !strcmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name)) + return false; + + for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { + if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, + &lsm2->lsm_md_oinfo[idx].lmo_fid)) + return false; + } + + return true; +} + +union lmv_mds_md; + +int lmv_unpack_md(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, int stripe_count); + +static inline int lmv_alloc_memmd(struct lmv_stripe_md **lsmp, int stripe_count) +{ + return lmv_unpack_md(NULL, lsmp, NULL, stripe_count); +} + +static inline void lmv_free_memmd(struct lmv_stripe_md *lsm) +{ + lmv_unpack_md(NULL, &lsm, NULL, 0); +} + +static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, + const struct lmv_mds_md_v1 *lmv_src) +{ + __u32 i; + + lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); + lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); + lmv_dst->lmv_master_mdt_index = + le32_to_cpu(lmv_src->lmv_master_mdt_index); + lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); + lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); + + for (i = 0; i < lmv_src->lmv_stripe_count; i++) + fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], + &lmv_src->lmv_stripe_fids[i]); +} + +static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, + const union lmv_mds_md *lmv_src) +{ + switch (le32_to_cpu(lmv_src->lmv_magic)) { + case LMV_MAGIC_V1: + lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); + break; + default: + break; + } +} + +/* This hash is only for testing purpose */ +static inline unsigned int +lmv_hash_all_chars(unsigned int count, const char *name, int namelen) +{ + const unsigned char *p = (const unsigned char *)name; + unsigned int c = 0; + + while (--namelen >= 0) + c += p[namelen]; + + c = c % count; + + return c; +} + +static inline unsigned int +lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) +{ + __u64 hash; + + hash = lustre_hash_fnv_1a_64(name, namelen); + + return do_div(hash, count); +} + +static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, + unsigned int stripe_count, + const char *name, int namelen) +{ + __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; + int idx; + + LASSERT(namelen > 0); + if (stripe_count <= 1) + return 0; + + /* for migrating object, always start from 0 stripe */ + if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) + return 0; + + switch (hash_type) { + case LMV_HASH_TYPE_ALL_CHARS: + idx = lmv_hash_all_chars(stripe_count, name, namelen); + break; + case LMV_HASH_TYPE_FNV_1A_64: + idx = lmv_hash_fnv1a(stripe_count, name, namelen); + break; + default: + idx = -EBADFD; + break; + } + CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, + hash_type, idx); + + return idx; +} + +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h index b96e02317bfc..995b266932e3 100644 --- a/drivers/staging/lustre/lustre/include/lustre_log.h +++ b/drivers/staging/lustre/lustre/include/lustre_log.h @@ -277,12 +277,11 @@ static inline void llog_ctxt_put(struct llog_ctxt *ctxt) __llog_ctxt_put(NULL, ctxt); } -static inline void llog_group_init(struct obd_llog_group *olg, int group) +static inline void llog_group_init(struct obd_llog_group *olg) { init_waitqueue_head(&olg->olg_waitq); spin_lock_init(&olg->olg_lock); mutex_init(&olg->olg_cat_processing); - olg->olg_seq = group; } static inline int llog_group_set_ctxt(struct obd_llog_group *olg, diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h index fa62b95d351f..8fc2d3f2dfd6 100644 --- a/drivers/staging/lustre/lustre/include/lustre_mdc.h +++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h @@ -96,7 +96,7 @@ static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, struct lookup_intent *it) { if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT)) + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) return; /* This would normally block until the existing request finishes. @@ -136,7 +136,7 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, struct lookup_intent *it) { if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT)) + it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) return; if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ @@ -156,34 +156,44 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, mutex_unlock(&lck->rpcl_mutex); } -/* Update the maximum observed easize and cookiesize. The default easize - * and cookiesize is initialized to the minimum value but allowed to grow - * up to a single page in size if required to handle the common case. +/** + * Update the maximum possible easize and cookiesize. + * + * The values are learned from ptlrpc replies sent by the MDT. The + * default easize and cookiesize is initialized to the minimum value but + * allowed to grow up to a single page in size if required to handle the + * common case. + * + * \see client_obd::cl_default_mds_easize and + * client_obd::cl_default_mds_cookiesize + * + * \param[in] exp export for MDC device + * \param[in] body body of ptlrpc reply from MDT + * */ static inline void mdc_update_max_ea_from_body(struct obd_export *exp, struct mdt_body *body) { - if (body->valid & OBD_MD_FLMODEASIZE) { + if (body->mbo_valid & OBD_MD_FLMODEASIZE) { struct client_obd *cli = &exp->exp_obd->u.cli; + u32 def_cookiesize, def_easize; - if (cli->cl_max_mds_easize < body->max_mdsize) { - cli->cl_max_mds_easize = body->max_mdsize; - cli->cl_default_mds_easize = - min_t(__u32, body->max_mdsize, PAGE_SIZE); - } - if (cli->cl_max_mds_cookiesize < body->max_cookiesize) { - cli->cl_max_mds_cookiesize = body->max_cookiesize; - cli->cl_default_mds_cookiesize = - min_t(__u32, body->max_cookiesize, PAGE_SIZE); - } + if (cli->cl_max_mds_easize < body->mbo_max_mdsize) + cli->cl_max_mds_easize = body->mbo_max_mdsize; + + def_easize = min_t(__u32, body->mbo_max_mdsize, + OBD_MAX_DEFAULT_EA_SIZE); + cli->cl_default_mds_easize = def_easize; + + if (cli->cl_max_mds_cookiesize < body->mbo_max_cookiesize) + cli->cl_max_mds_cookiesize = body->mbo_max_cookiesize; + + def_cookiesize = min_t(__u32, body->mbo_max_cookiesize, + OBD_MAX_DEFAULT_COOKIE_SIZE); + cli->cl_default_mds_cookiesize = def_cookiesize; } } -struct mdc_cache_waiter { - struct list_head mcw_entry; - wait_queue_head_t mcw_waitq; -}; - /* mdc/mdc_locks.c */ int it_open_error(int phase, struct lookup_intent *it); diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h index 4104bd9bd5c4..23a7e4f78e9a 100644 --- a/drivers/staging/lustre/lustre/include/lustre_mds.h +++ b/drivers/staging/lustre/lustre/include/lustre_mds.h @@ -58,9 +58,6 @@ struct mds_group_info { #define MDD_OBD_NAME "mdd_obd" #define MDD_OBD_UUID "mdd_obd_uuid" -/* these are local flags, used only on the client, private */ -#define M_CHECK_STALE 0200000000 - /** @} mds */ #endif diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h index d5debd615fdf..e9aba99ee52a 100644 --- a/drivers/staging/lustre/lustre/include/lustre_net.h +++ b/drivers/staging/lustre/lustre/include/lustre_net.h @@ -261,7 +261,10 @@ #define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ -#define OST_MAXREQSIZE (5 * 1024) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (16 * 1024) /* Macro to hide a typecast. */ #define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) @@ -570,13 +573,13 @@ struct ptlrpc_nrs_pol_ops { * * \param[in,out] policy The policy being initialized */ - int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + int (*op_policy_init)(struct ptlrpc_nrs_policy *policy); /** * Called during policy unregistration; this operation is optional. * * \param[in,out] policy The policy being unregistered/finalized */ - void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + void (*op_policy_fini)(struct ptlrpc_nrs_policy *policy); /** * Called when activating a policy via lprocfs; policies allocate and * initialize their resources here; this operation is optional. @@ -585,7 +588,7 @@ struct ptlrpc_nrs_pol_ops { * * \see nrs_policy_start_locked() */ - int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + int (*op_policy_start)(struct ptlrpc_nrs_policy *policy); /** * Called when deactivating a policy via lprocfs; policies deallocate * their resources here; this operation is optional @@ -594,7 +597,7 @@ struct ptlrpc_nrs_pol_ops { * * \see nrs_policy_stop0() */ - void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + void (*op_policy_stop)(struct ptlrpc_nrs_policy *policy); /** * Used for policy-specific operations; i.e. not generic ones like * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous @@ -610,8 +613,8 @@ struct ptlrpc_nrs_pol_ops { * * \see ptlrpc_nrs_policy_control() */ - int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, - enum ptlrpc_nrs_ctl opc, void *arg); + int (*op_policy_ctl)(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); /** * Called when obtaining references to the resources of the resource @@ -648,11 +651,11 @@ struct ptlrpc_nrs_pol_ops { * \see ptlrpc_nrs_req_initialize() * \see ptlrpc_nrs_hpreq_add_nolock() */ - int (*op_res_get) (struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq, - const struct ptlrpc_nrs_resource *parent, - struct ptlrpc_nrs_resource **resp, - bool moving_req); + int (*op_res_get)(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); /** * Called when releasing references taken for resources in the resource * hierarchy for the request; this operation is optional. @@ -663,8 +666,8 @@ struct ptlrpc_nrs_pol_ops { * \see ptlrpc_nrs_req_finalize() * \see ptlrpc_nrs_hpreq_add_nolock() */ - void (*op_res_put) (struct ptlrpc_nrs_policy *policy, - const struct ptlrpc_nrs_resource *res); + void (*op_res_put)(struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); /** * Obtains a request for handling from the policy, and optionally @@ -683,8 +686,8 @@ struct ptlrpc_nrs_pol_ops { * \see ptlrpc_nrs_req_get_nolock() */ struct ptlrpc_nrs_request * - (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, - bool force); + (*op_req_get)(struct ptlrpc_nrs_policy *policy, bool peek, + bool force); /** * Called when attempting to add a request to a policy for later * handling; this operation is mandatory. @@ -697,8 +700,8 @@ struct ptlrpc_nrs_pol_ops { * * \see ptlrpc_nrs_req_add_nolock() */ - int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq); + int (*op_req_enqueue)(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); /** * Removes a request from the policy's set of pending requests. Normally * called after a request has been polled successfully from the policy @@ -707,8 +710,8 @@ struct ptlrpc_nrs_pol_ops { * \param[in,out] policy The policy the request \a nrq belongs to * \param[in,out] nrq The request to dequeue */ - void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq); + void (*op_req_dequeue)(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); /** * Called after the request being carried out. Could be used for * job/resource control; this operation is optional. @@ -721,8 +724,8 @@ struct ptlrpc_nrs_pol_ops { * * \see ptlrpc_nrs_req_stop_nolock() */ - void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq); + void (*op_req_stop)(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); /** * Registers the policy's lprocfs interface with a PTLRPC service. * @@ -731,7 +734,7 @@ struct ptlrpc_nrs_pol_ops { * \retval 0 success * \retval != 0 error */ - int (*op_lprocfs_init) (struct ptlrpc_service *svc); + int (*op_lprocfs_init)(struct ptlrpc_service *svc); /** * Unegisters the policy's lprocfs interface with a PTLRPC service. * @@ -743,7 +746,7 @@ struct ptlrpc_nrs_pol_ops { * * \param[in] svc The service */ - void (*op_lprocfs_fini) (struct ptlrpc_service *svc); + void (*op_lprocfs_fini)(struct ptlrpc_service *svc); }; /** @@ -1628,7 +1631,7 @@ static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) /** * Returns 1 if request buffer at offset \a index was already swabbed */ -static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) +static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) { LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); return req->rq_req_swab_mask & (1 << index); @@ -1637,7 +1640,7 @@ static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) /** * Returns 1 if request reply buffer at offset \a index was already swabbed */ -static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) { LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); return req->rq_rep_swab_mask & (1 << index); @@ -1662,7 +1665,8 @@ static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) /** * Mark request buffer at offset \a index that it was already swabbed */ -static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, + size_t index) { LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); @@ -1672,7 +1676,8 @@ static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) /** * Mark request reply buffer at offset \a index that it was already swabbed */ -static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, + size_t index) { LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); @@ -2403,7 +2408,6 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); int ptlrpc_reply(struct ptlrpc_request *req); int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); int ptlrpc_error(struct ptlrpc_request *req); -void ptlrpc_resend_req(struct ptlrpc_request *request); int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); int ptl_send_rpc(struct ptlrpc_request *request, int noreply); int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); @@ -2423,23 +2427,17 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); int ptlrpc_queue_wait(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request *req); -int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); void ptlrpc_abort_inflight(struct obd_import *imp); void ptlrpc_abort_set(struct ptlrpc_request_set *set); struct ptlrpc_request_set *ptlrpc_prep_set(void); struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, void *arg); -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); int ptlrpc_set_wait(struct ptlrpc_request_set *); -int ptlrpc_expired_set(void *data); -void ptlrpc_interrupted_set(void *data); void ptlrpc_mark_interrupted(struct ptlrpc_request *req); void ptlrpc_set_destroy(struct ptlrpc_request_set *); void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); -void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, - struct ptlrpc_request *req); void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); @@ -2611,9 +2609,9 @@ int ptlrpc_reconnect_import(struct obd_import *imp); * @{ */ int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - int index); + u32 index); void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, - int index); + u32 index); int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); @@ -2632,27 +2630,27 @@ int lustre_shrink_msg(struct lustre_msg *msg, int segment, unsigned int newlen, int move_data); void lustre_free_reply_state(struct ptlrpc_reply_state *rs); int __lustre_unpack_msg(struct lustre_msg *m, int len); -int lustre_msg_hdr_size(__u32 magic, int count); -int lustre_msg_size(__u32 magic, int count, __u32 *lengths); -int lustre_msg_size_v2(int count, __u32 *lengths); -int lustre_packed_msg_size(struct lustre_msg *msg); -int lustre_msg_early_size(void); -void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); -void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); -int lustre_msg_buflen(struct lustre_msg *m, int n); -int lustre_msg_bufcount(struct lustre_msg *m); -char *lustre_msg_string(struct lustre_msg *m, int n, int max_len); +u32 lustre_msg_hdr_size(__u32 magic, u32 count); +u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); +u32 lustre_msg_size_v2(int count, __u32 *lengths); +u32 lustre_packed_msg_size(struct lustre_msg *msg); +u32 lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, u32 n, u32 min_size); +void *lustre_msg_buf(struct lustre_msg *m, u32 n, u32 minlen); +u32 lustre_msg_buflen(struct lustre_msg *m, u32 n); +u32 lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, u32 n, u32 max_len); __u32 lustre_msghdr_get_flags(struct lustre_msg *msg); void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); __u32 lustre_msg_get_flags(struct lustre_msg *msg); -void lustre_msg_add_flags(struct lustre_msg *msg, int flags); -void lustre_msg_set_flags(struct lustre_msg *msg, int flags); -void lustre_msg_clear_flags(struct lustre_msg *msg, int flags); +void lustre_msg_add_flags(struct lustre_msg *msg, u32 flags); +void lustre_msg_set_flags(struct lustre_msg *msg, u32 flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, u32 flags); __u32 lustre_msg_get_op_flags(struct lustre_msg *msg); -void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags); +void lustre_msg_add_op_flags(struct lustre_msg *msg, u32 flags); struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); __u32 lustre_msg_get_type(struct lustre_msg *msg); -void lustre_msg_add_version(struct lustre_msg *msg, int version); +void lustre_msg_add_version(struct lustre_msg *msg, u32 version); __u32 lustre_msg_get_opc(struct lustre_msg *msg); __u64 lustre_msg_get_last_committed(struct lustre_msg *msg); __u64 *lustre_msg_get_versions(struct lustre_msg *msg); diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h index 82aadd32c2b8..8061a04ee806 100644 --- a/drivers/staging/lustre/lustre/include/lustre_param.h +++ b/drivers/staging/lustre/lustre/include/lustre_param.h @@ -39,6 +39,9 @@ #ifndef _LUSTRE_PARAM_H #define _LUSTRE_PARAM_H +#include "../../include/linux/libcfs/libcfs.h" +#include "../../include/linux/lnet/types.h" + /** \defgroup param param * * @{ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/lustre_patchless_compat.h index 5842cb18b49e..5842cb18b49e 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h +++ b/drivers/staging/lustre/lustre/include/lustre_patchless_compat.h diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h index 544a43c862b9..a13558e53274 100644 --- a/drivers/staging/lustre/lustre/include/lustre_req_layout.h +++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h @@ -76,7 +76,8 @@ void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, void req_capsule_fini(struct req_capsule *pill); void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); -int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc); +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc); int req_capsule_server_pack(struct req_capsule *pill); void *req_capsule_client_get(struct req_capsule *pill, @@ -86,27 +87,27 @@ void *req_capsule_client_swab_get(struct req_capsule *pill, void *swabber); void *req_capsule_client_sized_get(struct req_capsule *pill, const struct req_msg_field *field, - int len); + u32 len); void *req_capsule_server_get(struct req_capsule *pill, const struct req_msg_field *field); void *req_capsule_server_sized_get(struct req_capsule *pill, const struct req_msg_field *field, - int len); + u32 len); void *req_capsule_server_swab_get(struct req_capsule *pill, const struct req_msg_field *field, void *swabber); void *req_capsule_server_sized_swab_get(struct req_capsule *pill, const struct req_msg_field *field, - int len, void *swabber); + u32 len, void *swabber); void req_capsule_set_size(struct req_capsule *pill, const struct req_msg_field *field, - enum req_location loc, int size); -int req_capsule_get_size(const struct req_capsule *pill, + enum req_location loc, u32 size); +u32 req_capsule_get_size(const struct req_capsule *pill, const struct req_msg_field *field, enum req_location loc); -int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); -int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, +u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, enum req_location loc); void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); @@ -115,8 +116,7 @@ int req_capsule_has_field(const struct req_capsule *pill, enum req_location loc); void req_capsule_shrink(struct req_capsule *pill, const struct req_msg_field *field, - unsigned int newlen, - enum req_location loc); + u32 newlen, enum req_location loc); int req_layout_init(void); void req_layout_fini(void); @@ -149,14 +149,11 @@ extern struct req_format RQF_MDS_GETATTR; extern struct req_format RQF_MDS_GETATTR_NAME; extern struct req_format RQF_MDS_CLOSE; extern struct req_format RQF_MDS_RELEASE_CLOSE; -extern struct req_format RQF_MDS_PIN; -extern struct req_format RQF_MDS_UNPIN; extern struct req_format RQF_MDS_CONNECT; extern struct req_format RQF_MDS_DISCONNECT; extern struct req_format RQF_MDS_GET_INFO; extern struct req_format RQF_MDS_READPAGE; extern struct req_format RQF_MDS_WRITEPAGE; -extern struct req_format RQF_MDS_IS_SUBDIR; extern struct req_format RQF_MDS_DONE_WRITING; extern struct req_format RQF_MDS_REINT; extern struct req_format RQF_MDS_REINT_CREATE; diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h index 64559a16f4de..19c9135e2273 100644 --- a/drivers/staging/lustre/lustre/include/lustre_ver.h +++ b/drivers/staging/lustre/lustre/include/lustre_ver.h @@ -2,14 +2,21 @@ #define _LUSTRE_VER_H_ #define LUSTRE_MAJOR 2 -#define LUSTRE_MINOR 4 -#define LUSTRE_PATCH 60 +#define LUSTRE_MINOR 6 +#define LUSTRE_PATCH 99 #define LUSTRE_FIX 0 -#define LUSTRE_VERSION_STRING "2.4.60" +#define LUSTRE_VERSION_STRING "2.6.99" -#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR, \ - LUSTRE_MINOR, LUSTRE_PATCH, \ - LUSTRE_FIX) +#define OBD_OCD_VERSION(major, minor, patch, fix) \ + (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) + +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) +#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) + +#define LUSTRE_VERSION_CODE \ + OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) /* * If lustre version of client and servers it connects to differs by more diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h index a1bc2c478ff9..f6fc4dd05bd6 100644 --- a/drivers/staging/lustre/lustre/include/obd.h +++ b/drivers/staging/lustre/lustre/include/obd.h @@ -35,21 +35,13 @@ #include <linux/spinlock.h> -#define IOC_OSC_TYPE 'h' -#define IOC_OSC_MIN_NR 20 -#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) -#define IOC_OSC_MAX_NR 50 - -#define IOC_MDC_TYPE 'i' -#define IOC_MDC_MIN_NR 20 -#define IOC_MDC_MAX_NR 50 - #include "lustre/lustre_idl.h" #include "lustre_lib.h" #include "lu_ref.h" #include "lustre_export.h" #include "lustre_fid.h" #include "lustre_fld.h" +#include "lustre_handles.h" #include "lustre_intent.h" #define MAX_OBD_DEVICES 8192 @@ -81,6 +73,13 @@ static inline void loi_init(struct lov_oinfo *loi) { } +/* + * If we are unable to get the maximum object size from the OST in + * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using + * the old maximum object size from ext3. + */ +#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL + struct lov_stripe_md { atomic_t lsm_refc; spinlock_t lsm_lock; @@ -89,31 +88,17 @@ struct lov_stripe_md { /* maximum possible file size, might change as OSTs status changes, * e.g. disconnected, deactivated */ - __u64 lsm_maxbytes; - struct { - /* Public members. */ - struct ost_id lw_object_oi; /* lov object id/seq */ - - /* LOV-private members start here -- only for use in lov/. */ - __u32 lw_magic; - __u32 lw_stripe_size; /* size of the stripe */ - __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ - __u16 lw_stripe_count; /* number of objects being striped over */ - __u16 lw_layout_gen; /* generation of the layout */ - char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ - } lsm_wire; - + __u64 lsm_maxbytes; + struct ost_id lsm_oi; + __u32 lsm_magic; + __u32 lsm_stripe_size; + __u32 lsm_pattern; /* striping pattern (RAID0, RAID1) */ + __u16 lsm_stripe_count; + __u16 lsm_layout_gen; + char lsm_pool_name[LOV_MAXPOOLNAME + 1]; struct lov_oinfo *lsm_oinfo[0]; }; -#define lsm_oi lsm_wire.lw_object_oi -#define lsm_magic lsm_wire.lw_magic -#define lsm_layout_gen lsm_wire.lw_layout_gen -#define lsm_stripe_size lsm_wire.lw_stripe_size -#define lsm_pattern lsm_wire.lw_pattern -#define lsm_stripe_count lsm_wire.lw_stripe_count -#define lsm_pool_name lsm_wire.lw_pool_name - static inline bool lsm_is_released(struct lov_stripe_md *lsm) { return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED); @@ -177,31 +162,10 @@ struct obd_type { struct brw_page { u64 off; struct page *pg; - int count; + unsigned int count; u32 flag; }; -/* llog contexts */ -enum llog_ctxt_id { - LLOG_CONFIG_ORIG_CTXT = 0, - LLOG_CONFIG_REPL_CTXT, - LLOG_MDS_OST_ORIG_CTXT, - LLOG_MDS_OST_REPL_CTXT, - LLOG_SIZE_ORIG_CTXT, - LLOG_SIZE_REPL_CTXT, - LLOG_RD1_ORIG_CTXT, - LLOG_RD1_REPL_CTXT, - LLOG_TEST_ORIG_CTXT, - LLOG_TEST_REPL_CTXT, - LLOG_LOVEA_ORIG_CTXT, - LLOG_LOVEA_REPL_CTXT, - LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ - LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ - LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ - LLOG_AGENT_ORIG_CTXT, /**< agent requests generation on cdt */ - LLOG_MAX_CTXTS -}; - struct timeout_item { enum timeout_event ti_event; unsigned long ti_timeout; @@ -211,11 +175,12 @@ struct timeout_item { struct list_head ti_chain; }; -#define OSC_MAX_RIF_DEFAULT 8 -#define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) -#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ -#define OSC_DEFAULT_RESENDS 10 +#define OBD_MAX_RIF_DEFAULT 8 +#define OBD_MAX_RIF_MAX 512 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 /* possible values for fo_sync_lock_cancel */ enum { @@ -225,40 +190,74 @@ enum { NUM_SYNC_ON_CANCEL_STATES }; -#define MDC_MAX_RIF_DEFAULT 8 -#define MDC_MAX_RIF_MAX 512 - enum obd_cl_sem_lock_class { OBD_CLI_SEM_NORMAL, OBD_CLI_SEM_MGC, OBD_CLI_SEM_MDCOSC, }; +/* + * Limit reply buffer size for striping data to one x86_64 page. This + * value is chosen to fit the striping data for common use cases while + * staying well below the limit at which the buffer must be backed by + * vmalloc(). Excessive use of vmalloc() may cause spinlock contention + * on the MDS. + */ +#define OBD_MAX_DEFAULT_EA_SIZE 4096 +#define OBD_MAX_DEFAULT_COOKIE_SIZE 4096 + struct mdc_rpc_lock; struct obd_import; struct client_obd { struct rw_semaphore cl_sem; struct obd_uuid cl_target_uuid; struct obd_import *cl_import; /* ptlrpc connection state */ - int cl_conn_count; - /* max_mds_easize is purely a performance thing so we don't have to - * call obd_size_diskmd() all the time. + size_t cl_conn_count; + /* + * Cache maximum and default values for easize and cookiesize. This is + * strictly a performance optimization to minimize calls to + * obd_size_diskmd(). The default values are used to calculate the + * initial size of a request buffer. The ptlrpc layer will resize the + * buffer as needed to accommodate a larger reply from the + * server. The default values should be small enough to avoid wasted + * memory and excessive use of vmalloc(), yet large enough to avoid + * reallocating the buffer in the common use case. */ - int cl_default_mds_easize; - int cl_max_mds_easize; - int cl_default_mds_cookiesize; - int cl_max_mds_cookiesize; + /* + * Default EA size for striping attributes. It is initialized at + * mount-time based on the default stripe width of the filesystem, + * then it tracks the largest observed EA size advertised by + * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. + */ + u32 cl_default_mds_easize; + /* Maximum possible EA size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. + */ + u32 cl_max_mds_easize; + /* Default cookie size for llog cookies (see struct llog_cookie). It is + * initialized to zero at mount-time, then it tracks the largest + * observed cookie size advertised by the MDT, up to a maximum value of + * OBD_MAX_DEFAULT_COOKIE_SIZE. Note that llog_cookies are not + * used by clients communicating with MDS versions 2.4.0 and later. + */ + u32 cl_default_mds_cookiesize; + /* Maximum possible cookie size computed at mount-time based on + * the number of OSTs in the filesystem. May be increased at + * run-time if a larger observed size is advertised by the MDT. + */ + u32 cl_max_mds_cookiesize; enum lustre_sec_part cl_sp_me; enum lustre_sec_part cl_sp_to; struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ /* the grant values are protected by loi_list_lock below */ - long cl_dirty; /* all _dirty_ in bytes */ - long cl_dirty_max; /* allowed w/o rpc */ - long cl_dirty_transit; /* dirty synchronous */ - long cl_avail_grant; /* bytes of credit for ost */ - long cl_lost_grant; /* lost credits (trunc) */ + unsigned long cl_dirty_pages; /* all _dirty_ in pahges */ + unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ + unsigned long cl_dirty_transit; /* dirty synchronous */ + unsigned long cl_avail_grant; /* bytes of credit for ost */ + unsigned long cl_lost_grant; /* lost credits (trunc) */ /* since we allocate grant by blocks, we don't know how many grant will * be used to add a page into cache. As a solution, we reserve maximum @@ -275,8 +274,7 @@ struct client_obd { * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ int cl_chunkbits; - int cl_chunk; - int cl_extent_tax; /* extent overhead, by bytes */ + unsigned int cl_extent_tax; /* extent overhead, by bytes */ /* keep track of objects that have lois that contain pages which * have been queued for async brw. this lock also protects the @@ -301,13 +299,13 @@ struct client_obd { struct list_head cl_loi_hp_ready_list; struct list_head cl_loi_write_list; struct list_head cl_loi_read_list; - int cl_r_in_flight; - int cl_w_in_flight; + __u32 cl_r_in_flight; + __u32 cl_w_in_flight; /* just a sum of the loi/lop pending numbers to be exported by sysfs */ atomic_t cl_pending_w_pages; atomic_t cl_pending_r_pages; __u32 cl_max_pages_per_rpc; - int cl_max_rpcs_in_flight; + __u32 cl_max_rpcs_in_flight; struct obd_histogram cl_read_rpc_hist; struct obd_histogram cl_write_rpc_hist; struct obd_histogram cl_read_page_hist; @@ -318,13 +316,13 @@ struct client_obd { /* lru for osc caching pages */ struct cl_client_cache *cl_cache; struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ - atomic_t *cl_lru_left; - atomic_t cl_lru_busy; + atomic_long_t *cl_lru_left; + atomic_long_t cl_lru_busy; + atomic_long_t cl_lru_in_list; atomic_t cl_lru_shrinkers; - atomic_t cl_lru_in_list; struct list_head cl_lru_list; /* lru page list */ spinlock_t cl_lru_list_lock; /* page list protector */ - atomic_t cl_unstable_count; + atomic_long_t cl_unstable_count; /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ atomic_t cl_destroy_in_flight; @@ -350,7 +348,7 @@ struct client_obd { /* used by quotacheck when the servers are older than 2.4 */ int cl_qchk_stat; /* quotacheck stat of the peer */ #define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) +#if OBD_OCD_VERSION(2, 7, 53, 0) < LUSTRE_VERSION_CODE #warning "please consider removing quotacheck compatibility code" #endif @@ -431,7 +429,7 @@ struct lov_obd { struct lmv_tgt_desc { struct obd_uuid ltd_uuid; struct obd_export *ltd_exp; - int ltd_idx; + u32 ltd_idx; struct mutex ltd_fid_mutex; unsigned long ltd_active:1; /* target up for requests */ }; @@ -458,9 +456,8 @@ struct lmv_obd { int max_def_easize; int max_cookiesize; int max_def_cookiesize; - int server_timeout; - int tgts_size; /* size of tgts array */ + u32 tgts_size; /* size of tgts array */ struct lmv_tgt_desc **tgts; struct obd_connect_data conn_data; @@ -470,12 +467,11 @@ struct lmv_obd { struct niobuf_local { __u64 lnb_file_offset; __u32 lnb_page_offset; - __u32 len; - __u32 flags; - struct page *page; - struct dentry *dentry; - int lnb_grant_used; - int rc; + __u32 lnb_len; + __u32 lnb_flags; + struct page *lnb_page; + void *lnb_data; + int lnb_rc; }; #define LUSTRE_FLD_NAME "fld" @@ -517,7 +513,6 @@ struct niobuf_local { #define N_LOCAL_TEMP_PAGE 0x10000000 struct obd_trans_info { - __u64 oti_transno; __u64 oti_xid; /* Only used on the server side for tracking acks. */ struct oti_req_ack_lock { @@ -527,50 +522,11 @@ struct obd_trans_info { void *oti_handle; struct llog_cookie oti_onecookie; struct llog_cookie *oti_logcookies; - int oti_numcookies; - /** synchronous write is needed */ - unsigned long oti_sync_write:1; - /* initial thread handling transaction */ - struct ptlrpc_thread *oti_thread; - __u32 oti_conn_cnt; /** VBR: versions */ __u64 oti_pre_version; - /** JobID */ - char *oti_jobid; - - struct obd_uuid *oti_ost_uuid; }; -static inline void oti_alloc_cookies(struct obd_trans_info *oti, - int num_cookies) -{ - if (!oti) - return; - - if (num_cookies == 1) - oti->oti_logcookies = &oti->oti_onecookie; - else - oti->oti_logcookies = libcfs_kvzalloc(num_cookies * sizeof(oti->oti_onecookie), - GFP_NOFS); - - oti->oti_numcookies = num_cookies; -} - -static inline void oti_free_cookies(struct obd_trans_info *oti) -{ - if (!oti || !oti->oti_logcookies) - return; - - if (oti->oti_logcookies == &oti->oti_onecookie) - LASSERT(oti->oti_numcookies == 1); - else - kvfree(oti->oti_logcookies); - - oti->oti_logcookies = NULL; - oti->oti_numcookies = 0; -} - /* * Events signalled through obd_notify() upcall-chain. */ @@ -616,7 +572,6 @@ struct target_recovery_data { }; struct obd_llog_group { - int olg_seq; struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; wait_queue_head_t olg_waitq; spinlock_t olg_lock; @@ -625,7 +580,6 @@ struct obd_llog_group { /* corresponds to one of the obd's */ #define OBD_DEVICE_MAGIC 0XAB5CD6EF -#define OBD_DEV_BY_DEVNAME 0xffffd0de struct lvfs_run_ctxt { struct dt_device *dt; @@ -653,7 +607,6 @@ struct obd_device { obd_starting:1, /* started setup */ obd_force:1, /* cleanup with > 0 obd refcount */ obd_fail:1, /* cleanup with failover */ - obd_async_recov:1, /* allow asynchronous orphan cleanup */ obd_no_conn:1, /* deny new connections */ obd_inactive:1, /* device active/inactive * (for sysfs status only!!) @@ -728,9 +681,6 @@ struct obd_device { struct completion obd_kobj_unregister; }; -#define OBD_LLOG_FL_SENDNOW 0x0001 -#define OBD_LLOG_FL_EXIT 0x0002 - enum obd_cleanup_stage { /* Special case hack for MDS LOVs */ OBD_CLEANUP_EARLY, @@ -740,8 +690,6 @@ enum obd_cleanup_stage { /* get/set_info keys */ #define KEY_ASYNC "async" -#define KEY_BLOCKSIZE_BITS "blocksize_bits" -#define KEY_BLOCKSIZE "blocksize" #define KEY_CHANGELOG_CLEAR "changelog_clear" #define KEY_FID2PATH "fid2path" #define KEY_CHECKSUM "checksum" @@ -753,30 +701,22 @@ enum obd_cleanup_stage { #define KEY_GRANT_SHRINK "grant_shrink" #define KEY_HSM_COPYTOOL_SEND "hsm_send" #define KEY_INIT_RECOV_BACKUP "init_recov_bk" -#define KEY_INIT_RECOV "initial_recov" #define KEY_INTERMDS "inter_mds" #define KEY_LAST_ID "last_id" #define KEY_LAST_FID "last_fid" -#define KEY_LOCK_TO_STRIPE "lock_to_stripe" #define KEY_LOVDESC "lovdesc" -#define KEY_LOV_IDX "lov_idx" #define KEY_MAX_EASIZE "max_easize" #define KEY_DEFAULT_EASIZE "default_easize" -#define KEY_MDS_CONN "mds_conn" #define KEY_MGSSEC "mgssec" -#define KEY_NEXT_ID "next_id" #define KEY_READ_ONLY "read-only" #define KEY_REGISTER_TARGET "register_target" #define KEY_SET_FS "set_fs" #define KEY_TGT_COUNT "tgt_count" /* KEY_SET_INFO in lustre_idl.h */ #define KEY_SPTLRPC_CONF "sptlrpc_conf" -#define KEY_CONNECT_FLAG "connect_flags" -#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel" #define KEY_CACHE_SET "cache_set" #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" -#define KEY_CHANGELOG_INDEX "changelog_index" struct lu_context; @@ -801,9 +741,11 @@ static inline int it_to_lock_mode(struct lookup_intent *it) /* CREAT needs to be tested before open (both could be set) */ if (it->it_op & IT_CREAT) return LCK_CW; - else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP | + else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | IT_LAYOUT)) return LCK_CR; + else if (it->it_op & IT_READDIR) + return LCK_PR; else if (it->it_op & IT_GETXATTR) return LCK_PR; else if (it->it_op & IT_SETXATTR) @@ -813,6 +755,14 @@ static inline int it_to_lock_mode(struct lookup_intent *it) return -EINVAL; } +enum md_cli_flags { + CLI_SET_MEA = BIT(0), + CLI_RM_ENTRY = BIT(1), + CLI_HASH64 = BIT(2), + CLI_API32 = BIT(3), + CLI_MIGRATE = BIT(4), +}; + struct md_op_data { struct lu_fid op_fid1; /* operation fid1 (usually parent) */ struct lu_fid op_fid2; /* operation fid2 (usually child) */ @@ -822,7 +772,7 @@ struct md_op_data { struct lustre_handle op_handle; s64 op_mod_time; const char *op_name; - int op_namelen; + size_t op_namelen; __u32 op_mode; struct lmv_stripe_md *op_mea1; struct lmv_stripe_md *op_mea2; @@ -831,6 +781,7 @@ struct md_op_data { __u32 op_fsgid; cfs_cap_t op_cap; void *op_data; + size_t op_data_size; /* iattr fields and blocks. */ struct iattr op_attr; @@ -845,28 +796,29 @@ struct md_op_data { /* Various operation flags. */ enum mds_op_bias op_bias; - /* Operation type */ - __u32 op_opc; - /* Used by readdir */ __u64 op_offset; /* Used by readdir */ - __u32 op_npages; + __u32 op_max_pages; /* used to transfer info between the stacks of MD client * see enum op_cli_flags */ - __u32 op_cli_flags; + enum md_cli_flags op_cli_flags; /* File object data version for HSM release, on client */ __u64 op_data_version; struct lustre_handle op_lease_handle; + + /* default stripe offset */ + __u32 op_default_stripe_offset; }; -enum op_cli_flags { - CLI_SET_MEA = 1 << 0, - CLI_RM_ENTRY = 1 << 1, +struct md_callback { + int (*md_blocking_ast)(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); }; struct md_enqueue_info; @@ -879,8 +831,7 @@ struct md_enqueue_info { struct inode *mi_dir; int (*mi_cb)(struct ptlrpc_request *req, struct md_enqueue_info *minfo, int rc); - __u64 mi_cbdata; - unsigned int mi_generation; + void *mi_cbdata; }; struct obd_ops { @@ -894,8 +845,6 @@ struct obd_ops { __u32 keylen, void *key, __u32 vallen, void *val, struct ptlrpc_request_set *set); - int (*attach)(struct obd_device *dev, u32 len, void *data); - int (*detach)(struct obd_device *dev); int (*setup)(struct obd_device *dev, struct lustre_cfg *cfg); int (*precleanup)(struct obd_device *dev, enum obd_cleanup_stage cleanup_stage); @@ -927,8 +876,8 @@ struct obd_ops { int (*fid_fini)(struct obd_device *obd); /* Allocate new fid according to passed @hint. */ - int (*fid_alloc)(struct obd_export *exp, struct lu_fid *fid, - struct md_op_data *op_data); + int (*fid_alloc)(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); /* * Object with @fid is getting deleted, we may want to do something @@ -943,13 +892,10 @@ struct obd_ops { int (*unpackmd)(struct obd_export *exp, struct lov_stripe_md **mem_tgt, struct lov_mds_md *disk_src, int disk_len); - int (*preallocate)(struct lustre_handle *, u32 *req, u64 *ids); int (*create)(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md **ea, - struct obd_trans_info *oti); + struct obdo *oa, struct obd_trans_info *oti); int (*destroy)(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *ea, - struct obd_trans_info *oti, struct obd_export *md_exp); + struct obdo *oa, struct obd_trans_info *oti); int (*setattr)(const struct lu_env *, struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti); int (*setattr_async)(struct obd_export *exp, struct obd_info *oinfo, @@ -959,8 +905,6 @@ struct obd_ops { struct obd_info *oinfo); int (*getattr_async)(struct obd_export *exp, struct obd_info *oinfo, struct ptlrpc_request_set *set); - int (*adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, - u64 size, int shrink); int (*preprw)(const struct lu_env *env, int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *remote, @@ -972,8 +916,6 @@ struct obd_ops { struct niobuf_remote *remote, int pages, struct niobuf_local *local, struct obd_trans_info *oti, int rc); - int (*find_cbdata)(struct obd_export *, struct lov_stripe_md *, - ldlm_iterator_t it, void *data); int (*init_export)(struct obd_export *exp); int (*destroy_export)(struct obd_export *exp); @@ -1009,27 +951,11 @@ struct obd_ops { */ }; -enum { - LUSTRE_OPC_MKDIR = (1 << 0), - LUSTRE_OPC_SYMLINK = (1 << 1), - LUSTRE_OPC_MKNOD = (1 << 2), - LUSTRE_OPC_CREATE = (1 << 3), - LUSTRE_OPC_ANY = (1 << 4) -}; - /* lmv structures */ -#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 -#define MEA_MAGIC_ALL_CHARS 0xb222a11c -#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b - -#define MAX_HASH_SIZE_32 0x7fffffffUL -#define MAX_HASH_SIZE 0x7fffffffffffffffULL -#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL - struct lustre_md { struct mdt_body *body; struct lov_stripe_md *lsm; - struct lmv_stripe_md *mea; + struct lmv_stripe_md *lmv; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *posix_acl; #endif @@ -1044,48 +970,55 @@ struct md_open_data { bool mod_is_create; }; +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + struct lustre_handle och_lease_handle; /* open lock for lease */ + __u32 och_magic; + int och_flags; +}; + +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + struct lookup_intent; +struct cl_attr; struct md_ops { int (*getstatus)(struct obd_export *, struct lu_fid *); int (*null_inode)(struct obd_export *, const struct lu_fid *); - int (*find_cbdata)(struct obd_export *, const struct lu_fid *, - ldlm_iterator_t, void *); int (*close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); int (*create)(struct obd_export *, struct md_op_data *, - const void *, int, int, __u32, __u32, cfs_cap_t, - __u64, struct ptlrpc_request **); + const void *, size_t, umode_t, uid_t, gid_t, + cfs_cap_t, __u64, struct ptlrpc_request **); int (*done_writing)(struct obd_export *, struct md_op_data *, struct md_open_data *); int (*enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + const ldlm_policy_data_t *, struct lookup_intent *, struct md_op_data *, - struct lustre_handle *, void *, int, - struct ptlrpc_request **, __u64); + struct lustre_handle *, __u64); int (*getattr)(struct obd_export *, struct md_op_data *, struct ptlrpc_request **); int (*getattr_name)(struct obd_export *, struct md_op_data *, struct ptlrpc_request **); int (*intent_lock)(struct obd_export *, struct md_op_data *, - void *, int, struct lookup_intent *, int, + struct lookup_intent *, struct ptlrpc_request **, ldlm_blocking_callback, __u64); int (*link)(struct obd_export *, struct md_op_data *, struct ptlrpc_request **); int (*rename)(struct obd_export *, struct md_op_data *, - const char *, int, const char *, int, + const char *, size_t, const char *, size_t, struct ptlrpc_request **); - int (*is_subdir)(struct obd_export *, const struct lu_fid *, - const struct lu_fid *, - struct ptlrpc_request **); int (*setattr)(struct obd_export *, struct md_op_data *, void *, - int, void *, int, struct ptlrpc_request **, + size_t, void *, size_t, struct ptlrpc_request **, struct md_open_data **mod); int (*sync)(struct obd_export *, const struct lu_fid *, struct ptlrpc_request **); - int (*readpage)(struct obd_export *, struct md_op_data *, - struct page **, struct ptlrpc_request **); - + int (*read_page)(struct obd_export *, struct md_op_data *, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage); int (*unlink)(struct obd_export *, struct md_op_data *, struct ptlrpc_request **); @@ -1097,7 +1030,7 @@ struct md_ops { u64, const char *, const char *, int, int, int, struct ptlrpc_request **); - int (*init_ea_size)(struct obd_export *, int, int, int, int); + int (*init_ea_size)(struct obd_export *, u32, u32, u32, u32); int (*get_lustre_md)(struct obd_export *, struct ptlrpc_request *, struct obd_export *, struct obd_export *, @@ -1105,12 +1038,17 @@ struct md_ops { int (*free_lustre_md)(struct obd_export *, struct lustre_md *); + int (*merge_attr)(struct obd_export *, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, ldlm_blocking_callback); + int (*set_open_replay_data)(struct obd_export *, struct obd_client_handle *, struct lookup_intent *); int (*clear_open_replay_data)(struct obd_export *, struct obd_client_handle *); - int (*set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + int (*set_lock_data)(struct obd_export *, const struct lustre_handle *, + void *, __u64 *); enum ldlm_mode (*lock_match)(struct obd_export *, __u64, const struct lu_fid *, enum ldlm_type, @@ -1121,6 +1059,11 @@ struct md_ops { ldlm_policy_data_t *, enum ldlm_mode, enum ldlm_cancel_flags flags, void *opaque); + int (*get_fid_from_lsm)(struct obd_export *, + const struct lmv_stripe_md *, + const char *name, int namelen, + struct lu_fid *fid); + int (*intent_getattr_async)(struct obd_export *, struct md_enqueue_info *, struct ldlm_enqueue_info *); @@ -1137,8 +1080,6 @@ struct md_ops { struct lsm_operations { void (*lsm_free)(struct lov_stripe_md *); - int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, - struct obd_export *md_exp); void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, u64 *, u64 *); void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, u64 *, @@ -1164,10 +1105,6 @@ static inline const struct lsm_operations *lsm_op_find(int magic) } } -/* Requests for obd_extent_calc() */ -#define OBD_CALC_STRIPE_START 1 -#define OBD_CALC_STRIPE_END 2 - static inline struct md_open_data *obd_mod_alloc(void) { struct md_open_data *mod; @@ -1211,7 +1148,8 @@ static inline const char *lu_dev_name(const struct lu_device *lu_dev) return lu_dev->ld_obd->obd_name; } -static inline bool filename_is_volatile(const char *name, int namelen, int *idx) +static inline bool filename_is_volatile(const char *name, size_t namelen, + int *idx) { const char *start; char *end; @@ -1259,4 +1197,28 @@ static inline int cli_brw_size(struct obd_device *obd) return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; } +/* + * when RPC size or the max RPCs in flight is increased, the max dirty pages + * of the client should be increased accordingly to avoid sending fragmented + * RPCs over the network when the client runs out of the maximum dirty space + * when so many RPCs are being generated. + */ +static inline void client_adjust_max_dirty(struct client_obd *cli) +{ + /* initializing */ + if (cli->cl_dirty_max_pages <= 0) + cli->cl_dirty_max_pages = + (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; + else { + unsigned long dirty_max = cli->cl_max_rpcs_in_flight * + cli->cl_max_pages_per_rpc; + + if (dirty_max > cli->cl_dirty_max_pages) + cli->cl_dirty_max_pages = dirty_max; + } + + if (cli->cl_dirty_max_pages > totalram_pages / 8) + cli->cl_dirty_max_pages = totalram_pages / 8; +} + #endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h index 6482a937000b..16094dbec08b 100644 --- a/drivers/staging/lustre/lustre/include/obd_class.h +++ b/drivers/staging/lustre/lustre/include/obd_class.h @@ -56,7 +56,6 @@ #define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving * information from MDT0. */ -#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */ /* OBD Device Declarations */ extern struct obd_device *obd_devs[MAX_OBD_DEVICES]; @@ -97,6 +96,11 @@ int obd_zombie_impexp_init(void); void obd_zombie_impexp_stop(void); void obd_zombie_barrier(void); +int obd_get_request_slot(struct client_obd *cli); +void obd_put_request_slot(struct client_obd *cli); +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); + struct llog_handle; struct llog_rec_hdr; typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, @@ -265,10 +269,10 @@ static inline int lprocfs_climp_check(struct obd_device *obd) struct inode; struct lu_attr; struct obdo; -void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid); +void obdo_refresh_inode(struct inode *dst, const struct obdo *src, u32 valid); -void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); -void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid); +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); +void md_from_obdo(struct md_op_data *op_data, const struct obdo *oa, u32 valid); #define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_dt_ops->op @@ -673,15 +677,6 @@ static inline int obd_unpackmd(struct obd_export *exp, return rc; } -/* helper functions */ -static inline int obd_alloc_memmd(struct obd_export *exp, - struct lov_stripe_md **mem_tgt) -{ - LASSERT(mem_tgt); - LASSERT(!*mem_tgt); - return obd_unpackmd(exp, mem_tgt, NULL, 0); -} - static inline int obd_free_memmd(struct obd_export *exp, struct lov_stripe_md **mem_tgt) { @@ -695,29 +690,26 @@ static inline int obd_free_memmd(struct obd_export *exp, } static inline int obd_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *obdo, struct lov_stripe_md **ea, - struct obd_trans_info *oti) + struct obdo *obdo, struct obd_trans_info *oti) { int rc; EXP_CHECK_DT_OP(exp, create); EXP_COUNTER_INCREMENT(exp, create); - rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti); + rc = OBP(exp->exp_obd, create)(env, exp, obdo, oti); return rc; } static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *obdo, struct lov_stripe_md *ea, - struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obdo *obdo, struct obd_trans_info *oti) { int rc; EXP_CHECK_DT_OP(exp, destroy); EXP_COUNTER_INCREMENT(exp, destroy); - rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp); + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, oti); return rc; } @@ -925,7 +917,8 @@ static inline int obd_fid_fini(struct obd_device *obd) return rc; } -static inline int obd_fid_alloc(struct obd_export *exp, +static inline int obd_fid_alloc(const struct lu_env *env, + struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data) { @@ -934,7 +927,7 @@ static inline int obd_fid_alloc(struct obd_export *exp, EXP_CHECK_DT_OP(exp, fid_alloc); EXP_COUNTER_INCREMENT(exp, fid_alloc); - rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data); + rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); return rc; } @@ -1147,19 +1140,6 @@ static inline int obd_commitrw(const struct lu_env *env, int cmd, return rc; } -static inline int obd_adjust_kms(struct obd_export *exp, - struct lov_stripe_md *lsm, u64 size, - int shrink) -{ - int rc; - - EXP_CHECK_DT_OP(exp, adjust_kms); - EXP_COUNTER_INCREMENT(exp, adjust_kms); - - rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink); - return rc; -} - static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { @@ -1172,19 +1152,6 @@ static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, return rc; } -static inline int obd_find_cbdata(struct obd_export *exp, - struct lov_stripe_md *lsm, - ldlm_iterator_t it, void *data) -{ - int rc; - - EXP_CHECK_DT_OP(exp, find_cbdata); - EXP_COUNTER_INCREMENT(exp, find_cbdata); - - rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data); - return rc; -} - static inline void obd_import_event(struct obd_device *obd, struct obd_import *imp, enum obd_import_event event) @@ -1210,12 +1177,7 @@ static inline int obd_notify(struct obd_device *obd, if (rc) return rc; - /* the check for async_recov is a complete hack - I'm hereby - * overloading the meaning to also mean "this was called from - * mds_postsetup". I know that my mds is able to handle notifies - * by this point, and it needs to get them to execute mds_postrecov. - */ - if (!obd->obd_set_up && !obd->obd_async_recov) { + if (!obd->obd_set_up) { CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); return -EINVAL; } @@ -1358,18 +1320,6 @@ static inline int md_null_inode(struct obd_export *exp, return rc; } -static inline int md_find_cbdata(struct obd_export *exp, - const struct lu_fid *fid, - ldlm_iterator_t it, void *data) -{ - int rc; - - EXP_CHECK_MD_OP(exp, find_cbdata); - EXP_MD_COUNTER_INCREMENT(exp, find_cbdata); - rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data); - return rc; -} - static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, struct md_open_data *mod, struct ptlrpc_request **request) @@ -1383,9 +1333,9 @@ static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, } static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, int datalen, int mode, __u32 uid, - __u32 gid, cfs_cap_t cap_effective, __u64 rdev, - struct ptlrpc_request **request) + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, + __u64 rdev, struct ptlrpc_request **request) { int rc; @@ -1410,19 +1360,18 @@ static inline int md_done_writing(struct obd_export *exp, static inline int md_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const ldlm_policy_data_t *policy, struct lookup_intent *it, struct md_op_data *op_data, struct lustre_handle *lockh, - void *lmm, int lmmsize, - struct ptlrpc_request **req, __u64 extra_lock_flags) { int rc; EXP_CHECK_MD_OP(exp, enqueue); EXP_MD_COUNTER_INCREMENT(exp, enqueue); - rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh, - lmm, lmmsize, req, extra_lock_flags); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, it, op_data, lockh, + extra_lock_flags); return rc; } @@ -1439,9 +1388,9 @@ static inline int md_getattr_name(struct obd_export *exp, } static inline int md_intent_lock(struct obd_export *exp, - struct md_op_data *op_data, void *lmm, - int lmmsize, struct lookup_intent *it, - int lookup_flags, struct ptlrpc_request **reqp, + struct md_op_data *op_data, + struct lookup_intent *it, + struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { @@ -1449,9 +1398,8 @@ static inline int md_intent_lock(struct obd_export *exp, EXP_CHECK_MD_OP(exp, intent_lock); EXP_MD_COUNTER_INCREMENT(exp, intent_lock); - rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, - it, lookup_flags, reqp, cb_blocking, - extra_lock_flags); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); return rc; } @@ -1467,8 +1415,8 @@ static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, } static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, - int newlen, struct ptlrpc_request **request) + const char *old, size_t oldlen, const char *new, + size_t newlen, struct ptlrpc_request **request) { int rc; @@ -1479,21 +1427,8 @@ static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, return rc; } -static inline int md_is_subdir(struct obd_export *exp, - const struct lu_fid *pfid, - const struct lu_fid *cfid, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, is_subdir); - EXP_MD_COUNTER_INCREMENT(exp, is_subdir); - rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request); - return rc; -} - static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len, + void *ea, size_t ealen, void *ea2, size_t ea2len, struct ptlrpc_request **request, struct md_open_data **mod) { @@ -1517,15 +1452,18 @@ static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, return rc; } -static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata, - struct page **pages, - struct ptlrpc_request **request) +static inline int md_read_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, + struct page **ppage) { int rc; - EXP_CHECK_MD_OP(exp, readpage); - EXP_MD_COUNTER_INCREMENT(exp, readpage); - rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request); + EXP_CHECK_MD_OP(exp, read_page); + EXP_MD_COUNTER_INCREMENT(exp, read_page); + rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, + ppage); return rc; } @@ -1559,6 +1497,16 @@ static inline int md_free_lustre_md(struct obd_export *exp, return MDP(exp->exp_obd, free_lustre_md)(exp, md); } +static inline int md_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb) +{ + EXP_CHECK_MD_OP(exp, merge_attr); + EXP_MD_COUNTER_INCREMENT(exp, merge_attr); + return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); +} + static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, u64 valid, const char *name, const char *input, int input_size, @@ -1603,7 +1551,8 @@ static inline int md_clear_open_replay_data(struct obd_export *exp, } static inline int md_set_lock_data(struct obd_export *exp, - __u64 *lockh, void *data, __u64 *bits) + const struct lustre_handle *lockh, + void *data, __u64 *bits) { EXP_CHECK_MD_OP(exp, set_lock_data); EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); @@ -1674,6 +1623,19 @@ static inline int md_revalidate_lock(struct obd_export *exp, return rc; } +static inline int md_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, + struct lu_fid *fid) +{ + int rc; + + EXP_CHECK_MD_OP(exp, get_fid_from_lsm); + EXP_MD_COUNTER_INCREMENT(exp, get_fid_from_lsm); + rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); + return rc; +} + /* OBD Metadata Support */ int obd_init_caches(void); @@ -1682,16 +1644,6 @@ void obd_cleanup_caches(void); /* support routines */ extern struct kmem_cache *obdo_cachep; -static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid) -{ - /* something here */ -} - -static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa) -{ - /* something here */ -} - typedef int (*register_lwp_cb)(void *data); struct lwp_register_item { @@ -1710,6 +1662,9 @@ struct lwp_register_item { extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); /* obd_mount.c */ +int lustre_unregister_fs(void); +int lustre_register_fs(void); +int lustre_check_exclusion(struct super_block *sb, char *svname); /* sysctl.c */ int obd_sysctl_init(void); @@ -1730,8 +1685,24 @@ void class_exit_uuidlist(void); extern char obd_jobid_node[]; extern struct miscdevice obd_psdev; extern spinlock_t obd_types_lock; +int class_procfs_init(void); +int class_procfs_clean(void); /* prng.c */ #define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) +/* statfs_pack.c */ +struct kstatfs; +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* root squash info */ +struct rw_semaphore; +struct root_squash_info { + uid_t rsi_uid; + gid_t rsi_gid; + struct list_head rsi_nosquash_nids; + struct rw_semaphore rsi_sem; +}; + #endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h index 845e64a56c21..b346a7f10aa4 100644 --- a/drivers/staging/lustre/lustre/include/obd_support.h +++ b/drivers/staging/lustre/lustre/include/obd_support.h @@ -35,7 +35,7 @@ #include <linux/slab.h> #include "../../include/linux/libcfs/libcfs.h" -#include "linux/lustre_compat25.h" +#include "lustre_compat.h" #include "lprocfs_status.h" /* global variables */ @@ -52,11 +52,9 @@ extern unsigned int at_max; extern unsigned int at_history; extern int at_early_margin; extern int at_extra; -extern unsigned int obd_sync_filter; -extern unsigned int obd_max_dirty_pages; -extern atomic_t obd_unstable_pages; -extern atomic_t obd_dirty_pages; -extern atomic_t obd_dirty_transit_pages; +extern unsigned long obd_max_dirty_pages; +extern atomic_long_t obd_dirty_pages; +extern atomic_long_t obd_dirty_transit_pages; extern char obd_jobid_var[]; /* Some hash init argument constants */ @@ -117,17 +115,17 @@ extern char obd_jobid_var[]; * running on a backup server. (If it's too low, import_select_connection * will increase the timeout anyhow.) */ -#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20) +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout / 20) /* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ #define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ INITIAL_CONNECT_TIMEOUT) /* The min time a target should wait for clients to reconnect in recovery */ -#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_RECOVERY_TIME_MIN (2 * RECONNECT_DELAY_MAX) #define OBD_IR_FACTOR_MIN 1 #define OBD_IR_FACTOR_MAX 10 -#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX / 2) /* default timeout for the MGS to become IR_FULL */ -#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define OBD_IR_MGS_TIMEOUT (4 * obd_timeout) #define LONG_UNLINK 300 /* Unlink should happen before now */ /** @@ -318,6 +316,10 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b #define OBD_FAIL_LDLM_OST_LVB 0x31c #define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d +#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 +#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 +#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 +#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 @@ -400,6 +402,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 #define OBD_FAIL_MDC_RPCS_SEM 0x804 #define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 +#define OBD_FAIL_MDC_CLOSE 0x806 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 @@ -455,6 +458,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LOV_INIT 0x1403 #define OBD_FAIL_GLIMPSE_DELAY 0x1404 #define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 +#define OBD_FAIL_GETATTR_DELAY 0x1409 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 @@ -474,11 +478,16 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LFSCK_CRASH 0x160a #define OBD_FAIL_LFSCK_NO_AUTO 0x160b #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c +#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 +#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 /* UPDATE */ #define OBD_FAIL_UPDATE_OBJ_NET 0x1700 #define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 +/* LMV */ +#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 + /* Assign references to moved code to reduce code changes */ #define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) #define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) @@ -520,7 +529,8 @@ do { \ POISON_PTR(ptr); \ } while (0) -#define KEY_IS(str) \ - (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) +#define KEY_IS(str) \ + (keylen >= (sizeof(str) - 1) && \ + memcmp(key, str, (sizeof(str) - 1)) == 0) #endif diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c index f4a70ebddeaf..e134ecd21bb2 100644 --- a/drivers/staging/lustre/lustre/ldlm/interval_tree.c +++ b/drivers/staging/lustre/lustre/ldlm/interval_tree.c @@ -90,6 +90,17 @@ static inline int extent_equal(struct interval_node_extent *e1, return (e1->start == e2->start) && (e1->end == e2->end); } +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_equal(struct interval_node *n1, struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + static inline __u64 max_u64(__u64 x, __u64 y) { return x > y ? x : y; @@ -262,7 +273,7 @@ struct interval_node *interval_insert(struct interval_node *node, p = root; while (*p) { parent = *p; - if (extent_equal(&parent->in_extent, &node->in_extent)) + if (node_equal(parent, node)) return parent; /* max_high field must be updated after each iteration */ @@ -463,3 +474,90 @@ color: interval_erase_color(child, parent, root); } EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. + * + * { + * if (!node) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + enum interval_iter rc = INTERVAL_ITER_CONT; + struct interval_node *parent; + + LASSERT(ext); + LASSERT(func); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* + * If we ever got the left, it means that the + * parent met ext->end<interval_low(parent), or + * may_overlap(parent). If the former is true, + * we needn't go back. So stop early and check + * may_overlap(parent) after this loop. + */ + node = parent->in_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (!parent || !interval_may_overlap(parent, ext)) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_search); diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c index ea8840cb9056..3845f386f1db 100644 --- a/drivers/staging/lustre/lustre/ldlm/l_lock.c +++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c @@ -45,6 +45,8 @@ * being an atomic operation. */ struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) + __acquires(&lock->l_lock) + __acquires(&lock->l_resource->lr_lock) { spin_lock(&lock->l_lock); @@ -59,6 +61,8 @@ EXPORT_SYMBOL(lock_res_and_lock); * Unlock a lock and its resource previously locked with lock_res_and_lock */ void unlock_res_and_lock(struct ldlm_lock *lock) + __releases(&lock->l_resource->lr_lock) + __releases(&lock->l_lock) { /* on server-side resource of lock doesn't change */ ldlm_clear_res_locked(lock); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c index f5023d9b78f5..ecf472e4813d 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c @@ -221,7 +221,7 @@ void ldlm_extent_unlink_lock(struct ldlm_lock *lock) } void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, - ldlm_policy_data_t *lpolicy) + ldlm_policy_data_t *lpolicy) { memset(lpolicy, 0, sizeof(*lpolicy)); lpolicy->l_extent.start = wpolicy->l_extent.start; @@ -230,7 +230,7 @@ void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, } void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, - ldlm_wire_policy_data_t *wpolicy) + ldlm_wire_policy_data_t *wpolicy) { memset(wpolicy, 0, sizeof(*wpolicy)); wpolicy->l_extent.start = lpolicy->l_extent.start; diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c index d6b61bc39135..861f36f039b5 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c @@ -97,7 +97,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode, __u64 flags) LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); list_del_init(&lock->l_res_link); - if (flags == LDLM_FL_WAIT_NOREPROC && !ldlm_is_failed(lock)) { + if (flags == LDLM_FL_WAIT_NOREPROC) { /* client side - set a flag to prevent sending a CANCEL */ lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; @@ -166,7 +166,7 @@ reprocess: */ list_for_each(tmp, &res->lr_granted) { lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + l_res_link); if (ldlm_same_flock_owner(lock, req)) { ownlocks = tmp; break; @@ -182,7 +182,7 @@ reprocess: */ list_for_each(tmp, &res->lr_granted) { lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + l_res_link); if (ldlm_same_flock_owner(lock, req)) { if (!ownlocks) @@ -339,10 +339,10 @@ reprocess: lock->l_granted_mode, &null_cbs, NULL, 0, LVB_T_NONE); lock_res_and_lock(req); - if (!new2) { + if (IS_ERR(new2)) { ldlm_flock_destroy(req, lock->l_granted_mode, *flags); - *err = -ENOLCK; + *err = PTR_ERR(new2); return LDLM_ITER_STOP; } goto reprocess; @@ -455,29 +455,22 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) enum ldlm_error err; int rc = 0; + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAIL_LOC; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); + } CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n", flags, data, getlk); - /* Import invalidation. We need to actually release the lock - * references being held, so that it can go away. No point in - * holding the lock even if app still believes it has it, since - * server already dropped it anyway. Only for granted locks too. - */ - if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == - (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { - if (lock->l_req_mode == lock->l_granted_mode && - lock->l_granted_mode != LCK_NL && !data) - ldlm_lock_decref_internal(lock, lock->l_req_mode); - - /* Need to wake up the waiter if we were evicted */ - wake_up(&lock->l_waitq); - return 0; - } - LASSERT(flags != LDLM_FL_WAIT_NOREPROC); - if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV))) { + if (flags & LDLM_FL_FAILED) + goto granted; + + if (!(flags & LDLM_FL_BLOCKED_MASK)) { if (!data) /* mds granted the lock in the reply */ goto granted; @@ -514,12 +507,21 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) granted: OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); - if (ldlm_is_failed(lock)) { - LDLM_DEBUG(lock, "client-side enqueue waking up: failed"); - return -EIO; + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); + } + if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { + lock_res_and_lock(lock); + /* DEADLOCK is always set with CBPENDING */ + lock->l_flags |= LDLM_FL_FAIL_LOC | + LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); } - - LDLM_DEBUG(lock, "client-side enqueue granted"); lock_res_and_lock(lock); @@ -530,20 +532,59 @@ granted: if (ldlm_is_destroyed(lock)) { unlock_res_and_lock(lock); LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); - return 0; + /* + * An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. + */ + return -EIO; } /* ldlm_lock_enqueue() has already placed lock on the granted list. */ - list_del_init(&lock->l_res_link); + ldlm_resource_unlink_lock(lock); + + /* + * Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. + */ + /* Do the same for DEADLOCK'ed locks. */ + if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { + int mode; + + if (flags & LDLM_FL_TEST_LOCK) + LASSERT(ldlm_is_test_lock(lock)); + + if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + mode = getlk->fl_type; + else + mode = lock->l_granted_mode; + + if (ldlm_is_flock_deadlock(lock)) { + LDLM_DEBUG(lock, "client-side enqueue deadlock received"); + rc = -EDEADLK; + } + ldlm_flock_destroy(lock, mode, LDLM_FL_WAIT_NOREPROC); + unlock_res_and_lock(lock); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + + /* + * An error is still to be returned, to propagate it up to + * ldlm_cli_enqueue_fini() caller. + */ + return rc ? : -EIO; + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); - if (ldlm_is_flock_deadlock(lock)) { - LDLM_DEBUG(lock, "client-side enqueue deadlock received"); - rc = -EDEADLK; - } else if (flags & LDLM_FL_TEST_LOCK) { + if (flags & LDLM_FL_TEST_LOCK) { /* fcntl(F_GETLK) request */ /* The old mode was saved in getlk->fl_type so that if the mode * in the lock changes we can decref the appropriate refcount. */ + LASSERT(ldlm_is_test_lock(lock)); ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC); switch (lock->l_granted_mode) { case LCK_PR: diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h index e4cf65d2d3b1..5e82cfc245b2 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h @@ -100,9 +100,10 @@ enum { int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, enum ldlm_cancel_flags sync, int flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, - enum ldlm_cancel_flags cancel_flags, int flags); -extern int ldlm_enqueue_min; + struct list_head *cancels, int count, int max, + enum ldlm_cancel_flags cancel_flags, int flags); +extern unsigned int ldlm_enqueue_min; +extern unsigned int ldlm_cancel_unused_locks_before_replay; /* ldlm_resource.c */ int ldlm_resource_putref_locked(struct ldlm_resource *res); @@ -200,8 +201,7 @@ ldlm_interval_extent(struct ldlm_interval *node) LASSERT(!list_empty(&node->li_group)); - lock = list_entry(node->li_group.next, struct ldlm_lock, - l_sl_policy); + lock = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy); return &lock->l_policy_data.l_extent; } @@ -302,7 +302,7 @@ static inline int is_granted_or_cancelled(struct ldlm_lock *lock) lock_res_and_lock(lock); if ((lock->l_req_mode == lock->l_granted_mode) && - !ldlm_is_cp_reqd(lock)) + !ldlm_is_cp_reqd(lock)) ret = 1; else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) ret = 1; @@ -326,13 +326,13 @@ void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, ldlm_wire_policy_data_t *wpolicy); void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, - ldlm_policy_data_t *lpolicy); + ldlm_policy_data_t *lpolicy); void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, - ldlm_wire_policy_data_t *wpolicy); + ldlm_wire_policy_data_t *wpolicy); void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, - ldlm_policy_data_t *lpolicy); + ldlm_policy_data_t *lpolicy); void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, - ldlm_policy_data_t *lpolicy); + ldlm_policy_data_t *lpolicy); void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, ldlm_wire_policy_data_t *wpolicy); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c index 7c832aae7d5e..153e990c494e 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -82,7 +82,7 @@ static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, if (priority) { list_del(&item->oic_item); list_add(&item->oic_item, - &imp->imp_conn_list); + &imp->imp_conn_list); item->oic_last_attempt = 0; } CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", @@ -102,7 +102,7 @@ static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, list_add(&imp_conn->oic_item, &imp->imp_conn_list); else list_add_tail(&imp_conn->oic_item, - &imp->imp_conn_list); + &imp->imp_conn_list); CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", imp, imp->imp_obd->obd_name, uuid->uuid, (priority ? "head" : "tail")); @@ -299,12 +299,14 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), sizeof(server_uuid))); - cli->cl_dirty = 0; + cli->cl_dirty_pages = 0; cli->cl_avail_grant = 0; - /* FIXME: Should limit this for the sum of all cl_dirty_max. */ - cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024; - if (cli->cl_dirty_max >> PAGE_SHIFT > totalram_pages / 8) - cli->cl_dirty_max = totalram_pages << (PAGE_SHIFT - 3); + /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ + /* + * cl_dirty_max_pages may be changed at connect time in + * ptlrpc_connect_interpret(). + */ + client_adjust_max_dirty(cli); INIT_LIST_HEAD(&cli->cl_cache_waiters); INIT_LIST_HEAD(&cli->cl_loi_ready_list); INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); @@ -326,11 +328,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) /* lru for osc. */ INIT_LIST_HEAD(&cli->cl_lru_osc); atomic_set(&cli->cl_lru_shrinkers, 0); - atomic_set(&cli->cl_lru_busy, 0); - atomic_set(&cli->cl_lru_in_list, 0); + atomic_long_set(&cli->cl_lru_busy, 0); + atomic_long_set(&cli->cl_lru_in_list, 0); INIT_LIST_HEAD(&cli->cl_lru_list); spin_lock_init(&cli->cl_lru_list_lock); - atomic_set(&cli->cl_unstable_count, 0); + atomic_long_set(&cli->cl_unstable_count, 0); init_waitqueue_head(&cli->cl_destroy_waitq); atomic_set(&cli->cl_destroy_in_flight, 0); @@ -360,7 +362,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) cli->cl_chunkbits = PAGE_SHIFT; if (!strcmp(name, LUSTRE_MDC_NAME)) { - cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT; + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { @@ -368,7 +370,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { - cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; } rc = ldlm_get_ref(); if (rc) { @@ -534,7 +536,7 @@ int client_disconnect_export(struct obd_export *exp) imp = cli->cl_import; down_write(&cli->cl_sem); - CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name, + CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, cli->cl_conn_count); if (!cli->cl_conn_count) { @@ -690,7 +692,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) if (rs->rs_transno > exp->exp_last_committed) { /* not committed already */ list_add_tail(&rs->rs_obd_list, - &exp->exp_uncommitted_replies); + &exp->exp_uncommitted_replies); } spin_unlock(&exp->exp_uncommitted_replies_lock); @@ -795,7 +797,7 @@ void ldlm_dump_export_locks(struct obd_export *exp) CERROR("dumping locks for export %p,ignore if the unmount doesn't hang\n", exp); list_for_each_entry(lock, &exp->exp_locks_list, - l_exp_refs_link) + l_exp_refs_link) LDLM_ERROR(lock, "lock:"); } spin_unlock(&exp->exp_locks_list_guard); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c index a5993f745ebe..3c48b4fb96f1 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c @@ -481,8 +481,8 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, unlock_res_and_lock(lock); newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); - if (!newres) - return -ENOMEM; + if (IS_ERR(newres)) + return PTR_ERR(newres); lu_ref_add(&newres->lr_reference, "lock", lock); /* @@ -542,7 +542,7 @@ struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, LASSERT(handle); - lock = class_handle2object(handle->cookie); + lock = class_handle2object(handle->cookie, NULL); if (!lock) return NULL; @@ -937,7 +937,7 @@ static void search_granted_lock(struct list_head *queue, /* go to next policy group within mode group */ tmp = policy_end->l_res_link.next; lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + l_res_link); } /* loop over policy groups within the mode group */ /* insert point is last lock of the mode group, @@ -1028,15 +1028,28 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) check_res_locked(res); lock->l_granted_mode = lock->l_req_mode; + + if (work_list && lock->l_completion_ast) + ldlm_add_ast_work_item(lock, NULL, work_list); + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) ldlm_grant_lock_with_skiplist(lock); else if (res->lr_type == LDLM_EXTENT) ldlm_extent_add_lock(res, lock); - else + else if (res->lr_type == LDLM_FLOCK) { + /* + * We should not add locks to granted list in the following cases: + * - this is an UNLOCK but not a real lock; + * - this is a TEST lock; + * - this is a F_CANCELLK lock (async flock has req_mode == 0) + * - this is a deadlock (flock cannot be granted) + */ + if (!lock->l_req_mode || lock->l_req_mode == LCK_NL || + ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) + return; ldlm_resource_add_lock(res, &res->lr_granted, lock); - - if (work_list && lock->l_completion_ast) - ldlm_add_ast_work_item(lock, NULL, work_list); + } else + LBUG(); ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); } @@ -1103,7 +1116,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue, * of bits. */ if (lock->l_resource->lr_type == LDLM_IBITS && - ((lock->l_policy_data.l_inodebits.bits & + ((lock->l_policy_data.l_inodebits.bits & policy->l_inodebits.bits) != policy->l_inodebits.bits)) continue; @@ -1214,7 +1227,7 @@ enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, } res = ldlm_resource_get(ns, NULL, res_id, type, 0); - if (!res) { + if (IS_ERR(res)) { LASSERT(!old_lock); return 0; } @@ -1363,12 +1376,12 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, if (size == sizeof(struct ost_lvb)) { if (loc == RCL_CLIENT) lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb); + &RMF_DLM_LVB, + lustre_swab_ost_lvb); else lvb = req_capsule_server_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb); + &RMF_DLM_LVB, + lustre_swab_ost_lvb); if (unlikely(!lvb)) { LDLM_ERROR(lock, "no LVB"); return -EPROTO; @@ -1380,8 +1393,8 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, if (loc == RCL_CLIENT) lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb_v1); + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); else lvb = req_capsule_server_sized_swab_get(pill, &RMF_DLM_LVB, size, @@ -1405,12 +1418,12 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, if (size == sizeof(struct lquota_lvb)) { if (loc == RCL_CLIENT) lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_lquota_lvb); + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); else lvb = req_capsule_server_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_lquota_lvb); + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); if (unlikely(!lvb)) { LDLM_ERROR(lock, "no LVB"); return -EPROTO; @@ -1462,15 +1475,15 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, { struct ldlm_lock *lock; struct ldlm_resource *res; + int rc; res = ldlm_resource_get(ns, NULL, res_id, type, 1); - if (!res) - return NULL; + if (IS_ERR(res)) + return ERR_CAST(res); lock = ldlm_lock_new(res); - if (!lock) - return NULL; + return ERR_PTR(-ENOMEM); lock->l_req_mode = mode; lock->l_ast_data = data; @@ -1484,27 +1497,33 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_tree_node = NULL; /* if this is the extent lock, allocate the interval tree node */ if (type == LDLM_EXTENT) { - if (!ldlm_interval_alloc(lock)) + if (!ldlm_interval_alloc(lock)) { + rc = -ENOMEM; goto out; + } } if (lvb_len) { lock->l_lvb_len = lvb_len; lock->l_lvb_data = kzalloc(lvb_len, GFP_NOFS); - if (!lock->l_lvb_data) + if (!lock->l_lvb_data) { + rc = -ENOMEM; goto out; + } } lock->l_lvb_type = lvb_type; - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) { + rc = -ENOENT; goto out; + } return lock; out: ldlm_lock_destroy(lock); LDLM_LOCK_RELEASE(lock); - return NULL; + return ERR_PTR(rc); } /** @@ -1522,16 +1541,13 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, struct ldlm_lock *lock = *lockp; struct ldlm_resource *res = lock->l_resource; - lock->l_last_activity = ktime_get_real_seconds(); - lock_res_and_lock(lock); if (lock->l_req_mode == lock->l_granted_mode) { /* The server returned a blocked lock, but it was granted * before we got a chance to actually enqueue it. We don't * need to do anything else. */ - *flags &= ~(LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); + *flags &= ~LDLM_FL_BLOCKED_MASK; goto out; } @@ -1546,6 +1562,8 @@ enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, */ if (*flags & LDLM_FL_AST_DISCARD_DATA) ldlm_set_ast_discard_data(lock); + if (*flags & LDLM_FL_TEST_LOCK) + ldlm_set_test_lock(lock); /* * This distinction between local lock trees is very important; a client @@ -1688,7 +1706,7 @@ static int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) return -ENOENT; gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, - gl_list); + gl_list); list_del_init(&gl_work->gl_list); lock = gl_work->gl_lock; diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 821939ff2e6b..fde697ebaadc 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -559,8 +559,11 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) switch (lustre_msg_get_opc(req->rq_reqmsg)) { case LDLM_BL_CALLBACK: - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { + if (cfs_fail_err) + ldlm_callback_reply(req, -(int)cfs_fail_err); return 0; + } break; case LDLM_CP_CALLBACK: if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) @@ -706,12 +709,12 @@ static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp) if (!list_empty(&blp->blp_list) && (list_empty(&blp->blp_prio_list) || num_bl == 0)) blwi = list_entry(blp->blp_list.next, - struct ldlm_bl_work_item, blwi_entry); + struct ldlm_bl_work_item, blwi_entry); else if (!list_empty(&blp->blp_prio_list)) blwi = list_entry(blp->blp_prio_list.next, - struct ldlm_bl_work_item, - blwi_entry); + struct ldlm_bl_work_item, + blwi_entry); if (blwi) { if (++num_bl >= atomic_read(&blp->blp_num_threads)) @@ -741,7 +744,7 @@ static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp) init_completion(&bltd.bltd_comp); bltd.bltd_num = atomic_read(&blp->blp_num_threads); snprintf(bltd.bltd_name, sizeof(bltd.bltd_name), - "ldlm_bl_%02d", bltd.bltd_num); + "ldlm_bl_%02d", bltd.bltd_num); task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name); if (IS_ERR(task)) { CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", @@ -786,8 +789,8 @@ static int ldlm_bl_thread_main(void *arg) if (!blwi) { atomic_dec(&blp->blp_busy_threads); l_wait_event_exclusive(blp->blp_waitq, - (blwi = ldlm_bl_get_work(blp)), - &lwi); + (blwi = ldlm_bl_get_work(blp)), + &lwi); busy = atomic_inc_return(&blp->blp_busy_threads); } else { busy = atomic_read(&blp->blp_busy_threads); @@ -874,8 +877,6 @@ void ldlm_put_ref(void) } EXPORT_SYMBOL(ldlm_put_ref); -extern unsigned int ldlm_cancel_unused_locks_before_replay; - static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -1094,16 +1095,17 @@ int ldlm_init(void) return -ENOMEM; ldlm_lock_slab = kmem_cache_create("ldlm_locks", - sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | + SLAB_DESTROY_BY_RCU, NULL); if (!ldlm_lock_slab) { kmem_cache_destroy(ldlm_resource_slab); return -ENOMEM; } ldlm_interval_slab = kmem_cache_create("interval_node", - sizeof(struct ldlm_interval), - 0, SLAB_HWCACHE_ALIGN, NULL); + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); if (!ldlm_interval_slab) { kmem_cache_destroy(ldlm_resource_slab); kmem_cache_destroy(ldlm_lock_slab); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c index 657ed4012776..9a1136e32dfc 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c @@ -357,38 +357,40 @@ static int ldlm_pool_recalc(struct ldlm_pool *pl) int count; recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; - if (recalc_interval_sec <= 0) - goto recalc; - - spin_lock(&pl->pl_lock); if (recalc_interval_sec > 0) { - /* - * Update pool statistics every 1s. - */ - ldlm_pool_recalc_stats(pl); - - /* - * Zero out all rates and speed for the last period. - */ - atomic_set(&pl->pl_grant_rate, 0); - atomic_set(&pl->pl_cancel_rate, 0); + spin_lock(&pl->pl_lock); + recalc_interval_sec = ktime_get_seconds() - pl->pl_recalc_time; + + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); } - spin_unlock(&pl->pl_lock); - recalc: if (pl->pl_ops->po_recalc) { count = pl->pl_ops->po_recalc(pl); lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, count); } + recalc_interval_sec = pl->pl_recalc_time - ktime_get_seconds() + pl->pl_recalc_period; if (recalc_interval_sec <= 0) { + /* DEBUG: should be re-removed after LU-4536 is fixed */ + CDEBUG(D_DLMTRACE, "%s: Negative interval(%ld), too short period(%ld)\n", + pl->pl_name, (long)recalc_interval_sec, + (long)pl->pl_recalc_period); + /* Prevent too frequent recalculation. */ - CDEBUG(D_DLMTRACE, - "Negative interval(%d), too short period(%lld)", - recalc_interval_sec, - (s64)pl->pl_recalc_period); recalc_interval_sec = 1; } @@ -792,7 +794,8 @@ static struct completion ldlm_pools_comp; */ static unsigned long ldlm_pools_count(ldlm_side_t client, gfp_t gfp_mask) { - int total = 0, nr_ns; + unsigned long total = 0; + int nr_ns; struct ldlm_namespace *ns; struct ldlm_namespace *ns_old = NULL; /* loop detection */ void *cookie; @@ -995,7 +998,7 @@ static int ldlm_pools_thread_main(void *arg) wake_up(&thread->t_ctl_waitq); CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", - "ldlm_poold", current_pid()); + "ldlm_poold", current_pid()); while (1) { struct l_wait_info lwi; @@ -1025,7 +1028,7 @@ static int ldlm_pools_thread_main(void *arg) wake_up(&thread->t_ctl_waitq); CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", - "ldlm_poold", current_pid()); + "ldlm_poold", current_pid()); complete_and_exit(&ldlm_pools_comp, 0); } diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c index af487f9937f4..35ba6f14d95f 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c @@ -63,8 +63,8 @@ #include "ldlm_internal.h" -int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; -module_param(ldlm_enqueue_min, int, 0644); +unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); /* in client side, whether the cached locks will be canceled before replay */ @@ -123,44 +123,56 @@ static int ldlm_expired_completion_wait(void *data) return 0; } +/** + * Calculate the Completion timeout (covering enqueue, BL AST, data flush, + * lock cancel, and their replies). Used for lock completion timeout on the + * client side. + * + * \param[in] lock lock which is waiting the completion callback + * + * \retval timeout in seconds to wait for the server reply + */ /* We use the same basis for both server side and client side functions * from a single node. */ -static int ldlm_get_enq_timeout(struct ldlm_lock *lock) +static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) { - int timeout = at_get(ldlm_lock_to_ns_at(lock)); + unsigned int timeout; if (AT_OFF) - return obd_timeout / 2; - /* Since these are non-updating timeouts, we should be conservative. - * It would be nice to have some kind of "early reply" mechanism for - * lock callbacks too... + return obd_timeout; + + /* + * Wait a long time for enqueue - server may have to callback a + * lock from another client. Server will evict the other client if it + * doesn't respond reasonably, and then give us the lock. */ - timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */ - return max(timeout, ldlm_enqueue_min); + timeout = at_get(ldlm_lock_to_ns_at(lock)); + return max(3 * timeout, ldlm_enqueue_min); } /** * Helper function for ldlm_completion_ast(), updating timings when lock is * actually granted. */ -static int ldlm_completion_tail(struct ldlm_lock *lock) +static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) { long delay; - int result; + int result = 0; if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { LDLM_DEBUG(lock, "client-side enqueue: destroyed"); result = -EIO; + } else if (!data) { + LDLM_DEBUG(lock, "client-side enqueue: granted"); } else { + /* Take into AT only CP RPC, not immediately granted locks */ delay = ktime_get_real_seconds() - lock->l_last_activity; LDLM_DEBUG(lock, "client-side enqueue: granted after %lds", delay); /* Update our time estimate */ - at_measured(ldlm_lock_to_ns_at(lock), - delay); - result = 0; + at_measured(ldlm_lock_to_ns_at(lock), delay); } return result; } @@ -177,10 +189,9 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) return 0; } - if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV))) { + if (!(flags & LDLM_FL_BLOCKED_MASK)) { wake_up(&lock->l_waitq); - return ldlm_completion_tail(lock); + return ldlm_completion_tail(lock, data); } LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward"); @@ -224,8 +235,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) goto noreproc; } - if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV))) { + if (!(flags & LDLM_FL_BLOCKED_MASK)) { wake_up(&lock->l_waitq); return 0; } @@ -240,13 +250,10 @@ noreproc: if (obd) imp = obd->u.cli.cl_import; - /* Wait a long time for enqueue - server may have to callback a - * lock from another client. Server will evict the other client if it - * doesn't respond reasonably, and then give us the lock. - */ - timeout = ldlm_get_enq_timeout(lock) * 2; + timeout = ldlm_cp_timeout(lock); lwd.lwd_lock = lock; + lock->l_last_activity = ktime_get_real_seconds(); if (ldlm_is_no_timeout(lock)) { LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); @@ -279,7 +286,7 @@ noreproc: return rc; } - return ldlm_completion_tail(lock); + return ldlm_completion_tail(lock, data); } EXPORT_SYMBOL(ldlm_completion_ast); @@ -309,8 +316,6 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, else LDLM_DEBUG(lock, "lock was granted or failed in race"); - ldlm_lock_decref_internal(lock, mode); - /* XXX - HACK because we shouldn't call ldlm_lock_destroy() * from llite/file.c/ll_file_flock(). */ @@ -321,9 +326,14 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns, */ if (lock->l_resource->lr_type == LDLM_FLOCK) { lock_res_and_lock(lock); - ldlm_resource_unlink_lock(lock); - ldlm_lock_destroy_nolock(lock); + if (!ldlm_is_destroyed(lock)) { + ldlm_resource_unlink_lock(lock); + ldlm_lock_decref_internal_nolock(lock, mode); + ldlm_lock_destroy_nolock(lock); + } unlock_res_and_lock(lock); + } else { + ldlm_lock_decref_internal(lock, mode); } } @@ -418,11 +428,6 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, *flags = ldlm_flags_from_wire(reply->lock_flags); lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & LDLM_FL_INHERIT_MASK); - /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match() - * to wait with no timeout as well - */ - lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & - LDLM_FL_NO_TIMEOUT); unlock_res_and_lock(lock); CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: 0x%llx\n", @@ -556,7 +561,7 @@ static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, enum req_location loc, int off) { - int size = req_capsule_msg_size(pill, loc); + u32 size = req_capsule_msg_size(pill, loc); return ldlm_req_handles_avail(size, off); } @@ -565,7 +570,7 @@ static inline int ldlm_format_handles_avail(struct obd_import *imp, const struct req_format *fmt, enum req_location loc, int off) { - int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); return ldlm_req_handles_avail(size, off); } @@ -696,8 +701,8 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock = ldlm_lock_create(ns, res_id, einfo->ei_type, einfo->ei_mode, &cbs, einfo->ei_cbdata, lvb_len, lvb_type); - if (!lock) - return -ENOMEM; + if (IS_ERR(lock)) + return PTR_ERR(lock); /* for the local lock, add the reference */ ldlm_lock_addref_internal(lock, einfo->ei_mode); ldlm_lock2handle(lock, lockh); @@ -719,6 +724,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, lock->l_export = NULL; lock->l_blocking_ast = einfo->ei_cb_bl; lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); + lock->l_last_activity = ktime_get_real_seconds(); /* lock not sent to server yet */ @@ -819,7 +825,7 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) lock_res_and_lock(lock); ldlm_set_cbpending(lock); local_only = !!(lock->l_flags & - (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)); ldlm_cancel_callback(lock); rc = ldlm_is_bl_ast(lock) ? LDLM_FL_BL_AST : LDLM_FL_CANCELING; unlock_res_and_lock(lock); @@ -1180,8 +1186,7 @@ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, slv = ldlm_pool_get_slv(pl); lvf = ldlm_pool_get_lvf(pl); - la = cfs_duration_sec(cfs_time_sub(cur, - lock->l_last_used)); + la = cfs_duration_sec(cfs_time_sub(cur, lock->l_last_used)); lv = lvf * la * unused; /* Inform pool about current CLV to see it via debugfs. */ @@ -1193,9 +1198,6 @@ static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, if (slv == 0 || lv < slv) return LDLM_POLICY_KEEP_LOCK; - if (ns->ns_cancel && ns->ns_cancel(lock) == 0) - return LDLM_POLICY_KEEP_LOCK; - return LDLM_POLICY_CANCEL_LOCK; } @@ -1239,9 +1241,6 @@ static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, cfs_time_add(lock->l_last_used, ns->ns_max_age))) return LDLM_POLICY_KEEP_LOCK; - if (ns->ns_cancel && ns->ns_cancel(lock) == 0) - return LDLM_POLICY_KEEP_LOCK; - return LDLM_POLICY_CANCEL_LOCK; } @@ -1374,7 +1373,7 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, break; list_for_each_entry_safe(lock, next, &ns->ns_unused_list, - l_lru) { + l_lru) { /* No locks which got blocking requests. */ LASSERT(!ldlm_is_bl_ast(lock)); @@ -1413,7 +1412,8 @@ static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, * That is, for shrinker policy we drop only * old locks, but additionally choose them by * their weight. Big extent locks will stay in - * the cache. */ + * the cache. + */ result = pf(ns, lock, unused, added, count); if (result == LDLM_POLICY_KEEP_LOCK) { lu_ref_del(&lock->l_reference, @@ -1610,8 +1610,7 @@ int ldlm_cli_cancel_list(struct list_head *cancels, int count, */ while (count > 0) { LASSERT(!list_empty(cancels)); - lock = list_entry(cancels->next, struct ldlm_lock, - l_bl_ast); + lock = list_entry(cancels->next, struct ldlm_lock, l_bl_ast); LASSERT(lock->l_conn_export); if (exp_connect_cancelset(lock->l_conn_export)) { @@ -1660,7 +1659,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, int rc; res = ldlm_resource_get(ns, NULL, res_id, 0, 0); - if (!res) { + if (IS_ERR(res)) { /* This is not a problem. */ CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); return 0; @@ -1704,7 +1703,8 @@ static int ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, * that have 0 readers/writers. * * If flags & LCF_LOCAL, throw the locks away without trying - * to notify the server. */ + * to notify the server. + */ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, const struct ldlm_res_id *res_id, enum ldlm_cancel_flags flags, void *opaque) @@ -1811,13 +1811,10 @@ int ldlm_resource_iterate(struct ldlm_namespace *ns, struct ldlm_resource *res; int rc; - if (!ns) { - CERROR("must pass in namespace\n"); - LBUG(); - } + LASSERTF(ns, "must pass in namespace\n"); res = ldlm_resource_get(ns, NULL, res_id, 0, 0); - if (!res) + if (IS_ERR(res)) return 0; LDLM_RESOURCE_ADDREF(res); @@ -1843,7 +1840,7 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) * bug 17614: locks being actively cancelled. Get a reference * on a lock so that it does not disappear under us (e.g. due to cancel) */ - if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) { + if (!(lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCELING))) { list_add(&lock->l_pending_chain, list); LDLM_LOCK_GET(lock); } @@ -2013,7 +2010,7 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) LCF_LOCAL, LDLM_CANCEL_NO_WAIT); CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", - canceled, ldlm_ns_name(ns)); + canceled, ldlm_ns_name(ns)); } int ldlm_replay_locks(struct obd_import *imp) diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c index 51a28d96af39..a09c25aea698 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c @@ -449,8 +449,8 @@ static unsigned ldlm_res_hop_hash(struct cfs_hash *hs, const void *key, unsigned mask) { const struct ldlm_res_id *id = key; - unsigned val = 0; - unsigned i; + unsigned int val = 0; + unsigned int i; for (i = 0; i < RES_NAME_SIZE; i++) val += id->name[i]; @@ -561,9 +561,9 @@ static struct cfs_hash_ops ldlm_ns_fid_hash_ops = { struct ldlm_ns_hash_def { enum ldlm_ns_type nsd_type; /** hash bucket bits */ - unsigned nsd_bkt_bits; + unsigned int nsd_bkt_bits; /** hash bits */ - unsigned nsd_all_bits; + unsigned int nsd_all_bits; /** hash operations */ struct cfs_hash_ops *nsd_hops; }; @@ -758,8 +758,7 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, */ lock_res(res); list_for_each(tmp, q) { - lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + lock = list_entry(tmp, struct ldlm_lock, l_res_link); if (ldlm_is_cleaned(lock)) { lock = NULL; continue; @@ -793,8 +792,14 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, */ unlock_res(res); LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_flags & LDLM_FL_FAIL_LOC) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(4)); + set_current_state(TASK_RUNNING); + } if (lock->l_completion_ast) - lock->l_completion_ast(lock, 0, NULL); + lock->l_completion_ast(lock, LDLM_FL_FAILED, + NULL); LDLM_LOCK_RELEASE(lock); continue; } @@ -875,7 +880,8 @@ static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); force_wait: if (force) - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + lwi = LWI_TIMEOUT(msecs_to_jiffies(obd_timeout * + MSEC_PER_SEC) / 4, NULL, NULL); rc = l_wait_event(ns->ns_waitq, atomic_read(&ns->ns_bref) == 0, &lwi); @@ -1082,10 +1088,11 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, int create) { struct hlist_node *hnode; - struct ldlm_resource *res; + struct ldlm_resource *res = NULL; struct cfs_hash_bd bd; __u64 version; int ns_refcount = 0; + int rc; LASSERT(!parent); LASSERT(ns->ns_rs_hash); @@ -1095,31 +1102,20 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); if (hnode) { cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - /* Synchronize with regard to resource creation. */ - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { - mutex_lock(&res->lr_lvb_mutex); - mutex_unlock(&res->lr_lvb_mutex); - } - - if (unlikely(res->lr_lvb_len < 0)) { - ldlm_resource_putref(res); - res = NULL; - } - return res; + goto lvbo_init; } version = cfs_hash_bd_version_get(&bd); cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); if (create == 0) - return NULL; + return ERR_PTR(-ENOENT); LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, "type: %d\n", type); res = ldlm_resource_new(); if (!res) - return NULL; + return ERR_PTR(-ENOMEM); res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); res->lr_name = *name; @@ -1137,7 +1133,7 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, /* We have taken lr_lvb_mutex. Drop it. */ mutex_unlock(&res->lr_lvb_mutex); kmem_cache_free(ldlm_resource_slab, res); - +lvbo_init: res = hlist_entry(hnode, struct ldlm_resource, lr_hash); /* Synchronize with regard to resource creation. */ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { @@ -1146,8 +1142,9 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, } if (unlikely(res->lr_lvb_len < 0)) { + rc = res->lr_lvb_len; ldlm_resource_putref(res); - res = NULL; + res = ERR_PTR(rc); } return res; } @@ -1158,8 +1155,6 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { - int rc; - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); rc = ns->ns_lvbo->lvbo_init(res); if (rc < 0) { @@ -1169,7 +1164,7 @@ ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, res->lr_lvb_len = rc; mutex_unlock(&res->lr_lvb_mutex); ldlm_resource_putref(res); - return NULL; + return ERR_PTR(rc); } } @@ -1386,7 +1381,7 @@ void ldlm_resource_dump(int level, struct ldlm_resource *res) if (!list_empty(&res->lr_granted)) { CDEBUG(level, "Granted locks (in reverse order):\n"); list_for_each_entry_reverse(lock, &res->lr_granted, - l_res_link) { + l_res_link) { LDLM_DEBUG_LIMIT(level, lock, "###"); if (!(level & D_CANTMASK) && ++granted > ldlm_dump_granted_max) { diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile index 2cbb1b80bd41..1ac0940bd8df 100644 --- a/drivers/staging/lustre/lustre/llite/Makefile +++ b/drivers/staging/lustre/lustre/llite/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_LUSTRE_FS) += lustre.o lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \ - rw.o namei.o symlink.o llite_mmap.o \ + rw.o namei.o symlink.o llite_mmap.o range_lock.o \ xattr.o xattr_cache.o rw26.o super25.o statahead.o \ glimpse.o lcommon_cl.o lcommon_misc.o \ vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o vvp_req.o \ diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c index 463b1a360733..0e45d8fc4d7c 100644 --- a/drivers/staging/lustre/lustre/llite/dcache.c +++ b/drivers/staging/lustre/lustre/llite/dcache.c @@ -37,7 +37,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "../include/lustre/lustre_idl.h" #include "../include/lustre_dlm.h" @@ -102,39 +101,6 @@ static int ll_dcompare(const struct dentry *dentry, return 0; } -static inline int return_if_equal(struct ldlm_lock *lock, void *data) -{ - return (ldlm_is_canceling(lock) && ldlm_is_discard_data(lock)) ? - LDLM_ITER_CONTINUE : LDLM_ITER_STOP; -} - -/* find any ldlm lock of the inode in mdc and lov - * return 0 not find - * 1 find one - * < 0 error - */ -static int find_cbdata(struct inode *inode) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lov_stripe_md *lsm; - int rc = 0; - - LASSERT(inode); - rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode), - return_if_equal, NULL); - if (rc != 0) - return rc; - - lsm = ccc_inode_lsm_get(inode); - if (!lsm) - return rc; - - rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL); - ccc_inode_lsm_put(inode, lsm); - - return rc; -} - /** * Called when last reference to a dentry is dropped and dcache wants to know * whether or not it should cache it: @@ -155,19 +121,6 @@ static int ll_ddelete(const struct dentry *de) /* kernel >= 2.6.38 last refcount is decreased after this function. */ LASSERT(d_count(de) == 1); - /* Disable this piece of code temporarily because this is called - * inside dcache_lock so it's not appropriate to do lots of work - * here. ATTENTION: Before this piece of code enabling, LU-2487 must be - * resolved. - */ -#if 0 - /* if not ldlm lock for this inode, set i_nlink to 0 so that - * this inode can be recycled later b=20433 - */ - if (d_really_is_positive(de) && !find_cbdata(d_inode(de))) - clear_nlink(d_inode(de)); -#endif - if (d_lustre_invalid((struct dentry *)de)) return 1; return 0; @@ -325,14 +278,13 @@ static int ll_revalidate_dentry(struct dentry *dentry, if (lookup_flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) return 1; - if (d_need_statahead(dir, dentry) <= 0) + if (!dentry_may_statahead(dir, dentry)) return 1; if (lookup_flags & LOOKUP_RCU) return -ECHILD; - do_statahead_enter(dir, &dentry, !d_inode(dentry)); - ll_statahead_mark(dir, dentry); + ll_statahead(dir, &dentry, !d_inode(dentry)); return 1; } @@ -347,18 +299,9 @@ static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) return ll_revalidate_dentry(dentry, flags); } -static void ll_d_iput(struct dentry *de, struct inode *inode) -{ - LASSERT(inode); - if (!find_cbdata(inode)) - clear_nlink(inode); - iput(inode); -} - const struct dentry_operations ll_d_ops = { .d_revalidate = ll_revalidate_nd, .d_release = ll_release, .d_delete = ll_ddelete, - .d_iput = ll_d_iput, .d_compare = ll_dcompare, }; diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c index 5b381779c827..7f32a539d260 100644 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ b/drivers/staging/lustre/lustre/llite/dir.c @@ -46,9 +46,8 @@ #include "../include/obd_support.h" #include "../include/obd_class.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_lib.h" -#include "../include/lustre/lustre_idl.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "../include/lustre_fid.h" #include "../include/lustre_kernelcomm.h" @@ -134,111 +133,35 @@ * for this integrated page will be adjusted. See lmv_adjust_dirpages(). * */ - -/* returns the page unlocked, but with a reference */ -static int ll_dir_filler(void *_hash, struct page *page0) +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset) { - struct inode *inode = page0->mapping->host; - int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH; - struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp; - struct ptlrpc_request *request; - struct mdt_body *body; - struct md_op_data *op_data; - __u64 hash = *((__u64 *)_hash); - struct page **page_pool; + struct md_callback cb_op; struct page *page; - struct lu_dirpage *dp; - int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_SHIFT; - int nrdpgs = 0; /* number of pages read actually */ - int npages; - int i; int rc; - CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) hash %llu\n", - PFID(ll_inode2fid(inode)), inode, hash); - - LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES); - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS); - if (page_pool) { - page_pool[0] = page0; - } else { - page_pool = &page0; - max_pages = 1; - } - for (npages = 1; npages < max_pages; npages++) { - page = page_cache_alloc_cold(inode->i_mapping); - if (!page) - break; - page_pool[npages] = page; - } - - op_data->op_npages = npages; - op_data->op_offset = hash; - rc = md_readpage(exp, op_data, page_pool, &request); - ll_finish_md_op_data(op_data); - if (rc < 0) { - /* page0 is special, which was added into page cache early */ - delete_from_page_cache(page0); - } else if (rc == 0) { - body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); - /* Checked by mdc_readpage() */ - if (body->valid & OBD_MD_FLSIZE) - i_size_write(inode, body->size); - - nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_SIZE-1) - >> PAGE_SHIFT; - SetPageUptodate(page0); - } - unlock_page(page0); - ptlrpc_req_finished(request); - - CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages); - - for (i = 1; i < npages; i++) { - unsigned long offset; - int ret; - - page = page_pool[i]; - - if (rc < 0 || i >= nrdpgs) { - put_page(page); - continue; - } - - SetPageUptodate(page); - - dp = kmap(page); - hash = le64_to_cpu(dp->ldp_hash_start); - kunmap(page); - - offset = hash_x_index(hash, hash64); - - prefetchw(&page->flags); - ret = add_to_page_cache_lru(page, inode->i_mapping, offset, - GFP_NOFS); - if (ret == 0) { - unlock_page(page); - } else { - CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n", - offset, ret); - } - put_page(page); - } + cb_op.md_blocking_ast = ll_md_blocking_ast; + rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); + if (rc) + return ERR_PTR(rc); - if (page_pool != &page0) - kfree(page_pool); - return rc; + return page; } -void ll_release_page(struct page *page, int remove) +void ll_release_page(struct inode *inode, struct page *page, bool remove) { kunmap(page); + + /* + * Always remove the page for striped dir, because the page is + * built from temporarily in LMV layer + */ + if (inode && S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md) { + __free_page(page); + return; + } + if (remove) { lock_page(page); if (likely(page->mapping)) @@ -248,225 +171,6 @@ void ll_release_page(struct page *page, int remove) put_page(page); } -/* - * Find, kmap and return page that contains given hash. - */ -static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash, - __u64 *start, __u64 *end) -{ - int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; - struct address_space *mapping = dir->i_mapping; - /* - * Complement of hash is used as an index so that - * radix_tree_gang_lookup() can be used to find a page with starting - * hash _smaller_ than one we are looking for. - */ - unsigned long offset = hash_x_index(*hash, hash64); - struct page *page; - int found; - - spin_lock_irq(&mapping->tree_lock); - found = radix_tree_gang_lookup(&mapping->page_tree, - (void **)&page, offset, 1); - if (found > 0 && !radix_tree_exceptional_entry(page)) { - struct lu_dirpage *dp; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - /* - * In contrast to find_lock_page() we are sure that directory - * page cannot be truncated (while DLM lock is held) and, - * hence, can avoid restart. - * - * In fact, page cannot be locked here at all, because - * ll_dir_filler() does synchronous io. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) { - dp = kmap(page); - if (BITS_PER_LONG == 32 && hash64) { - *start = le64_to_cpu(dp->ldp_hash_start) >> 32; - *end = le64_to_cpu(dp->ldp_hash_end) >> 32; - *hash = *hash >> 32; - } else { - *start = le64_to_cpu(dp->ldp_hash_start); - *end = le64_to_cpu(dp->ldp_hash_end); - } - LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", - *start, *end, *hash); - CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n", - offset, *start, *end, *hash); - if (*hash > *end) { - ll_release_page(page, 0); - page = NULL; - } else if (*end != *start && *hash == *end) { - /* - * upon hash collision, remove this page, - * otherwise put page reference, and - * ll_get_dir_page() will issue RPC to fetch - * the page we want. - */ - ll_release_page(page, - le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - page = NULL; - } - } else { - put_page(page); - page = ERR_PTR(-EIO); - } - - } else { - spin_unlock_irq(&mapping->tree_lock); - page = NULL; - } - return page; -} - -struct page *ll_get_dir_page(struct inode *dir, __u64 hash, - struct ll_dir_chain *chain) -{ - ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; - struct address_space *mapping = dir->i_mapping; - struct lustre_handle lockh; - struct lu_dirpage *dp; - struct page *page; - enum ldlm_mode mode; - int rc; - __u64 start = 0; - __u64 end = 0; - __u64 lhash = hash; - struct ll_inode_info *lli = ll_i2info(dir); - int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; - - mode = LCK_PR; - rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED, - ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh); - if (!rc) { - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_IBITS, - .ei_mode = mode, - .ei_cb_bl = ll_md_blocking_ast, - .ei_cb_cp = ldlm_completion_ast, - }; - struct lookup_intent it = { .it_op = IT_READDIR }; - struct ptlrpc_request *request; - struct md_op_data *op_data; - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return (void *)op_data; - - rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it, - op_data, &lockh, NULL, 0, NULL, 0); - - ll_finish_md_op_data(op_data); - - request = (struct ptlrpc_request *)it.it_request; - if (request) - ptlrpc_req_finished(request); - if (rc < 0) { - CERROR("lock enqueue: " DFID " at %llu: rc %d\n", - PFID(ll_inode2fid(dir)), hash, rc); - return ERR_PTR(rc); - } - - CDEBUG(D_INODE, "setting lr_lvb_inode to inode "DFID"(%p)\n", - PFID(ll_inode2fid(dir)), dir); - md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, - &it.it_lock_handle, dir, NULL); - } else { - /* for cross-ref object, l_ast_data of the lock may not be set, - * we reset it here - */ - md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie, - dir, NULL); - } - ldlm_lock_dump_handle(D_OTHER, &lockh); - - mutex_lock(&lli->lli_readdir_mutex); - page = ll_dir_page_locate(dir, &lhash, &start, &end); - if (IS_ERR(page)) { - CERROR("dir page locate: "DFID" at %llu: rc %ld\n", - PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page)); - goto out_unlock; - } else if (page) { - /* - * XXX nikita: not entirely correct handling of a corner case: - * suppose hash chain of entries with hash value HASH crosses - * border between pages P0 and P1. First both P0 and P1 are - * cached, seekdir() is called for some entry from the P0 part - * of the chain. Later P0 goes out of cache. telldir(HASH) - * happens and finds P1, as it starts with matching hash - * value. Remaining entries from P0 part of the chain are - * skipped. (Is that really a bug?) - * - * Possible solutions: 0. don't cache P1 is such case, handle - * it as an "overflow" page. 1. invalidate all pages at - * once. 2. use HASH|1 as an index for P1. - */ - goto hash_collision; - } - - page = read_cache_page(mapping, hash_x_index(hash, hash64), - ll_dir_filler, &lhash); - if (IS_ERR(page)) { - CERROR("read cache page: "DFID" at %llu: rc %ld\n", - PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); - goto out_unlock; - } - - wait_on_page_locked(page); - (void)kmap(page); - if (!PageUptodate(page)) { - CERROR("page not updated: "DFID" at %llu: rc %d\n", - PFID(ll_inode2fid(dir)), hash, -5); - goto fail; - } - if (!PageChecked(page)) - /* XXX: check page format later */ - SetPageChecked(page); - if (PageError(page)) { - CERROR("page error: "DFID" at %llu: rc %d\n", - PFID(ll_inode2fid(dir)), hash, -5); - goto fail; - } -hash_collision: - dp = page_address(page); - if (BITS_PER_LONG == 32 && hash64) { - start = le64_to_cpu(dp->ldp_hash_start) >> 32; - end = le64_to_cpu(dp->ldp_hash_end) >> 32; - lhash = hash >> 32; - } else { - start = le64_to_cpu(dp->ldp_hash_start); - end = le64_to_cpu(dp->ldp_hash_end); - lhash = hash; - } - if (end == start) { - LASSERT(start == lhash); - CWARN("Page-wide hash collision: %llu\n", end); - if (BITS_PER_LONG == 32 && hash64) - CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n", - le64_to_cpu(dp->ldp_hash_start), - le64_to_cpu(dp->ldp_hash_end), hash); - /* - * Fetch whole overflow chain... - * - * XXX not yet. - */ - goto fail; - } -out_unlock: - mutex_unlock(&lli->lli_readdir_mutex); - ldlm_lock_decref(&lockh, mode); - return page; - -fail: - ll_release_page(page, 1); - page = ERR_PTR(-EIO); - goto out_unlock; -} - /** * return IF_* type for given lu_dirent entry. * IF_* flag shld be converted to particular OS file type in @@ -489,119 +193,100 @@ static __u16 ll_dirent_type_get(struct lu_dirent *ent) return type; } -int ll_dir_read(struct inode *inode, struct dir_context *ctx) +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx) { - struct ll_inode_info *info = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); - __u64 pos = ctx->pos; - int api32 = ll_need_32bit_api(sbi); - int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + __u64 pos = *ppos; + int is_api32 = ll_need_32bit_api(sbi); + int is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; struct page *page; - struct ll_dir_chain chain; - int done = 0; + bool done = false; int rc = 0; - ll_dir_chain_init(&chain); - - page = ll_get_dir_page(inode, pos, &chain); + page = ll_get_dir_page(inode, op_data, pos); while (rc == 0 && !done) { struct lu_dirpage *dp; struct lu_dirent *ent; + __u64 hash; + __u64 next; - if (!IS_ERR(page)) { - /* - * If page is empty (end of directory is reached), - * use this value. - */ - __u64 hash = MDS_DIR_END_OFF; - __u64 next; - - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent && !done; - ent = lu_dirent_next(ent)) { - __u16 type; - int namelen; - struct lu_fid fid; - __u64 lhash; - __u64 ino; + if (IS_ERR(page)) { + rc = PTR_ERR(page); + break; + } + hash = MDS_DIR_END_OFF; + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) /* - * XXX: implement correct swabbing here. + * Skip until we find target hash + * value. */ + continue; - hash = le64_to_cpu(ent->lde_hash); - if (hash < pos) - /* - * Skip until we find target hash - * value. - */ - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (namelen == 0) - /* - * Skip dummy record. - */ - continue; - - if (api32 && hash64) - lhash = hash >> 32; - else - lhash = hash; - fid_le_to_cpu(&fid, &ent->lde_fid); - ino = cl_fid_build_ino(&fid, api32); - type = ll_dirent_type_get(ent); - ctx->pos = lhash; - /* For 'll_nfs_get_name_filldir()', it will try - * to access the 'ent' through its 'lde_name', - * so the parameter 'name' for 'ctx->actor()' - * must be part of the 'ent'. + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) + /* + * Skip dummy record. */ - done = !dir_emit(ctx, ent->lde_name, - namelen, ino, type); - } - next = le64_to_cpu(dp->ldp_hash_end); - if (!done) { - pos = next; - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - done = 1; - ll_release_page(page, 0); - } else if (1 /* chain is exhausted*/) { - /* - * Normal case: continue to the next - * page. - */ - ll_release_page(page, - le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - next = pos; - page = ll_get_dir_page(inode, pos, - &chain); - } else { - /* - * go into overflow page. - */ - LASSERT(le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - ll_release_page(page, 1); - } - } else { - pos = hash; - ll_release_page(page, 0); - } + continue; + + if (is_api32 && is_hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, is_api32); + type = ll_dirent_type_get(ent); + ctx->pos = lhash; + /* For 'll_nfs_get_name_filldir()', it will try + * to access the 'ent' through its 'lde_name', + * so the parameter 'name' for 'ctx->actor()' + * must be part of the 'ent'. + */ + done = !dir_emit(ctx, ent->lde_name, + namelen, ino, type); + } + + if (done) { + pos = hash; + ll_release_page(inode, page, false); + break; + } + + next = le64_to_cpu(dp->ldp_hash_end); + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(inode, page, false); } else { - rc = PTR_ERR(page); - CERROR("error reading dir "DFID" at %lu: rc %d\n", - PFID(&info->lli_fid), (unsigned long)pos, rc); + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(inode, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, op_data, pos); } } ctx->pos = pos; - ll_dir_chain_fini(&chain); return rc; } @@ -613,9 +298,10 @@ static int ll_readdir(struct file *filp, struct dir_context *ctx) __u64 pos = lfd ? lfd->lfd_pos : 0; int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; int api32 = ll_need_32bit_api(sbi); + struct md_op_data *op_data; int rc; - CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos %lu/%llu 32bit_api %d\n", + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos/size %lu/%llu 32bit_api %d\n", PFID(ll_inode2fid(inode)), inode, (unsigned long)pos, i_size_read(inode), api32); @@ -627,19 +313,58 @@ static int ll_readdir(struct file *filp, struct dir_context *ctx) goto out; } + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, + LUSTRE_OPC_ANY, inode); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out; + } + + if (unlikely(op_data->op_mea1)) { + /* + * This is only needed for striped dir to fill .., + * see lmv_read_page + */ + if (file_dentry(filp)->d_parent && + file_dentry(filp)->d_parent->d_inode) { + __u64 ibits = MDS_INODELOCK_UPDATE; + struct inode *parent; + + parent = file_dentry(filp)->d_parent->d_inode; + if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) + op_data->op_fid3 = *ll_inode2fid(parent); + } + + /* + * If it can not find in cache, do lookup .. on the master + * object + */ + if (fid_is_zero(&op_data->op_fid3)) { + rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); + if (rc) { + ll_finish_md_op_data(op_data); + return rc; + } + } + } + op_data->op_max_pages = sbi->ll_md_brw_pages; ctx->pos = pos; - rc = ll_dir_read(inode, ctx); + rc = ll_dir_read(inode, &pos, op_data, ctx); + pos = ctx->pos; if (lfd) - lfd->lfd_pos = ctx->pos; - if (ctx->pos == MDS_DIR_END_OFF) { + lfd->lfd_pos = pos; + + if (pos == MDS_DIR_END_OFF) { if (api32) - ctx->pos = LL_DIR_END_OFF_32BIT; + pos = LL_DIR_END_OFF_32BIT; else - ctx->pos = LL_DIR_END_OFF; + pos = LL_DIR_END_OFF; } else { if (api32 && hash64) - ctx->pos >>= 32; + pos >>= 32; } + ctx->pos = pos; + ll_finish_md_op_data(op_data); filp->f_version = inode->i_version; out: @@ -668,18 +393,40 @@ static int ll_send_mgc_param(struct obd_export *mgc, char *string) return rc; } -static int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump, - char *filename) +/** + * Create striped directory with specified stripe(@lump) + * + * param[in] parent the parent of the directory. + * param[in] lump the specified stripes. + * param[in] dirname the name of the directory. + * param[in] mode the specified mode of the directory. + * + * retval =0 if striped directory is being created successfully. + * <0 if the creation is failed. + */ +static int ll_dir_setdirstripe(struct inode *parent, struct lmv_user_md *lump, + const char *dirname, umode_t mode) { struct ptlrpc_request *request = NULL; struct md_op_data *op_data; - struct ll_sb_info *sbi = ll_i2sbi(dir); - int mode; + struct ll_sb_info *sbi = ll_i2sbi(parent); int err; - mode = (~current_umask() & 0755) | S_IFDIR; - op_data = ll_prep_md_op_data(NULL, dir, NULL, filename, - strlen(filename), mode, LUSTRE_OPC_MKDIR, + if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) + return -EINVAL; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n", + PFID(ll_inode2fid(parent)), parent, dirname, + (int)lump->lum_stripe_offset, lump->lum_stripe_count); + + if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md(lump); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, + strlen(dirname), mode, LUSTRE_OPC_MKDIR, lump); if (IS_ERR(op_data)) { err = PTR_ERR(op_data); @@ -730,6 +477,13 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, lum_size = sizeof(struct lov_user_md_v3); break; } + case LMV_USER_MAGIC: { + if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) + lustre_swab_lmv_user_md( + (struct lmv_user_md *)lump); + lum_size = sizeof(struct lmv_user_md); + break; + } default: { CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", lump->lmm_magic, LOV_USER_MAGIC_V1, @@ -746,9 +500,6 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, if (IS_ERR(op_data)) return PTR_ERR(op_data); - if (lump && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC)) - op_data->op_cli_flags |= CLI_SET_MEA; - /* swabbing is done in lov_setstripe() on server side */ rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, NULL, 0, &req, NULL); @@ -803,8 +554,16 @@ end: return rc; } -int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, - int *lmm_size, struct ptlrpc_request **request) +/** + * This function will be used to get default LOV/LMV/Default LMV + * @valid will be used to indicate which stripe it will retrieve + * OBD_MD_MEA LMV stripe EA + * OBD_MD_DEFAULT_MEA Default LMV stripe EA + * otherwise Default LOV EA. + * Each time, it can only retrieve 1 stripe EA + **/ +int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, + struct ptlrpc_request **request, u64 valid) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct mdt_body *body; @@ -813,7 +572,7 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, int rc, lmmsize; struct md_op_data *op_data; - rc = ll_get_default_mdsize(sbi, &lmmsize); + rc = ll_get_max_mdsize(sbi, &lmmsize); if (rc) return rc; @@ -834,9 +593,9 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - lmmsize = body->eadatasize; + lmmsize = body->mbo_eadatasize; - if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || lmmsize == 0) { rc = -ENODATA; goto out; @@ -844,6 +603,7 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm); /* * This is coming from the MDS, so is probably in @@ -860,40 +620,51 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); break; + case LMV_MAGIC_V1: + if (cpu_to_le32(LMV_MAGIC) != LMV_MAGIC) + lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); + break; + case LMV_USER_MAGIC: + if (cpu_to_le32(LMV_USER_MAGIC) != LMV_USER_MAGIC) + lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); + break; default: CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); rc = -EPROTO; } out: - *lmmp = lmm; - *lmm_size = lmmsize; + *plmm = lmm; + *plmm_size = lmmsize; *request = req; return rc; } -/* - * Get MDT index for the inode. - */ -int ll_get_mdt_idx(struct inode *inode) +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) { - struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data; - int rc, mdtidx; + int mdt_index, rc; - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return -ENOMEM; op_data->op_flags |= MF_GET_MDT_IDX; + op_data->op_fid1 = *fid; rc = md_getattr(sbi->ll_md_exp, op_data, NULL); - mdtidx = op_data->op_mds; - ll_finish_md_op_data(op_data); - if (rc < 0) { - CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + mdt_index = op_data->op_mds; + kvfree(op_data); + if (rc < 0) return rc; - } - return mdtidx; + + return mdt_index; +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); } /** @@ -1288,11 +1059,9 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } case IOC_MDC_LOOKUP: { - struct ptlrpc_request *request = NULL; int namelen, len = 0; char *buf = NULL; char *filename; - struct md_op_data *op_data; rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); if (rc) @@ -1308,21 +1077,13 @@ static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) goto out_free; } - op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out_free; - } - - op_data->op_valid = OBD_MD_FLID; - rc = md_getattr_name(sbi->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); + rc = ll_get_fid_by_name(inode, filename, namelen, NULL); if (rc < 0) { - CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + CERROR("%s: lookup %.*s failed: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), namelen, + filename, rc); goto out_free; } - ptlrpc_req_finished(request); out_free: obd_ioctl_freedata(buf, len); return rc; @@ -1333,6 +1094,7 @@ out_free: char *filename; int namelen = 0; int lumlen = 0; + umode_t mode; int len; int rc; @@ -1366,15 +1128,32 @@ out_free: goto lmv_out_free; } - /** - * ll_dir_setdirstripe will be used to set dir stripe - * mdc_create--->mdt_reint_create (with dirstripe) - */ - rc = ll_dir_setdirstripe(inode, lum, filename); +#if OBD_OCD_VERSION(2, 9, 50, 0) > LUSTRE_VERSION_CODE + mode = data->ioc_type != 0 ? data->ioc_type : S_IRWXUGO; +#else + mode = data->ioc_type; +#endif + rc = ll_dir_setdirstripe(inode, lum, filename, mode); lmv_out_free: obd_ioctl_freedata(buf, len); return rc; } + case LL_IOC_LMV_SET_DEFAULT_STRIPE: { + struct lmv_user_md __user *ulump; + struct lmv_user_md lum; + int rc; + + ulump = (struct lmv_user_md __user *)arg; + if (copy_from_user(&lum, ulump, sizeof(lum))) + return -EFAULT; + + if (lum.lum_magic != LMV_USER_MAGIC) + return -EINVAL; + + rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); + + return rc; + } case LL_IOC_LOV_SETSTRIPE: { struct lov_user_md_v3 lumv3; struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; @@ -1404,50 +1183,100 @@ lmv_out_free: return rc; } case LL_IOC_LMV_GETSTRIPE: { - struct lmv_user_md __user *lump = (void __user *)arg; + struct lmv_user_md __user *ulmv; struct lmv_user_md lum; - struct lmv_user_md *tmp; + struct ptlrpc_request *request = NULL; + struct lmv_user_md *tmp = NULL; + union lmv_mds_md *lmm = NULL; + u64 valid = 0; + int stripe_count; + int mdt_index; int lum_size; - int rc = 0; - int mdtindex; + int lmmsize; + int rc; + int i; - if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md))) + ulmv = (struct lmv_user_md __user *)arg; + if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) return -EFAULT; - if (lum.lum_magic != LMV_MAGIC_V1) + /* + * lum_magic will indicate which stripe the ioctl will like + * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC + * is for default LMV stripe + */ + if (lum.lum_magic == LMV_MAGIC_V1) + valid |= OBD_MD_MEA; + else if (lum.lum_magic == LMV_USER_MAGIC) + valid |= OBD_MD_DEFAULT_MEA; + else return -EINVAL; - lum_size = lmv_user_md_size(1, LMV_MAGIC_V1); + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, + valid); + if (rc) + goto finish_req; + + /* Get default LMV EA */ + if (lum.lum_magic == LMV_USER_MAGIC) { + if (rc) + goto finish_req; + + if (lmmsize > sizeof(*ulmv)) { + rc = -EINVAL; + goto finish_req; + } + + if (copy_to_user(ulmv, lmm, lmmsize)) + rc = -EFAULT; + + goto finish_req; + } + + stripe_count = lmv_mds_md_stripe_count_get(lmm); + lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); tmp = kzalloc(lum_size, GFP_NOFS); if (!tmp) { rc = -ENOMEM; - goto free_lmv; + goto finish_req; } - *tmp = lum; - tmp->lum_type = LMV_STRIPE_TYPE; - tmp->lum_stripe_count = 1; - mdtindex = ll_get_mdt_idx(inode); - if (mdtindex < 0) { + mdt_index = ll_get_mdt_idx(inode); + if (mdt_index < 0) { rc = -ENOMEM; - goto free_lmv; + goto out_tmp; + } + tmp->lum_magic = LMV_MAGIC_V1; + tmp->lum_stripe_count = 0; + tmp->lum_stripe_offset = mdt_index; + for (i = 0; i < stripe_count; i++) { + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); + mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); + if (mdt_index < 0) { + rc = mdt_index; + goto out_tmp; + } + tmp->lum_objects[i].lum_mds = mdt_index; + tmp->lum_objects[i].lum_fid = fid; + tmp->lum_stripe_count++; } - tmp->lum_stripe_offset = mdtindex; - tmp->lum_objects[0].lum_mds = mdtindex; - memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode), - sizeof(struct lu_fid)); - if (copy_to_user((void __user *)arg, tmp, lum_size)) { + if (copy_to_user(ulmv, tmp, lum_size)) { rc = -EFAULT; - goto free_lmv; + goto out_tmp; } -free_lmv: +out_tmp: kfree(tmp); +finish_req: + ptlrpc_req_finished(request); return rc; } + case LL_IOC_LOV_SWAP_LAYOUTS: return -EPERM; - case LL_IOC_OBD_STATFS: + case IOC_OBD_STATFS: return ll_obd_statfs(inode, (void __user *)arg); case LL_IOC_LOV_GETSTRIPE: case LL_IOC_MDC_GETINFO: @@ -1469,7 +1298,8 @@ free_lmv: rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, &lmmsize, &request); } else { - rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, + &request, 0); } if (request) { @@ -1512,18 +1342,18 @@ skip_lmm: lstat_t st = { 0 }; st.st_dev = inode->i_sb->s_dev; - st.st_mode = body->mode; - st.st_nlink = body->nlink; - st.st_uid = body->uid; - st.st_gid = body->gid; - st.st_rdev = body->rdev; - st.st_size = body->size; + st.st_mode = body->mbo_mode; + st.st_nlink = body->mbo_nlink; + st.st_uid = body->mbo_uid; + st.st_gid = body->mbo_gid; + st.st_rdev = body->mbo_rdev; + st.st_size = body->mbo_size; st.st_blksize = PAGE_SIZE; - st.st_blocks = body->blocks; - st.st_atime = body->atime; - st.st_mtime = body->mtime; - st.st_ctime = body->ctime; - st.st_ino = cl_fid_build_ino(&body->fid1, + st.st_blocks = body->mbo_blocks; + st.st_atime = body->mbo_atime; + st.st_mtime = body->mbo_mtime; + st.st_ctime = body->mbo_ctime; + st.st_ino = cl_fid_build_ino(&body->mbo_fid1, sbi->ll_flags & LL_SBI_32BIT_API); @@ -1611,9 +1441,6 @@ free_lmm: kvfree(lmm); return rc; } - case OBD_IOC_LLOG_CATINFO: { - return -EOPNOTSUPP; - } case OBD_IOC_QUOTACHECK: { struct obd_quotactl *oqctl; int error = 0; @@ -1671,7 +1498,7 @@ out_poll: kfree(check); return rc; } - case LL_IOC_QUOTACTL: { + case OBD_IOC_QUOTACTL: { struct if_quotactl *qctl; qctl = kzalloc(sizeof(*qctl), GFP_NOFS); @@ -1739,6 +1566,25 @@ out_quotactl: return rc; case OBD_IOC_FID2PATH: return ll_fid2path(inode, (void __user *)arg); + case LL_IOC_GETPARENT: + return ll_getparent(file, (void __user *)arg); + case LL_IOC_FID2MDTIDX: { + struct obd_export *exp = ll_i2mdexp(inode); + struct lu_fid fid; + __u32 index; + + if (copy_from_user(&fid, (const struct lu_fid __user *)arg, + sizeof(fid))) + return -EFAULT; + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, + &index); + if (rc) + return rc; + + return index; + } case LL_IOC_HSM_REQUEST: { struct hsm_user_request *hur; ssize_t totalsize; @@ -1853,6 +1699,45 @@ out_quotactl: kfree(copy); return rc; } + case LL_IOC_MIGRATE: { + char *buf = NULL; + const char *filename; + int namelen = 0; + int len; + int rc; + int mdtidx; + + rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); + if (rc < 0) + return rc; + + data = (struct obd_ioctl_data *)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_inllen1 || !data->ioc_inllen2) { + rc = -EINVAL; + goto migrate_free; + } + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + if (namelen < 1 || namelen != strlen(filename) + 1) { + rc = -EINVAL; + goto migrate_free; + } + + if (data->ioc_inllen2 != sizeof(mdtidx)) { + rc = -EINVAL; + goto migrate_free; + } + mdtidx = *(int *)data->ioc_inlbuf2; + + rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); +migrate_free: + obd_ioctl_freedata(buf, len); + + return rc; + } + default: return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void __user *)arg); diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 57281b9e31ff..6e3a188baaae 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -38,14 +38,15 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/lustre_dlm.h" -#include "../include/lustre_lite.h" #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/sched.h> #include <linux/mount.h> -#include "llite_internal.h" #include "../include/lustre/ll_fiemap.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/cl_object.h" +#include "llite_internal.h" static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -188,17 +189,11 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp, spin_unlock(&lli->lli_lock); } - if (rc == 0) { - rc = ll_objects_destroy(req, inode); - if (rc) - CERROR("inode %lu ll_objects destroy: rc = %d\n", - inode->i_ino, rc); - } if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) { struct mdt_body *body; body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!(body->valid & OBD_MD_FLRELEASED)) + if (!(body->mbo_valid & OBD_MD_FLRELEASED)) rc = -EBUSY; } @@ -349,13 +344,11 @@ int ll_file_release(struct inode *inode, struct file *file) fd = LUSTRE_FPRIVATE(file); LASSERT(fd); - /* The last ref on @file, maybe not be the owner pid of statahead. - * Different processes can open the same dir, "ll_opendir_key" means: - * it is me that should stop the statahead thread. + /* The last ref on @file, maybe not be the owner pid of statahead, + * because parent and child process can share the same file handle. */ - if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd && - lli->lli_opendir_pid != 0) - ll_stop_statahead(inode, lli->lli_opendir_key); + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); if (is_root_inode(inode)) { LUSTRE_FPRIVATE(file) = NULL; @@ -364,7 +357,8 @@ int ll_file_release(struct inode *inode, struct file *file) } if (!S_ISDIR(inode->i_mode)) { - lov_read_and_clear_async_rc(lli->lli_clob); + if (lli->lli_clob) + lov_read_and_clear_async_rc(lli->lli_clob); lli->lli_async_rc = 0; } @@ -376,55 +370,39 @@ int ll_file_release(struct inode *inode, struct file *file) return rc; } -static int ll_intent_file_open(struct dentry *dentry, void *lmm, - int lmmsize, struct lookup_intent *itp) +static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, + struct lookup_intent *itp) { - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(de); struct ll_sb_info *sbi = ll_i2sbi(inode); - struct dentry *parent = dentry->d_parent; - const char *name = dentry->d_name.name; - const int len = dentry->d_name.len; + struct dentry *parent = de->d_parent; + const char *name = NULL; struct md_op_data *op_data; - struct ptlrpc_request *req; - __u32 opc = LUSTRE_OPC_ANY; - int rc; + struct ptlrpc_request *req = NULL; + int len = 0, rc; - /* Usually we come here only for NFSD, and we want open lock. */ - /* We can also get here if there was cached open handle in revalidate_it - * but it disappeared while we were getting from there to ll_file_open. - * But this means this file was closed and immediately opened which - * makes a good candidate for using OPEN lock - */ - /* If lmmsize & lmm are not 0, we are just setting stripe info - * parameters. No need for the open lock + LASSERT(parent); + LASSERT(itp->it_flags & MDS_OPEN_BY_FID); + + /* + * if server supports open-by-fid, or file name is invalid, don't pack + * name in open request */ - if (!lmm && lmmsize == 0) { - struct ll_dentry_data *ldd = ll_d2d(dentry); - /* - * If we came via ll_iget_for_nfs, then we need to request - * struct ll_dentry_data *ldd = ll_d2d(file->f_dentry); - * - * NB: when ldd is NULL, it must have come via normal - * lookup path only, since ll_iget_for_nfs always calls - * ll_d_init(). - */ - if (ldd && ldd->lld_nfs_dentry) { - ldd->lld_nfs_dentry = 0; - itp->it_flags |= MDS_OPEN_LOCK; - } - if (itp->it_flags & FMODE_WRITE) - opc = LUSTRE_OPC_CREATE; + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && + lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { + name = de->d_name.name; + len = de->d_name.len; } - op_data = ll_prep_md_op_data(NULL, d_inode(parent), - inode, name, len, - O_RDWR, opc, NULL); + op_data = ll_prep_md_op_data(NULL, d_inode(parent), inode, name, len, + O_RDWR, LUSTRE_OPC_ANY, NULL); if (IS_ERR(op_data)) return PTR_ERR(op_data); + op_data->op_data = lmm; + op_data->op_data_size = lmmsize; - itp->it_flags |= MDS_OPEN_BY_FID; - rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp, - 0 /*unused */, &req, ll_md_blocking_ast, 0); + rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, + &ll_md_blocking_ast, 0); ll_finish_md_op_data(op_data); if (rc == -ESTALE) { /* reason for keep own exit path - don`t flood log @@ -479,8 +457,8 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, struct mdt_body *body; body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_fh = body->handle; - och->och_fid = body->fid1; + och->och_fh = body->mbo_handle; + och->och_fid = body->mbo_fid1; och->och_lease_handle.cookie = it->it_lock_handle; och->och_magic = OBD_CLIENT_HANDLE_MAGIC; och->och_flags = it->it_flags; @@ -508,7 +486,7 @@ static int ll_local_open(struct file *file, struct lookup_intent *it, body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - ll_ioepoch_open(lli, body->ioepoch); + ll_ioepoch_open(lli, body->mbo_ioepoch); } LUSTRE_FPRIVATE(file) = fd; @@ -543,7 +521,7 @@ int ll_file_open(struct inode *inode, struct file *file) struct obd_client_handle **och_p = NULL; __u64 *och_usecount = NULL; struct ll_file_data *fd; - int rc = 0, opendir_set = 0; + int rc = 0; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n", PFID(ll_inode2fid(inode)), inode, file->f_flags); @@ -558,16 +536,8 @@ int ll_file_open(struct inode *inode, struct file *file) } fd->fd_file = file; - if (S_ISDIR(inode->i_mode)) { - spin_lock(&lli->lli_sa_lock); - if (!lli->lli_opendir_key && !lli->lli_sai && - lli->lli_opendir_pid == 0) { - lli->lli_opendir_key = fd; - lli->lli_opendir_pid = current_pid(); - opendir_set = 1; - } - spin_unlock(&lli->lli_sa_lock); - } + if (S_ISDIR(inode->i_mode)) + ll_authorize_statahead(inode, fd); if (is_root_inode(inode)) { LUSTRE_FPRIVATE(file) = fd; @@ -615,7 +585,7 @@ restart: } else if (it->it_flags & FMODE_EXEC) { och_p = &lli->lli_mds_exec_och; och_usecount = &lli->lli_open_fd_exec_count; - } else { + } else { och_p = &lli->lli_mds_read_och; och_usecount = &lli->lli_open_fd_read_count; } @@ -652,9 +622,19 @@ restart: * result in a deadlock */ mutex_unlock(&lli->lli_och_mutex); - it->it_create_mode |= M_CHECK_STALE; + /* + * Normally called under two situations: + * 1. NFS export. + * 2. revalidate with IT_OPEN (revalidate doesn't + * execute this intent any more). + * + * Always fetch MDS_OPEN_LOCK if this is not setstripe. + * + * Always specify MDS_OPEN_BY_FID because we don't want + * to get file with different fid. + */ + it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID; rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it); - it->it_create_mode &= ~M_CHECK_STALE; if (rc) goto out_openerr; @@ -716,9 +696,10 @@ out_och_free: mutex_unlock(&lli->lli_och_mutex); out_openerr: - if (opendir_set != 0) - ll_stop_statahead(inode, lli->lli_opendir_key); - ll_file_data_put(fd); + if (lli->lli_opendir_key == fd) + ll_deauthorize_statahead(inode, fd); + if (fd) + ll_file_data_put(fd); } else { ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); } @@ -764,7 +745,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, struct lookup_intent it = { .it_op = IT_OPEN }; struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data; - struct ptlrpc_request *req; + struct ptlrpc_request *req = NULL; struct lustre_handle old_handle = { 0 }; struct obd_client_handle *och = NULL; int rc; @@ -831,8 +812,8 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, it.it_flags = fmode | open_flags; it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE; - rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req, - ll_md_blocking_lease_ast, + rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, + &ll_md_blocking_lease_ast, /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise * it can be cancelled which may mislead applications that the lease is * broken; @@ -840,7 +821,7 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast * doesn't deal with openhandle, so normal openhandle will be leaked. */ - LDLM_FL_NO_LRU | LDLM_FL_EXCL); + LDLM_FL_NO_LRU | LDLM_FL_EXCL); ll_finish_md_op_data(op_data); ptlrpc_req_finished(req); if (rc < 0) @@ -908,7 +889,6 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, { struct ldlm_lock *lock; bool cancelled = true; - int rc; lock = ldlm_handle2lock(&och->och_lease_handle); if (lock) { @@ -926,9 +906,8 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, if (lease_broken) *lease_broken = cancelled; - rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och, - NULL); - return rc; + return ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och, NULL); } /* Fills the obdo with the attributes for the lsm */ @@ -1138,10 +1117,11 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, { struct ll_inode_info *lli = ll_i2info(file_inode(file)); struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct range_lock range; struct cl_io *io; ssize_t result; - CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zd\n", + CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n", file->f_path.dentry->d_name.name, iot, *ppos, count); restart: @@ -1150,7 +1130,12 @@ restart: if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { struct vvp_io *vio = vvp_env_io(env); - int write_mutex_locked = 0; + bool range_locked = false; + + if (file->f_flags & O_APPEND) + range_lock_init(&range, 0, LUSTRE_EOF); + else + range_lock_init(&range, *ppos, *ppos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); vio->vui_io_subtype = args->via_io_subtype; @@ -1159,14 +1144,23 @@ restart: case IO_NORMAL: vio->vui_iter = args->u.normal.via_iter; vio->vui_iocb = args->u.normal.via_iocb; - if ((iot == CIT_WRITE) && + /* + * Direct IO reads must also take range lock, + * or multiple reads will try to work on the same pages + * See LU-6227 for details. + */ + if (((iot == CIT_WRITE) || + (iot == CIT_READ && (file->f_flags & O_DIRECT))) && !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - if (mutex_lock_interruptible(&lli-> - lli_write_mutex)) { - result = -ERESTARTSYS; + CDEBUG(D_VFSTRACE, "Range lock [%llu, %llu]\n", + range.rl_node.in_extent.start, + range.rl_node.in_extent.end); + result = range_lock(&lli->lli_write_tree, + &range); + if (result < 0) goto out; - } - write_mutex_locked = 1; + + range_locked = true; } down_read(&lli->lli_trunc_sem); break; @@ -1183,8 +1177,12 @@ restart: ll_cl_remove(file, env); if (args->via_io_subtype == IO_NORMAL) up_read(&lli->lli_trunc_sem); - if (write_mutex_locked) - mutex_unlock(&lli->lli_write_mutex); + if (range_locked) { + CDEBUG(D_VFSTRACE, "Range unlock [%llu, %llu]\n", + range.rl_node.in_extent.start, + range.rl_node.in_extent.end); + range_unlock(&lli->lli_write_tree, &range); + } } else { /* cl_io_rw_init() handled IO */ result = io->ci_result; @@ -1201,7 +1199,7 @@ out: * short read/write instead of restart io. */ if ((result == 0 || result == -ENODATA) && io->ci_need_restart) { - CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n", + CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zu\n", iot == CIT_READ ? "read" : "write", file, *ppos, count); LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob); @@ -1296,94 +1294,15 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, return result; } -static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx) -{ - struct obd_export *exp = ll_i2dtexp(inode); - struct obd_trans_info oti = { 0 }; - struct obdo *oa = NULL; - int lsm_size; - int rc = 0; - struct lov_stripe_md *lsm = NULL, *lsm2; - - oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!oa) - return -ENOMEM; - - lsm = ccc_inode_lsm_get(inode); - if (!lsm_has_objects(lsm)) { - rc = -ENOENT; - goto out; - } - - lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) * - (lsm->lsm_stripe_count)); - - lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS); - if (!lsm2) { - rc = -ENOMEM; - goto out; - } - - oa->o_oi = *oi; - oa->o_nlink = ost_idx; - oa->o_flags |= OBD_FL_RECREATE_OBJS; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP; - obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); - obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); - memcpy(lsm2, lsm, lsm_size); - ll_inode_size_lock(inode); - rc = obd_create(NULL, exp, oa, &lsm2, &oti); - ll_inode_size_unlock(inode); - - kvfree(lsm2); - goto out; -out: - ccc_inode_lsm_put(inode, lsm); - kmem_cache_free(obdo_cachep, oa); - return rc; -} - -static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg) -{ - struct ll_recreate_obj ucreat; - struct ost_id oi; - - if (!capable(CFS_CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg, - sizeof(ucreat))) - return -EFAULT; - - ostid_set_seq_mdt0(&oi); - ostid_set_id(&oi, ucreat.lrc_id); - return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx); -} - -static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg) -{ - struct lu_fid fid; - struct ost_id oi; - u32 ost_idx; - - if (!capable(CFS_CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid))) - return -EFAULT; - - fid_to_ostid(&fid, &oi); - ost_idx = (fid_seq(&fid) >> 16) & 0xffff; - return ll_lov_recreate(inode, &oi, ost_idx); -} - int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, __u64 flags, struct lov_user_md *lum, int lum_size) { struct lov_stripe_md *lsm = NULL; - struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags}; + struct lookup_intent oit = { + .it_op = IT_OPEN, + .it_flags = flags | MDS_OPEN_BY_FID, + }; int rc = 0; lsm = ccc_inode_lsm_get(inode); @@ -1397,11 +1316,11 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, ll_inode_size_lock(inode); rc = ll_intent_file_open(dentry, lum, lum_size, &oit); - if (rc) + if (rc < 0) goto out_unlock; rc = oit.it_status; if (rc < 0) - goto out_req_free; + goto out_unlock; ll_release_openhandle(inode, &oit); @@ -1411,9 +1330,6 @@ out_unlock: ccc_inode_lsm_put(inode, lsm); out: return rc; -out_req_free: - ptlrpc_req_finished((struct ptlrpc_request *)oit.it_request); - goto out; } int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, @@ -1448,9 +1364,9 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - lmmsize = body->eadatasize; + lmmsize = body->mbo_eadatasize; - if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || lmmsize == 0) { rc = -ENODATA; goto out; @@ -1481,13 +1397,13 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, */ if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - if (S_ISREG(body->mode)) + if (S_ISREG(body->mbo_mode)) lustre_swab_lov_user_md_objects( ((struct lov_user_md_v1 *)lmm)->lmm_objects, stripe_count); } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - if (S_ISREG(body->mode)) + if (S_ISREG(body->mbo_mode)) lustre_swab_lov_user_md_objects( ((struct lov_user_md_v3 *)lmm)->lmm_objects, stripe_count); @@ -1530,55 +1446,48 @@ static int ll_lov_setea(struct inode *inode, struct file *file, return rc; } +static int ll_file_getstripe(struct inode *inode, + struct lov_user_md __user *lum) +{ + struct lu_env *env; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum); + cl_env_put(env, &refcheck); + return rc; +} + static int ll_lov_setstripe(struct inode *inode, struct file *file, unsigned long arg) { - struct lov_user_md_v3 lumv3; - struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; - struct lov_user_md_v1 __user *lumv1p = (void __user *)arg; - struct lov_user_md_v3 __user *lumv3p = (void __user *)arg; + struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; + struct lov_user_md *klum; int lum_size, rc; __u64 flags = FMODE_WRITE; - /* first try with v1 which is smaller than v3 */ - lum_size = sizeof(struct lov_user_md_v1); - if (copy_from_user(lumv1, lumv1p, lum_size)) - return -EFAULT; - - if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { - lum_size = sizeof(struct lov_user_md_v3); - if (copy_from_user(&lumv3, lumv3p, lum_size)) - return -EFAULT; - } + rc = ll_copy_user_md(lum, &klum); + if (rc < 0) + return rc; - rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1, + lum_size = rc; + rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, klum, lum_size); cl_lov_delay_create_clear(&file->f_flags); if (rc == 0) { - struct lov_stripe_md *lsm; __u32 gen; - put_user(0, &lumv1p->lmm_stripe_count); + put_user(0, &lum->lmm_stripe_count); ll_layout_refresh(inode, &gen); - lsm = ccc_inode_lsm_get(inode); - rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), - 0, lsm, (void __user *)arg); - ccc_inode_lsm_put(inode, lsm); + rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg); } - return rc; -} -static int ll_lov_getstripe(struct inode *inode, unsigned long arg) -{ - struct lov_stripe_md *lsm; - int rc = -ENODATA; - - lsm = ccc_inode_lsm_get(inode); - if (lsm) - rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, - lsm, (void __user *)arg); - ccc_inode_lsm_put(inode, lsm); + kfree(klum); return rc; } @@ -2247,6 +2156,12 @@ free_hss: return rc; } +static inline long ll_lease_type_from_fmode(fmode_t fmode) +{ + return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | + ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0); +} + static long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2314,11 +2229,8 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; } case LL_IOC_LOV_GETSTRIPE: - return ll_lov_getstripe(inode, arg); - case LL_IOC_RECREATE_OBJ: - return ll_lov_recreate_obj(inode, arg); - case LL_IOC_RECREATE_FID: - return ll_lov_recreate_fid(inode, arg); + return ll_file_getstripe(inode, + (struct lov_user_md __user *)arg); case FSFILT_IOC_FIEMAP: return ll_ioctl_fiemap(inode, arg); case FSFILT_IOC_GETFLAGS: @@ -2349,6 +2261,8 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } + case LL_IOC_GETPARENT: + return ll_getparent(file, (struct getparent __user *)arg); case OBD_IOC_FID2PATH: return ll_fid2path(inode, (void __user *)arg); case LL_IOC_DATA_VERSION: { @@ -2451,20 +2365,20 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ll_inode_info *lli = ll_i2info(inode); struct obd_client_handle *och = NULL; bool lease_broken; - fmode_t mode = 0; + fmode_t fmode; switch (arg) { - case F_WRLCK: + case LL_LEASE_WRLCK: if (!(file->f_mode & FMODE_WRITE)) return -EPERM; - mode = FMODE_WRITE; + fmode = FMODE_WRITE; break; - case F_RDLCK: + case LL_LEASE_RDLCK: if (!(file->f_mode & FMODE_READ)) return -EPERM; - mode = FMODE_READ; + fmode = FMODE_READ; break; - case F_UNLCK: + case LL_LEASE_UNLCK: mutex_lock(&lli->lli_och_mutex); if (fd->fd_lease_och) { och = fd->fd_lease_och; @@ -2472,26 +2386,26 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } mutex_unlock(&lli->lli_och_mutex); - if (och) { - mode = och->och_flags & - (FMODE_READ|FMODE_WRITE); - rc = ll_lease_close(och, inode, &lease_broken); - if (rc == 0 && lease_broken) - mode = 0; - } else { - rc = -ENOLCK; - } + if (!och) + return -ENOLCK; + + fmode = och->och_flags; + rc = ll_lease_close(och, inode, &lease_broken); + if (rc < 0) + return rc; + + if (lease_broken) + fmode = 0; - /* return the type of lease or error */ - return rc < 0 ? rc : (int)mode; + return ll_lease_type_from_fmode(fmode); default: return -EINVAL; } - CDEBUG(D_INODE, "Set lease with mode %d\n", mode); + CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); /* apply for lease */ - och = ll_lease_open(inode, file, mode, 0); + och = ll_lease_open(inode, file, fmode, 0); if (IS_ERR(och)) return PTR_ERR(och); @@ -2512,8 +2426,8 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LL_IOC_GET_LEASE: { struct ll_inode_info *lli = ll_i2info(inode); struct ldlm_lock *lock = NULL; + fmode_t fmode = 0; - rc = 0; mutex_lock(&lli->lli_och_mutex); if (fd->fd_lease_och) { struct obd_client_handle *och = fd->fd_lease_och; @@ -2522,14 +2436,13 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (lock) { lock_res_and_lock(lock); if (!ldlm_is_cancel(lock)) - rc = och->och_flags & - (FMODE_READ | FMODE_WRITE); + fmode = och->och_flags; unlock_res_and_lock(lock); LDLM_LOCK_PUT(lock); } } mutex_unlock(&lli->lli_och_mutex); - return rc; + return ll_lease_type_from_fmode(fmode); } case LL_IOC_HSM_IMPORT: { struct hsm_user_import *hui; @@ -2574,9 +2487,8 @@ static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) eof = i_size_read(inode); } - retval = generic_file_llseek_size(file, offset, origin, - ll_file_maxbytes(inode), eof); - return retval; + return generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); } static int ll_flush(struct file *file, fl_owner_t id) @@ -2593,9 +2505,11 @@ static int ll_flush(struct file *file, fl_owner_t id) */ rc = lli->lli_async_rc; lli->lli_async_rc = 0; - err = lov_read_and_clear_async_rc(lli->lli_clob); - if (rc == 0) - rc = err; + if (lli->lli_clob) { + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (!rc) + rc = err; + } /* The application has been told about write failure already. * Do not report failure again. @@ -2714,6 +2628,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) struct md_op_data *op_data; struct lustre_handle lockh = {0}; ldlm_policy_data_t flock = { {0} }; + int fl_type = file_lock->fl_type; __u64 flags = 0; int rc; int rc2 = 0; @@ -2744,7 +2659,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) flock.l_flock.owner = (unsigned long)file_lock->fl_pid; - switch (file_lock->fl_type) { + switch (fl_type) { case F_RDLCK: einfo.ei_mode = LCK_PR; break; @@ -2764,8 +2679,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) einfo.ei_mode = LCK_PW; break; default: - CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", - file_lock->fl_type); + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); return -ENOTSUPP; } @@ -2787,16 +2701,18 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) case F_GETLK64: #endif flags = LDLM_FL_TEST_LOCK; - /* Save the old mode so that if the mode in the lock changes we - * can decrement the appropriate reader or writer refcount. - */ - file_lock->fl_type = einfo.ei_mode; break; default: CERROR("unknown fcntl lock command: %d\n", cmd); return -EINVAL; } + /* + * Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. + */ + file_lock->fl_type = einfo.ei_mode; + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, LUSTRE_OPC_ANY, NULL); if (IS_ERR(op_data)) @@ -2806,8 +2722,12 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) PFID(ll_inode2fid(inode)), flock.l_flock.pid, flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end); - rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, - op_data, &lockh, &flock, 0, NULL /* req */, flags); + rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh, + flags); + + /* Restore the file lock type if not TEST lock. */ + if (!(flags & LDLM_FL_TEST_LOCK)) + file_lock->fl_type = fl_type; if ((rc == 0 || file_lock->fl_type == F_UNLCK) && !(flags & LDLM_FL_TEST_LOCK)) @@ -2815,8 +2735,8 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) if (rc2 && file_lock->fl_type != F_UNLCK) { einfo.ei_mode = LCK_NL; - md_enqueue(sbi->ll_md_exp, &einfo, NULL, - op_data, &lockh, &flock, 0, NULL /* req */, flags); + md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, + &lockh, flags); rc = rc2; } @@ -2825,6 +2745,117 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) return rc; } +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid) +{ + struct md_op_data *op_data = NULL; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + op_data->op_valid = OBD_MD_FLID; + rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) + return rc; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (!body) { + rc = -EFAULT; + goto out_req; + } + if (fid) + *fid = body->mbo_fid1; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct inode *child_inode = NULL; + struct dentry *dchild = NULL; + struct md_op_data *op_data; + struct qstr qstr; + int rc; + + CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%d\n", + name, PFID(ll_inode2fid(parent)), mdtidx); + + op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + + /* Get child FID first */ + qstr.hash = full_name_hash(parent, name, namelen); + qstr.name = name; + qstr.len = namelen; + dchild = d_lookup(file_dentry(file), &qstr); + if (dchild) { + op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); + if (dchild->d_inode) { + child_inode = igrab(dchild->d_inode); + if (child_inode) { + inode_lock(child_inode); + op_data->op_fid3 = *ll_inode2fid(child_inode); + ll_invalidate_aliases(child_inode); + } + } + dput(dchild); + } else { + rc = ll_get_fid_by_name(parent, name, namelen, + &op_data->op_fid3); + if (rc) + goto out_free; + } + + if (!fid_is_sane(&op_data->op_fid3)) { + CERROR("%s: migrate %s, but fid "DFID" is insane\n", + ll_get_fsname(parent->i_sb, NULL, 0), name, + PFID(&op_data->op_fid3)); + rc = -EINVAL; + goto out_free; + } + + rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); + if (rc < 0) + goto out_free; + + if (rc == mdtidx) { + CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name, + PFID(&op_data->op_fid3), mdtidx); + rc = 0; + goto out_free; + } + + op_data->op_mds = mdtidx; + op_data->op_cli_flags = CLI_MIGRATE; + rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, + namelen, name, namelen, &request); + if (!rc) + ll_update_times(request, parent); + + ptlrpc_req_finished(request); + +out_free: + if (child_inode) { + clear_nlink(child_inode); + inode_unlock(child_inode); + iput(child_inode); + } + + ll_finish_md_op_data(op_data); + return rc; +} + static int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) { @@ -2847,7 +2878,7 @@ int ll_have_md_lock(struct inode *inode, __u64 *bits, struct lustre_handle lockh; ldlm_policy_data_t policy; enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ? - (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode; + (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode; struct lu_fid *fid; __u64 flags; int i; @@ -2888,15 +2919,12 @@ enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, { ldlm_policy_data_t policy = { .l_inodebits = {bits} }; struct lu_fid *fid; - enum ldlm_mode rc; fid = &ll_i2info(inode)->lli_fid; CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); - rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED, - fid, LDLM_IBITS, &policy, mode, lockh); - - return rc; + return md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED, + fid, LDLM_IBITS, &policy, mode, lockh); } static int ll_inode_revalidate_fini(struct inode *inode, int rc) @@ -2949,15 +2977,9 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) if (IS_ERR(op_data)) return PTR_ERR(op_data); - oit.it_create_mode |= M_CHECK_STALE; - rc = md_intent_lock(exp, op_data, NULL, 0, - /* we are not interested in name - * based lookup - */ - &oit, 0, &req, - ll_md_blocking_ast, 0); + rc = md_intent_lock(exp, op_data, &oit, &req, + &ll_md_blocking_ast, 0); ll_finish_md_op_data(op_data); - oit.it_create_mode &= ~M_CHECK_STALE; if (rc < 0) { rc = ll_inode_revalidate_fini(inode, rc); goto out; @@ -3003,10 +3025,8 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) op_data->op_valid = valid; rc = md_getattr(sbi->ll_md_exp, op_data, &req); ll_finish_md_op_data(op_data); - if (rc) { - rc = ll_inode_revalidate_fini(inode, rc); - return rc; - } + if (rc) + return ll_inode_revalidate_fini(inode, rc); rc = ll_prep_inode(&inode, req, NULL, NULL); } @@ -3015,6 +3035,28 @@ out: return rc; } +static int ll_merge_md_attr(struct inode *inode) +{ + struct cl_attr attr = { 0 }; + int rc; + + LASSERT(ll_i2info(inode)->lli_lsm_md); + rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, + &attr, ll_md_blocking_ast); + if (rc) + return rc; + + set_nlink(inode, attr.cat_nlink); + inode->i_blocks = attr.cat_blocks; + i_size_write(inode, attr.cat_size); + + ll_i2info(inode)->lli_atime = attr.cat_atime; + ll_i2info(inode)->lli_mtime = attr.cat_mtime; + ll_i2info(inode)->lli_ctime = attr.cat_ctime; + + return 0; +} + static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) { struct inode *inode = d_inode(dentry); @@ -3026,6 +3068,13 @@ static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) /* if object isn't regular file, don't validate size */ if (!S_ISREG(inode->i_mode)) { + if (S_ISDIR(inode->i_mode) && + ll_i2info(inode)->lli_lsm_md) { + rc = ll_merge_md_attr(inode); + if (rc) + return rc; + } + LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime; LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime; LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime; @@ -3057,13 +3106,14 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) if (res) return res; + OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); + stat->dev = inode->i_sb->s_dev; if (ll_need_32bit_api(sbi)) stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); else stat->ino = inode->i_ino; stat->mode = inode->i_mode; - stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = inode->i_rdev; @@ -3072,6 +3122,7 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) stat->ctime = inode->i_ctime; stat->blksize = 1 << inode->i_blkbits; + stat->nlink = inode->i_nlink; stat->size = i_size_read(inode); stat->blocks = inode->i_blocks; @@ -3139,6 +3190,12 @@ struct posix_acl *ll_get_acl(struct inode *inode, int type) int ll_inode_permission(struct inode *inode, int mask) { + struct ll_sb_info *sbi; + struct root_squash_info *squash; + const struct cred *old_cred = NULL; + struct cred *cred = NULL; + bool squash_id = false; + cfs_cap_t cap; int rc = 0; if (mask & MAY_NOT_BLOCK) @@ -3158,9 +3215,46 @@ int ll_inode_permission(struct inode *inode, int mask) CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n", PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); + /* squash fsuid/fsgid if needed */ + sbi = ll_i2sbi(inode); + squash = &sbi->ll_squash; + if (unlikely(squash->rsi_uid && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && + !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) { + squash_id = true; + } + + if (squash_id) { + CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", + __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), + squash->rsi_uid, squash->rsi_gid); + + /* + * update current process's credentials + * and FS capability + */ + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); + cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); + for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) { + if ((1 << cap) & CFS_CAP_FS_MASK) + cap_lower(cred->cap_effective, cap); + } + old_cred = override_creds(cred); + } + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); rc = generic_permission(inode, mask); + /* restore current process's credentials and FS capability */ + if (squash_id) { + revert_creds(old_cred); + put_cred(cred); + } + return rc; } @@ -3213,10 +3307,10 @@ const struct inode_operations ll_file_inode_operations = { .setattr = ll_setattr, .getattr = ll_getattr, .permission = ll_inode_permission, - .setxattr = ll_setxattr, - .getxattr = ll_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ll_listxattr, - .removexattr = ll_removexattr, + .removexattr = generic_removexattr, .fiemap = ll_fiemap, .get_acl = ll_get_acl, }; @@ -3251,7 +3345,6 @@ void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd) if (!in_data) return NULL; - memset(in_data, 0, sizeof(*in_data)); in_data->iocd_size = size; in_data->iocd_cb = cb; in_data->iocd_count = count; @@ -3389,7 +3482,7 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) goto out; } - lmmsize = body->eadatasize; + lmmsize = body->mbo_eadatasize; if (lmmsize == 0) /* empty layout */ { rc = 0; goto out; @@ -3447,7 +3540,7 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, PFID(&lli->lli_fid), inode, reconf); /* in case this is a caching lock and reinstate with new inode */ - md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL); + md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); lock_res_and_lock(lock); lvb_ready = ldlm_is_lvb_ready(lock); @@ -3557,8 +3650,8 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen) struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, .ei_mode = LCK_CR, - .ei_cb_bl = ll_md_blocking_ast, - .ei_cb_cp = ldlm_completion_ast, + .ei_cb_bl = &ll_md_blocking_ast, + .ei_cb_cp = &ldlm_completion_ast, }; int rc; @@ -3604,8 +3697,7 @@ again: ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), inode); - rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh, - NULL, 0, NULL, 0); + rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0); ptlrpc_req_finished(it.it_request); it.it_request = NULL; diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c index 92004a05f9ee..22507b9c6d69 100644 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ b/drivers/staging/lustre/lustre/llite/glimpse.c @@ -42,7 +42,6 @@ #include "../include/obd.h" #include "../include/lustre_dlm.h" -#include "../include/lustre_lite.h" #include "../include/lustre_mdc.h" #include <linux/pagemap.h> #include <linux/file.h> diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c index 396e4e4f0715..084330d08f7a 100644 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c @@ -49,7 +49,6 @@ #include "../include/obd.h" #include "../include/obd_support.h" #include "../include/lustre_fid.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "../include/lustre_ver.h" #include "../include/lustre_mdc.h" @@ -100,6 +99,7 @@ int cl_setattr_ost(struct inode *inode, const struct iattr *attr) io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_parent_fid = ll_inode2fid(inode); again: if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { @@ -154,7 +154,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) int result = 0; int refcheck; - LASSERT(md->body->valid & OBD_MD_FLID); + LASSERT(md->body->mbo_valid & OBD_MD_FLID); LASSERT(S_ISREG(inode->i_mode)); env = cl_env_get(&refcheck); diff --git a/drivers/staging/lustre/lustre/llite/lcommon_misc.c b/drivers/staging/lustre/lustre/llite/lcommon_misc.c index f6be105eeef7..fb346c12dad2 100644 --- a/drivers/staging/lustre/lustre/llite/lcommon_misc.c +++ b/drivers/staging/lustre/lustre/llite/lcommon_misc.c @@ -38,7 +38,6 @@ #include "../include/obd.h" #include "../include/cl_object.h" -#include "../include/lustre_lite.h" #include "llite_internal.h" /* Initialize the default and maximum LOV EA and cookie sizes. This allows diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c index 2326b40a0870..8644631bf2ba 100644 --- a/drivers/staging/lustre/lustre/llite/llite_close.c +++ b/drivers/staging/lustre/lustre/llite/llite_close.c @@ -38,7 +38,6 @@ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "llite_internal.h" /** records that a write is in flight */ diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index 4d6d589a1677..3e98bd685061 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -36,14 +36,21 @@ #include "../include/lustre_ver.h" #include "../include/lustre_disk.h" /* for s2sbi */ #include "../include/lustre_eacl.h" +#include "../include/lustre_linkea.h" /* for struct cl_lock_descr and struct cl_io */ +#include "../include/lustre_patchless_compat.h" +#include "../include/lustre_compat.h" #include "../include/cl_object.h" +#include "../include/lustre_lmv.h" #include "../include/lustre_mdc.h" #include "../include/lustre_intent.h" #include <linux/compat.h> +#include <linux/namei.h> +#include <linux/xattr.h> #include <linux/posix_acl_xattr.h> #include "vvp_internal.h" +#include "range_lock.h" #ifndef FMODE_EXEC #define FMODE_EXEC 0 @@ -57,6 +64,9 @@ #define LL_DIR_END_OFF 0x7fffffffffffffffULL #define LL_DIR_END_OFF_32BIT 0x7fffffffUL +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS 22 + #define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") #define LUSTRE_FPRIVATE(file) ((file)->private_data) @@ -116,9 +126,7 @@ struct ll_inode_info { /* identifying fields for both metadata and data stacks. */ struct lu_fid lli_fid; - /* Parent fid for accessing default stripe data on parent directory - * for allocating OST objects after a mknod() and later open-by-FID. - */ + /* master inode fid for stripe directory */ struct lu_fid lli_pfid; struct list_head lli_close_list; @@ -156,7 +164,7 @@ struct ll_inode_info { /* for directory */ struct { /* serialize normal readdir and statahead-readdir. */ - struct mutex d_readdir_mutex; + struct mutex lli_readdir_mutex; /* metadata statahead */ /* since parent-child threads can share the same @file @@ -164,27 +172,39 @@ struct ll_inode_info { * case of parent exit before child -- it is me should * cleanup the dir readahead. */ - void *d_opendir_key; - struct ll_statahead_info *d_sai; + void *lli_opendir_key; + struct ll_statahead_info *lli_sai; /* protect statahead stuff. */ - spinlock_t d_sa_lock; + spinlock_t lli_sa_lock; /* "opendir_pid" is the token when lookup/revalidate * -- I am the owner of dir statahead. */ - pid_t d_opendir_pid; - } d; - -#define lli_readdir_mutex u.d.d_readdir_mutex -#define lli_opendir_key u.d.d_opendir_key -#define lli_sai u.d.d_sai -#define lli_sa_lock u.d.d_sa_lock -#define lli_opendir_pid u.d.d_opendir_pid + pid_t lli_opendir_pid; + /* stat will try to access statahead entries or start + * statahead if this flag is set, and this flag will be + * set upon dir open, and cleared when dir is closed, + * statahead hit ratio is too low, or start statahead + * thread failed. + */ + unsigned int lli_sa_enabled:1; + /* generation for statahead */ + unsigned int lli_sa_generation; + /* directory stripe information */ + struct lmv_stripe_md *lli_lsm_md; + /* default directory stripe offset. This is extracted + * from the "dmv" xattr in order to decide which MDT to + * create a subdirectory on. The MDS itself fetches + * "dmv" and gets the rest of the default layout itself + * (count, hash, etc). + */ + __u32 lli_def_stripe_offset; + }; /* for non-directory */ struct { - struct mutex f_size_mutex; - char *f_symlink_name; - __u64 f_maxbytes; + struct mutex lli_size_mutex; + char *lli_symlink_name; + __u64 lli_maxbytes; /* * struct rw_semaphore { * signed long count; // align d.d_def_acl @@ -192,16 +212,16 @@ struct ll_inode_info { * struct list_head wait_list; * } */ - struct rw_semaphore f_trunc_sem; - struct mutex f_write_mutex; + struct rw_semaphore lli_trunc_sem; + struct range_lock_tree lli_write_tree; - struct rw_semaphore f_glimpse_sem; - unsigned long f_glimpse_time; - struct list_head f_agl_list; - __u64 f_agl_index; + struct rw_semaphore lli_glimpse_sem; + unsigned long lli_glimpse_time; + struct list_head lli_agl_list; + __u64 lli_agl_index; /* for writepage() only to communicate to fsync */ - int f_async_rc; + int lli_async_rc; /* * whenever a process try to read/write the file, the @@ -211,22 +231,9 @@ struct ll_inode_info { * so the read/write statistics for jobid will not be * accurate if the file is shared by different jobs. */ - char f_jobid[JOBSTATS_JOBID_SIZE]; - } f; - -#define lli_size_mutex u.f.f_size_mutex -#define lli_symlink_name u.f.f_symlink_name -#define lli_maxbytes u.f.f_maxbytes -#define lli_trunc_sem u.f.f_trunc_sem -#define lli_write_mutex u.f.f_write_mutex -#define lli_glimpse_sem u.f.f_glimpse_sem -#define lli_glimpse_time u.f.f_glimpse_time -#define lli_agl_list u.f.f_agl_list -#define lli_agl_index u.f.f_agl_index -#define lli_async_rc u.f.f_async_rc -#define lli_jobid u.f.f_jobid - - } u; + char lli_jobid[LUSTRE_JOBID_SIZE]; + }; + }; /* XXX: For following frequent used members, although they maybe special * used for non-directory object, it is some time-wasting to check @@ -401,12 +408,13 @@ enum stats_track_type { #define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ #define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ #define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ +#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */ #define LL_SBI_FLAGS { \ "nolck", \ "checksum", \ "flock", \ - "xattr", \ + "user_xattr", \ "acl", \ "???", \ "???", \ @@ -422,9 +430,27 @@ enum stats_track_type { "verbose", \ "layout", \ "user_fid2path",\ - "xattr", \ + "xattr_cache", \ + "norootsquash", \ } +/* + * This is embedded into llite super-blocks to keep track of connect + * flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports + * (LOVs) this mount is connected to. This field is updated by + * cl_ocd_update() under ->lco_lock. + */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + struct ll_sb_info { /* this protects pglist and ra_info. It isn't safe to * grab from interrupt contexts @@ -461,7 +487,7 @@ struct ll_sb_info { unsigned int ll_namelen; struct file_operations *ll_fop; - unsigned int ll_md_brw_size; /* used by readdir */ + unsigned int ll_md_brw_pages; /* readdir pages per RPC */ struct lu_site *ll_site; struct cl_device *ll_cl; @@ -484,11 +510,17 @@ struct ll_sb_info { atomic_t ll_sa_wrong; /* statahead thread stopped for * low hit ratio */ + atomic_t ll_sa_running; /* running statahead thread + * count + */ atomic_t ll_agl_total; /* AGL thread started count */ dev_t ll_sdev_orig; /* save s_dev before assign for * clustered nfs */ + /* root squash */ + struct root_squash_info ll_squash; + __kernel_fsid_t ll_fsid; struct kobject ll_kobj; /* sysfs object */ struct super_block *ll_sb; /* struct super_block (for sysfs code)*/ @@ -643,25 +675,66 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct ll_file_data *file, loff_t pos, size_t count, int rw); +enum { + LPROC_LL_DIRTY_HITS, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_OSC_READ, + LPROC_LL_OSC_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_GETXATTR_HITS, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + /* llite/dir.c */ -void ll_release_page(struct page *page, int remove); extern const struct file_operations ll_dir_operations; extern const struct inode_operations ll_dir_inode_operations; -struct page *ll_get_dir_page(struct inode *dir, __u64 hash, - struct ll_dir_chain *chain); -int ll_dir_read(struct inode *inode, struct dir_context *ctx); - +int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, + struct dir_context *ctx); int ll_get_mdt_idx(struct inode *inode); +int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); +struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, + __u64 offset); +void ll_release_page(struct inode *inode, struct page *page, bool remove); + /* llite/namei.c */ extern const struct inode_operations ll_special_inode_operations; -int ll_objects_destroy(struct ptlrpc_request *request, - struct inode *dir); struct inode *ll_iget(struct super_block *sb, ino_t hash, struct lustre_md *lic); +int ll_test_inode_by_fid(struct inode *inode, void *opaque); int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, void *data, int flag); struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +void ll_update_times(struct ptlrpc_request *request, struct inode *inode); /* llite/rw.c */ int ll_writepage(struct page *page, struct writeback_control *wbc); @@ -704,7 +777,10 @@ void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, struct lustre_handle *fh); int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); struct posix_acl *ll_get_acl(struct inode *inode, int type); - +int ll_migrate(struct inode *parent, struct file *file, int mdtidx, + const char *name, int namelen); +int ll_get_fid_by_name(struct inode *parent, const char *name, + int namelen, struct lu_fid *fid); int ll_inode_permission(struct inode *inode, int mask); int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, @@ -715,8 +791,8 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, - int *lmm_size, struct ptlrpc_request **request); +int ll_dir_getstripe(struct inode *inode, void **lmmp, int *lmm_size, + struct ptlrpc_request **request, u64 valid); int ll_fsync(struct file *file, loff_t start, loff_t end, int data); int ll_merge_attr(const struct lu_env *env, struct inode *inode); int ll_fid2path(struct inode *inode, void __user *arg); @@ -748,8 +824,8 @@ int ll_setattr(struct dentry *de, struct iattr *attr); int ll_statfs(struct dentry *de, struct kstatfs *sfs); int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, __u64 max_age, __u32 flags); -void ll_update_inode(struct inode *inode, struct lustre_md *md); -void ll_read_inode2(struct inode *inode, void *opaque); +int ll_update_inode(struct inode *inode, struct lustre_md *md); +int ll_read_inode2(struct inode *inode, void *opaque); void ll_delete_inode(struct inode *inode); int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); @@ -763,15 +839,46 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, int ll_obd_statfs(struct inode *inode, void __user *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); +int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); int ll_process_config(struct lustre_cfg *lcfg); + +enum { + LUSTRE_OPC_MKDIR = 0, + LUSTRE_OPC_SYMLINK = 1, + LUSTRE_OPC_MKNOD = 2, + LUSTRE_OPC_CREATE = 3, + LUSTRE_OPC_ANY = 5, +}; + struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, - const char *name, int namelen, - int mode, __u32 opc, void *data); + const char *name, size_t namelen, + u32 mode, __u32 opc, void *data); void ll_finish_md_op_data(struct md_op_data *op_data); int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); +void ll_compute_rootsquash_state(struct ll_sb_info *sbi); void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf); + +/* Compute expected user md size when passing in a md from user space */ +static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) +{ + switch (lum->lmm_magic) { + case LOV_USER_MAGIC_V1: + return sizeof(struct lov_user_md_v1); + case LOV_USER_MAGIC_V3: + return sizeof(struct lov_user_md_v3); + case LOV_USER_MAGIC_SPECIFIC: + if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) + return -EINVAL; + + return lov_user_md_size(lum->lmm_stripe_count, + LOV_USER_MAGIC_SPECIFIC); + } + return -EINVAL; +} /* llite/llite_nfs.c */ extern const struct export_operations lustre_export_operations; @@ -779,6 +886,7 @@ __u32 get_uuid2int(const char *name, int len); void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid); struct inode *search_inode_for_lustre(struct super_block *sb, const struct lu_fid *fid); +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); /* llite/symlink.c */ extern const struct inode_operations ll_fast_symlink_inode_operations; @@ -933,12 +1041,19 @@ static inline __u64 ll_file_maxbytes(struct inode *inode) } /* llite/xattr.c */ -int ll_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, int flags); -ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size); +extern const struct xattr_handler *ll_xattr_handlers[]; + +#define XATTR_USER_T 1 +#define XATTR_TRUSTED_T 2 +#define XATTR_SECURITY_T 3 +#define XATTR_ACL_ACCESS_T 4 +#define XATTR_ACL_DEFAULT_T 5 +#define XATTR_LUSTRE_T 6 +#define XATTR_OTHER_T 7 + ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ll_removexattr(struct dentry *dentry, const char *name); +int ll_xattr_list(struct inode *inode, const char *name, int type, + void *buffer, size_t size, __u64 valid); /** * Common IO arguments for various VFS I/O interfaces. @@ -964,11 +1079,10 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); /* per inode struct, for dir only */ struct ll_statahead_info { - struct inode *sai_inode; + struct dentry *sai_dentry; atomic_t sai_refcount; /* when access this struct, hold * refcount */ - unsigned int sai_generation; /* generation for statahead */ unsigned int sai_max; /* max ahead of lookup */ __u64 sai_sent; /* stat requests sent count */ __u64 sai_replied; /* stat requests which received @@ -995,22 +1109,25 @@ struct ll_statahead_info { unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for * hidden entries */ - sai_agl_valid:1;/* AGL is valid for the dir */ + sai_agl_valid:1,/* AGL is valid for the dir */ + sai_in_readpage:1;/* statahead is in readdir() */ wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ struct ptlrpc_thread sai_thread; /* stat-ahead thread */ struct ptlrpc_thread sai_agl_thread; /* AGL thread */ - struct list_head sai_entries; /* entry list */ - struct list_head sai_entries_received; /* entries returned */ - struct list_head sai_entries_stated; /* entries stated */ - struct list_head sai_entries_agl; /* AGL entries to be sent */ + struct list_head sai_interim_entries; /* entries which got async + * stat reply, but not + * instantiated + */ + struct list_head sai_entries; /* completed entries */ + struct list_head sai_agls; /* AGLs to be sent */ struct list_head sai_cache[LL_SA_CACHE_SIZE]; spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; atomic_t sai_cache_count; /* entry count in cache */ }; -int do_statahead_enter(struct inode *dir, struct dentry **dentry, - int only_unplug); -void ll_stop_statahead(struct inode *dir, void *key); +int ll_statahead(struct inode *dir, struct dentry **dentry, bool unplug); +void ll_authorize_statahead(struct inode *dir, void *key); +void ll_deauthorize_statahead(struct inode *dir, void *key); blkcnt_t dirty_cnt(struct inode *inode); @@ -1040,73 +1157,53 @@ static inline int ll_glimpse_size(struct inode *inode) return rc; } -static inline void -ll_statahead_mark(struct inode *dir, struct dentry *dentry) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct ll_dentry_data *ldd = ll_d2d(dentry); - - /* not the same process, don't mark */ - if (lli->lli_opendir_pid != current_pid()) - return; - - LASSERT(ldd); - if (sai) - ldd->lld_sa_generation = sai->sai_generation; -} - -static inline int -d_need_statahead(struct inode *dir, struct dentry *dentryp) +/* + * dentry may statahead when statahead is enabled and current process has opened + * parent directory, and this dentry hasn't accessed statahead cache before + */ +static inline bool +dentry_may_statahead(struct inode *dir, struct dentry *dentry) { struct ll_inode_info *lli; struct ll_dentry_data *ldd; if (ll_i2sbi(dir)->ll_sa_max == 0) - return -EAGAIN; + return false; lli = ll_i2info(dir); + + /* + * statahead is not allowed for this dir, there may be three causes: + * 1. dir is not opened. + * 2. statahead hit ratio is too low. + * 3. previous stat started statahead thread failed. + */ + if (!lli->lli_sa_enabled) + return false; + /* not the same process, don't statahead */ if (lli->lli_opendir_pid != current_pid()) - return -EAGAIN; - - /* statahead has been stopped */ - if (!lli->lli_opendir_key) - return -EAGAIN; + return false; - ldd = ll_d2d(dentryp); /* - * When stats a dentry, the system trigger more than once "revalidate" - * or "lookup", for "getattr", for "getxattr", and maybe for others. - * Under patchless client mode, the operation intent is not accurate, - * which maybe misguide the statahead thread. For example: - * The "revalidate" call for "getattr" and "getxattr" of a dentry maybe - * have the same operation intent -- "IT_GETATTR". - * In fact, one dentry should has only one chance to interact with the - * statahead thread, otherwise the statahead windows will be confused. + * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' + * multiple times, eg. for 'getattr', 'getxattr' and etc. + * For patchless client, lookup intent is not accurate, which may + * misguide statahead. For example: + * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will + * have the same intent -- IT_GETATTR, while one dentry should access + * statahead cache once, otherwise statahead windows is messed up. * The solution is as following: - * Assign "lld_sa_generation" with "sai_generation" when a dentry - * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" - * will bypass interacting with statahead thread for checking: - * "lld_sa_generation == lli_sai->sai_generation" + * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry + * IT_GETATTR for the first time, and subsequent IT_GETATTR will + * bypass interacting with statahead cache by checking + * 'lld_sa_generation == lli->lli_sa_generation'. */ - if (ldd && lli->lli_sai && - ldd->lld_sa_generation == lli->lli_sai->sai_generation) - return -EAGAIN; + ldd = ll_d2d(dentry); + if (ldd && ldd->lld_sa_generation == lli->lli_sa_generation) + return false; - return 1; -} - -static inline int -ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug) -{ - int ret; - - ret = d_need_statahead(dir, *dentryp); - if (ret <= 0) - return ret; - - return do_statahead_enter(dir, dentryp, only_unplug); + return true; } /* llite ioctl register support routine */ @@ -1213,7 +1310,7 @@ static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"%p for remote lock %#llx\n", PFID(ll_inode2fid(inode)), inode, handle.cookie); - md_set_lock_data(exp, &handle.cookie, inode, NULL); + md_set_lock_data(exp, &handle, inode, NULL); } handle.cookie = it->it_lock_handle; @@ -1221,8 +1318,7 @@ static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"%p for lock %#llx\n", PFID(ll_inode2fid(inode)), inode, handle.cookie); - md_set_lock_data(exp, &handle.cookie, inode, - &it->it_lock_bits); + md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); it->it_lock_set = 1; } @@ -1295,6 +1391,8 @@ void ll_xattr_fini(void); int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, struct cl_page *page, enum cl_req_type crt); +int ll_getparent(struct file *file, struct getparent __user *arg); + /* lcommon_cl.c */ int cl_setattr_ost(struct inode *inode, const struct iattr *attr); diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 546063e728db..6bb41b09172e 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -41,7 +41,7 @@ #include <linux/types.h> #include <linux/mm.h> -#include "../include/lustre_lite.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_ha.h" #include "../include/lustre_dlm.h" #include "../include/lprocfs_status.h" @@ -115,9 +115,16 @@ static struct ll_sb_info *ll_init_sbi(struct super_block *sb) sbi->ll_sa_max = LL_SA_RPC_DEF; atomic_set(&sbi->ll_sa_total, 0); atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_sa_running, 0); atomic_set(&sbi->ll_agl_total, 0); sbi->ll_flags |= LL_SBI_AGL_ENABLED; + /* root squash */ + sbi->ll_squash.rsi_uid = 0; + sbi->ll_squash.rsi_gid = 0; + INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); + init_rwsem(&sbi->ll_squash.rsi_sem); + sbi->ll_sb = sb; return sbi; @@ -128,6 +135,8 @@ static void ll_free_sbi(struct super_block *sb) struct ll_sb_info *sbi = ll_s2sbi(sb); if (sbi->ll_cache) { + if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) + cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); cl_cache_decref(sbi->ll_cache); sbi->ll_cache = NULL; } @@ -180,7 +189,9 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE | OBD_CONNECT_FLOCK_DEAD | - OBD_CONNECT_DISP_STRIPE; + OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | + OBD_CONNECT_OPEN_BY_FID | + OBD_CONNECT_DIR_STRIPE; if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) data->ocd_connect_flags |= OBD_CONNECT_SOM; @@ -310,9 +321,9 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, sbi->ll_flags |= LL_SBI_64BIT_HASH; if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - sbi->ll_md_brw_size = data->ocd_brw_size; + sbi->ll_md_brw_pages = data->ocd_brw_size >> PAGE_SHIFT; else - sbi->ll_md_brw_size = PAGE_SIZE; + sbi->ll_md_brw_pages = 1; if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; @@ -418,6 +429,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); sb->s_op = &lustre_super_operations; + sb->s_xattr = ll_xattr_handlers; #if THREAD_SIZE >= 8192 /*b=17630*/ sb->s_export_op = &lustre_export_operations; #endif @@ -462,7 +474,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, md_free_lustre_md(sbi->ll_md_exp, &lmd); ptlrpc_req_finished(request); - if (!(root)) { + if (IS_ERR(root)) { if (lmd.lsm) obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm); #ifdef CONFIG_FS_POSIX_ACL @@ -486,11 +498,21 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), KEY_CHECKSUM, sizeof(checksum), &checksum, NULL); + if (err) { + CERROR("%s: Set checksum failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + goto out_root; + } cl_sb_init(sb); err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), KEY_CACHE_SET, sizeof(*sbi->ll_cache), sbi->ll_cache, NULL); + if (err) { + CERROR("%s: Set cache_set failed: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, err); + goto out_root; + } sb->s_root = d_make_root(root); if (!sb->s_root) { @@ -560,6 +582,17 @@ int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) return rc; } +/** + * Get the value of the default_easize parameter. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[out] lmmsize pointer to storage location for value + * + * \retval 0 on success + * \retval negative negated errno on failure + */ int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) { int size, rc; @@ -573,6 +606,29 @@ int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) return rc; } +/** + * Set the default_easize parameter to the given value. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] sbi superblock info for this filesystem + * \param[in] lmmsize the size to set + * + * \retval 0 on success + * \retval negative negated errno on failure + */ +int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) +{ + if (lmmsize < sizeof(struct lov_mds_md) || + lmmsize > OBD_MAX_DEFAULT_EA_SIZE) + return -EINVAL; + + return obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_DEFAULT_EASIZE), + KEY_DEFAULT_EASIZE, + sizeof(int), &lmmsize, NULL); +} + static void client_common_put_super(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); @@ -608,6 +664,12 @@ void ll_kill_super(struct super_block *sb) if (sbi) { sb->s_dev = sbi->ll_sdev_orig; sbi->ll_umounting = 1; + + /* wait running statahead threads to quit */ + while (atomic_read(&sbi->ll_sa_running) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC >> 3)); + } } } @@ -647,7 +709,8 @@ static int ll_options(char *options, int *flags) *flags |= tmp; goto next; } - tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + tmp = ll_set_opt("noflock", s1, + LL_SBI_FLOCK | LL_SBI_LOCALFLOCK); if (tmp) { *flags &= ~tmp; goto next; @@ -772,11 +835,13 @@ void ll_lli_init(struct ll_inode_info *lli) lli->lli_sai = NULL; spin_lock_init(&lli->lli_sa_lock); lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + lli->lli_def_stripe_offset = -1; } else { mutex_init(&lli->lli_size_mutex); lli->lli_symlink_name = NULL; init_rwsem(&lli->lli_trunc_sem); - mutex_init(&lli->lli_write_mutex); + range_lock_tree_init(&lli->lli_write_tree); init_rwsem(&lli->lli_glimpse_sem); lli->lli_glimpse_time = 0; INIT_LIST_HEAD(&lli->lli_agl_list); @@ -896,7 +961,8 @@ void ll_put_super(struct super_block *sb) struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); char *profilenm = get_profile_name(sb); - int ccc_count, next, force = 1, rc = 0; + int next, force = 1, rc = 0; + long ccc_count; CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); @@ -917,13 +983,13 @@ void ll_put_super(struct super_block *sb) struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(sbi->ll_cache->ccc_unstable_waitq, - !atomic_read(&sbi->ll_cache->ccc_unstable_nr), + !atomic_long_read(&sbi->ll_cache->ccc_unstable_nr), &lwi); } - ccc_count = atomic_read(&sbi->ll_cache->ccc_unstable_nr); + ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); if (!force && rc != -EINTR) - LASSERTF(!ccc_count, "count: %i\n", ccc_count); + LASSERTF(!ccc_count, "count: %li\n", ccc_count); /* We need to set force before the lov_disconnect in * lustre_common_put_super, since l_d cleans up osc's as well. @@ -991,6 +1057,206 @@ struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) return inode; } +static void ll_dir_clear_lsm_md(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT(S_ISDIR(inode->i_mode)); + + if (lli->lli_lsm_md) { + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + } +} + +static struct inode *ll_iget_anon_dir(struct super_block *sb, + const struct lu_fid *fid, + struct lustre_md *md) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct mdt_body *body = md->body; + struct inode *inode; + ino_t ino; + + ino = cl_fid_build_ino(fid, sbi->ll_flags & LL_SBI_32BIT_API); + inode = iget_locked(sb, ino); + if (!inode) { + CERROR("%s: failed get simple inode "DFID": rc = -ENOENT\n", + ll_get_fsname(sb, NULL, 0), PFID(fid)); + return ERR_PTR(-ENOENT); + } + + if (inode->i_state & I_NEW) { + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); + LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode "DFID"\n", + PFID(fid)); + + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + lli->lli_fid = *fid; + ll_lli_init(lli); + + LASSERT(lsm); + /* master object FID */ + lli->lli_pfid = body->mbo_fid1; + CDEBUG(D_INODE, "lli %p slave "DFID" master "DFID"\n", + lli, PFID(fid), PFID(&lli->lli_pfid)); + unlock_new_inode(inode); + } + + return inode; +} + +static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct lmv_stripe_md *lsm = md->lmv; + struct lu_fid *fid; + int i; + + LASSERT(lsm); + /* + * XXX sigh, this lsm_root initialization should be in + * LMV layer, but it needs ll_iget right now, so we + * put this here right now. + */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + fid = &lsm->lsm_md_oinfo[i].lmo_fid; + LASSERT(!lsm->lsm_md_oinfo[i].lmo_root); + /* Unfortunately ll_iget will call ll_update_inode, + * where the initialization of slave inode is slightly + * different, so it reset lsm_md to NULL to avoid + * initializing lsm for slave inode. + */ + /* For migrating inode, master stripe and master object will + * be same, so we only need assign this inode + */ + if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && !i) + lsm->lsm_md_oinfo[i].lmo_root = inode; + else + lsm->lsm_md_oinfo[i].lmo_root = + ll_iget_anon_dir(inode->i_sb, fid, md); + if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { + int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); + + lsm->lsm_md_oinfo[i].lmo_root = NULL; + return rc; + } + } + + return 0; +} + +static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, + const struct lmv_stripe_md *lsm_md2) +{ + return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && + lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && + lsm_md1->lsm_md_master_mdt_index == + lsm_md2->lsm_md_master_mdt_index && + lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && + lsm_md1->lsm_md_layout_version == + lsm_md2->lsm_md_layout_version && + !strcmp(lsm_md1->lsm_md_pool_name, + lsm_md2->lsm_md_pool_name); +} + +static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lmv_stripe_md *lsm = md->lmv; + int rc; + + LASSERT(S_ISDIR(inode->i_mode)); + CDEBUG(D_INODE, "update lsm %p of "DFID"\n", lli->lli_lsm_md, + PFID(ll_inode2fid(inode))); + + /* no striped information from request. */ + if (!lsm) { + if (!lli->lli_lsm_md) { + return 0; + } else if (lli->lli_lsm_md->lsm_md_hash_type & + LMV_HASH_FLAG_MIGRATION) { + /* + * migration is done, the temporay MIGRATE layout has + * been removed + */ + CDEBUG(D_INODE, DFID" finish migration.\n", + PFID(ll_inode2fid(inode))); + lmv_free_memmd(lli->lli_lsm_md); + lli->lli_lsm_md = NULL; + return 0; + } else { + /* + * The lustre_md from req does not include stripeEA, + * see ll_md_setattr + */ + return 0; + } + } + + /* set the directory layout */ + if (!lli->lli_lsm_md) { + rc = ll_init_lsm_md(inode, md); + if (rc) + return rc; + + lli->lli_lsm_md = lsm; + /* + * set lsm_md to NULL, so the following free lustre_md + * will not free this lsm + */ + md->lmv = NULL; + CDEBUG(D_INODE, "Set lsm %p magic %x to "DFID"\n", lsm, + lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); + return 0; + } + + /* Compare the old and new stripe information */ + if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { + struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; + int idx; + + CERROR("%s: inode "DFID"(%p)'s lmv layout mismatch (%p)/(%p) magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", + ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), + inode, lsm, old_lsm, + lsm->lsm_md_magic, old_lsm->lsm_md_magic, + lsm->lsm_md_stripe_count, + old_lsm->lsm_md_stripe_count, + lsm->lsm_md_master_mdt_index, + old_lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, + lsm->lsm_md_layout_version, + old_lsm->lsm_md_layout_version, + lsm->lsm_md_pool_name, + old_lsm->lsm_md_pool_name); + + for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in old lsm idx %d, old: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), idx, + PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); + } + + for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { + CERROR("%s: sub FIDs in new lsm idx %d, new: "DFID"\n", + ll_get_fsname(inode->i_sb, NULL, 0), idx, + PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); + } + + return -EIO; + } + + return 0; +} + void ll_clear_inode(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1031,14 +1297,15 @@ void ll_clear_inode(struct inode *inode) #ifdef CONFIG_FS_POSIX_ACL if (lli->lli_posix_acl) { - LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); posix_acl_release(lli->lli_posix_acl); lli->lli_posix_acl = NULL; } #endif lli->lli_inode_magic = LLI_INODE_DEAD; - if (!S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) + ll_dir_clear_lsm_md(inode); + if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) LASSERT(list_empty(&lli->lli_agl_list)); /* @@ -1103,10 +1370,10 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, op_data->op_attr.ia_valid = ia_valid; /* Extract epoch data if obtained. */ - op_data->op_handle = md.body->handle; - op_data->op_ioepoch = md.body->ioepoch; + op_data->op_handle = md.body->mbo_handle; + op_data->op_ioepoch = md.body->mbo_ioepoch; - ll_update_inode(inode, &md); + rc = ll_update_inode(inode, &md); ptlrpc_req_finished(request); return rc; @@ -1138,8 +1405,8 @@ static int ll_setattr_done_writing(struct inode *inode, rc = ll_som_update(inode, op_data); else if (rc) { CERROR("%s: inode "DFID" mdc truncate failed: rc = %d\n", - ll_i2sbi(inode)->ll_md_exp->exp_obd->obd_name, - PFID(ll_inode2fid(inode)), rc); + ll_i2sbi(inode)->ll_md_exp->exp_obd->obd_name, + PFID(ll_inode2fid(inode)), rc); } return rc; } @@ -1331,14 +1598,14 @@ int ll_setattr(struct dentry *de, struct iattr *attr) { int mode = d_inode(de)->i_mode; - if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == - (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + if ((attr->ia_valid & (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) == + (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; - if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == - (ATTR_SIZE|ATTR_MODE)) && + if (((attr->ia_valid & (ATTR_MODE | ATTR_FORCE | ATTR_SIZE)) == + (ATTR_SIZE | ATTR_MODE)) && (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || - (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + (((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) && !(attr->ia_mode & S_ISGID)))) attr->ia_valid |= ATTR_FORCE; @@ -1349,7 +1616,7 @@ int ll_setattr(struct dentry *de, struct iattr *attr) attr->ia_valid |= ATTR_KILL_SUID; if ((attr->ia_valid & ATTR_MODE) && - ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) && !(attr->ia_mode & S_ISGID) && !(attr->ia_valid & ATTR_KILL_SGID)) attr->ia_valid |= ATTR_KILL_SGID; @@ -1465,14 +1732,14 @@ void ll_inode_size_unlock(struct inode *inode) mutex_unlock(&lli->lli_size_mutex); } -void ll_update_inode(struct inode *inode, struct lustre_md *md) +int ll_update_inode(struct inode *inode, struct lustre_md *md) { struct ll_inode_info *lli = ll_i2info(inode); struct mdt_body *body = md->body; struct lov_stripe_md *lsm = md->lsm; struct ll_sb_info *sbi = ll_i2sbi(inode); - LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + LASSERT((lsm != NULL) == ((body->mbo_valid & OBD_MD_FLEASIZE) != 0)); if (lsm) { if (!lli->lli_has_smd && !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) @@ -1483,8 +1750,16 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) lli->lli_maxbytes = MAX_LFS_FILESIZE; } + if (S_ISDIR(inode->i_mode)) { + int rc; + + rc = ll_update_lsm_md(inode, md); + if (rc) + return rc; + } + #ifdef CONFIG_FS_POSIX_ACL - if (body->valid & OBD_MD_FLACL) { + if (body->mbo_valid & OBD_MD_FLACL) { spin_lock(&lli->lli_lock); if (lli->lli_posix_acl) posix_acl_release(lli->lli_posix_acl); @@ -1492,65 +1767,67 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) spin_unlock(&lli->lli_lock); } #endif - inode->i_ino = cl_fid_build_ino(&body->fid1, + inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, sbi->ll_flags & LL_SBI_32BIT_API); - inode->i_generation = cl_fid_build_gen(&body->fid1); + inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); - if (body->valid & OBD_MD_FLATIME) { - if (body->atime > LTIME_S(inode->i_atime)) - LTIME_S(inode->i_atime) = body->atime; - lli->lli_atime = body->atime; + if (body->mbo_valid & OBD_MD_FLATIME) { + if (body->mbo_atime > LTIME_S(inode->i_atime)) + LTIME_S(inode->i_atime) = body->mbo_atime; + lli->lli_atime = body->mbo_atime; } - if (body->valid & OBD_MD_FLMTIME) { - if (body->mtime > LTIME_S(inode->i_mtime)) { + if (body->mbo_valid & OBD_MD_FLMTIME) { + if (body->mbo_mtime > LTIME_S(inode->i_mtime)) { CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n", inode->i_ino, LTIME_S(inode->i_mtime), - body->mtime); - LTIME_S(inode->i_mtime) = body->mtime; + body->mbo_mtime); + LTIME_S(inode->i_mtime) = body->mbo_mtime; } - lli->lli_mtime = body->mtime; - } - if (body->valid & OBD_MD_FLCTIME) { - if (body->ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->ctime; - lli->lli_ctime = body->ctime; - } - if (body->valid & OBD_MD_FLMODE) - inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); - if (body->valid & OBD_MD_FLTYPE) - inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT); + lli->lli_mtime = body->mbo_mtime; + } + if (body->mbo_valid & OBD_MD_FLCTIME) { + if (body->mbo_ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->mbo_ctime; + lli->lli_ctime = body->mbo_ctime; + } + if (body->mbo_valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT) | + (body->mbo_mode & ~S_IFMT); + if (body->mbo_valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT) | + (body->mbo_mode & S_IFMT); LASSERT(inode->i_mode != 0); if (S_ISREG(inode->i_mode)) inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS); else inode->i_blkbits = inode->i_sb->s_blocksize_bits; - if (body->valid & OBD_MD_FLUID) - inode->i_uid = make_kuid(&init_user_ns, body->uid); - if (body->valid & OBD_MD_FLGID) - inode->i_gid = make_kgid(&init_user_ns, body->gid); - if (body->valid & OBD_MD_FLFLAGS) - inode->i_flags = ll_ext_to_inode_flags(body->flags); - if (body->valid & OBD_MD_FLNLINK) - set_nlink(inode, body->nlink); - if (body->valid & OBD_MD_FLRDEV) - inode->i_rdev = old_decode_dev(body->rdev); - - if (body->valid & OBD_MD_FLID) { + if (body->mbo_valid & OBD_MD_FLUID) + inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); + if (body->mbo_valid & OBD_MD_FLGID) + inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); + if (body->mbo_valid & OBD_MD_FLFLAGS) + inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); + if (body->mbo_valid & OBD_MD_FLNLINK) + set_nlink(inode, body->mbo_nlink); + if (body->mbo_valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->mbo_rdev); + + if (body->mbo_valid & OBD_MD_FLID) { /* FID shouldn't be changed! */ if (fid_is_sane(&lli->lli_fid)) { - LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1), + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), "Trying to change FID "DFID" to the "DFID", inode "DFID"(%p)\n", - PFID(&lli->lli_fid), PFID(&body->fid1), + PFID(&lli->lli_fid), PFID(&body->mbo_fid1), PFID(ll_inode2fid(inode)), inode); } else { - lli->lli_fid = body->fid1; + lli->lli_fid = body->mbo_fid1; } } LASSERT(fid_seq(&lli->lli_fid) != 0); - if (body->valid & OBD_MD_FLSIZE) { + if (body->mbo_valid & OBD_MD_FLSIZE) { if (exp_connect_som(ll_i2mdexp(inode)) && S_ISREG(inode->i_mode)) { struct lustre_handle lockh; @@ -1577,7 +1854,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) /* Use old size assignment to avoid * deadlock bz14138 & bz14326 */ - i_size_write(inode, body->size); + i_size_write(inode, body->mbo_size); spin_lock(&lli->lli_lock); lli->lli_flags |= LLIF_MDS_SIZE_LOCK; spin_unlock(&lli->lli_lock); @@ -1588,26 +1865,29 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) /* Use old size assignment to avoid * deadlock bz14138 & bz14326 */ - i_size_write(inode, body->size); + i_size_write(inode, body->mbo_size); CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n", - inode->i_ino, (unsigned long long)body->size); + inode->i_ino, (unsigned long long)body->mbo_size); } - if (body->valid & OBD_MD_FLBLOCKS) - inode->i_blocks = body->blocks; + if (body->mbo_valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->mbo_blocks; } - if (body->valid & OBD_MD_TSTATE) { - if (body->t_state & MS_RESTORE) + if (body->mbo_valid & OBD_MD_TSTATE) { + if (body->mbo_t_state & MS_RESTORE) lli->lli_flags |= LLIF_FILE_RESTORING; } + + return 0; } -void ll_read_inode2(struct inode *inode, void *opaque) +int ll_read_inode2(struct inode *inode, void *opaque) { struct lustre_md *md = opaque; struct ll_inode_info *lli = ll_i2info(inode); + int rc; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", PFID(&lli->lli_fid), inode); @@ -1623,7 +1903,9 @@ void ll_read_inode2(struct inode *inode, void *opaque) LTIME_S(inode->i_atime) = 0; LTIME_S(inode->i_ctime) = 0; inode->i_rdev = 0; - ll_update_inode(inode, md); + rc = ll_update_inode(inode, md); + if (rc) + return rc; /* OIDEBUG(inode); */ @@ -1644,6 +1926,8 @@ void ll_read_inode2(struct inode *inode, void *opaque) init_special_inode(inode, inode->i_mode, inode->i_rdev); } + + return 0; } void ll_delete_inode(struct inode *inode) @@ -1655,20 +1939,13 @@ void ll_delete_inode(struct inode *inode) * osc_extent implementation at LU-1030. */ cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, - CL_FSYNC_DISCARD, 1); + CL_FSYNC_LOCAL, 1); truncate_inode_pages_final(&inode->i_data); - /* Workaround for LU-118 */ - if (inode->i_data.nrpages) { - spin_lock_irq(&inode->i_data.tree_lock); - spin_unlock_irq(&inode->i_data.tree_lock); - LASSERTF(inode->i_data.nrpages == 0, - "inode="DFID"(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", - PFID(ll_inode2fid(inode)), inode, - inode->i_data.nrpages); - } - /* Workaround end */ + LASSERTF(!inode->i_data.nrpages, + "inode=" DFID "(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", + PFID(ll_inode2fid(inode)), inode, inode->i_data.nrpages); ll_clear_inode(inode); clear_inode(inode); @@ -1704,7 +1981,7 @@ int ll_iocontrol(struct inode *inode, struct file *file, body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - flags = body->flags; + flags = body->mbo_flags; ptlrpc_req_finished(req); @@ -1886,9 +2163,9 @@ void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) if (!op_data) return; - op_data->op_fid1 = body->fid1; - op_data->op_ioepoch = body->ioepoch; - op_data->op_handle = body->handle; + op_data->op_fid1 = body->mbo_fid1; + op_data->op_ioepoch = body->mbo_ioepoch; + op_data->op_handle = body->mbo_handle; op_data->op_mod_time = get_seconds(); md_close(exp, op_data, NULL, &close_req); ptlrpc_req_finished(close_req); @@ -1910,7 +2187,9 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, goto cleanup; if (*inode) { - ll_update_inode(*inode, &md); + rc = ll_update_inode(*inode, &md); + if (rc) + goto out; } else { LASSERT(sb); @@ -1918,18 +2197,18 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, * At this point server returns to client's same fid as client * generated for creating. So using ->fid1 is okay here. */ - if (!fid_is_sane(&md.body->fid1)) { + if (!fid_is_sane(&md.body->mbo_fid1)) { CERROR("%s: Fid is insane " DFID "\n", ll_get_fsname(sb, NULL, 0), - PFID(&md.body->fid1)); + PFID(&md.body->mbo_fid1)); rc = -EINVAL; goto out; } - *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->mbo_fid1, sbi->ll_flags & LL_SBI_32BIT_API), &md); - if (!*inode) { + if (IS_ERR(*inode)) { #ifdef CONFIG_FS_POSIX_ACL if (md.posix_acl) { posix_acl_release(md.posix_acl); @@ -2075,11 +2354,20 @@ int ll_process_config(struct lustre_cfg *lcfg) /* this function prepares md_op_data hint for passing ot down to MD stack. */ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, struct inode *i1, struct inode *i2, - const char *name, int namelen, - int mode, __u32 opc, void *data) + const char *name, size_t namelen, + u32 mode, __u32 opc, void *data) { - if (namelen > ll_i2sbi(i1)->ll_namelen) - return ERR_PTR(-ENAMETOOLONG); + if (!name) { + /* Do not reuse namelen for something else. */ + if (namelen) + return ERR_PTR(-EINVAL); + } else { + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + if (!lu_name_is_valid_2(name, namelen)) + return ERR_PTR(-EINVAL); + } if (!op_data) op_data = kzalloc(sizeof(*op_data), GFP_NOFS); @@ -2089,11 +2377,26 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, ll_i2gids(op_data->op_suppgids, i1, i2); op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_default_stripe_offset = -1; + if (S_ISDIR(i1->i_mode)) { + op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; + op_data->op_default_stripe_offset = + ll_i2info(i1)->lli_def_stripe_offset; + } - if (i2) + if (i2) { op_data->op_fid2 = *ll_inode2fid(i2); - else + if (S_ISDIR(i2->i_mode)) + op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; + } else { fid_zero(&op_data->op_fid2); + } + + if (ll_i2sbi(i1)->ll_flags & LL_SBI_64BIT_HASH) + op_data->op_cli_flags |= CLI_HASH64; + + if (ll_need_32bit_api(ll_i2sbi(i1))) + op_data->op_cli_flags |= CLI_API32; op_data->op_name = name; op_data->op_namelen = namelen; @@ -2105,26 +2408,12 @@ struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, op_data->op_bias = 0; op_data->op_cli_flags = 0; if ((opc == LUSTRE_OPC_CREATE) && name && - filename_is_volatile(name, namelen, NULL)) + filename_is_volatile(name, namelen, &op_data->op_mds)) op_data->op_bias |= MDS_CREATE_VOLATILE; - op_data->op_opc = opc; - op_data->op_mds = 0; + else + op_data->op_mds = 0; op_data->op_data = data; - /* If the file is being opened after mknod() (normally due to NFS) - * try to use the default stripe data from parent directory for - * allocating OST objects. Try to pass the parent FID to MDS. - */ - if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) && - !ll_i2info(i2)->lli_has_smd) { - struct ll_inode_info *lli = ll_i2info(i2); - - spin_lock(&lli->lli_lock); - if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid))) - op_data->op_fid1 = lli->lli_pfid; - spin_unlock(&lli->lli_lock); - } - /* When called by ll_setattr_raw, file is i1. */ if (ll_i2info(i1)->lli_flags & LLIF_DATA_MODIFIED) op_data->op_bias |= MDS_DATA_MODIFIED; @@ -2251,3 +2540,197 @@ void ll_dirty_page_discard_warn(struct page *page, int ioret) if (buf) free_page((unsigned long)buf); } + +ssize_t ll_copy_user_md(const struct lov_user_md __user *md, + struct lov_user_md **kbuf) +{ + struct lov_user_md lum; + ssize_t lum_size; + + if (copy_from_user(&lum, md, sizeof(lum))) { + lum_size = -EFAULT; + goto no_kbuf; + } + + lum_size = ll_lov_user_md_size(&lum); + if (lum_size < 0) + goto no_kbuf; + + *kbuf = kzalloc(lum_size, GFP_NOFS); + if (!*kbuf) { + lum_size = -ENOMEM; + goto no_kbuf; + } + + if (copy_from_user(*kbuf, md, lum_size) != 0) { + kfree(*kbuf); + *kbuf = NULL; + lum_size = -EFAULT; + } +no_kbuf: + return lum_size; +} + +/* + * Compute llite root squash state after a change of root squash + * configuration setting or add/remove of a lnet nid + */ +void ll_compute_rootsquash_state(struct ll_sb_info *sbi) +{ + struct root_squash_info *squash = &sbi->ll_squash; + lnet_process_id_t id; + bool matched; + int i; + + /* Update norootsquash flag */ + down_write(&squash->rsi_sem); + if (list_empty(&squash->rsi_nosquash_nids)) { + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + } else { + /* + * Do not apply root squash as soon as one of our NIDs is + * in the nosquash_nids list + */ + matched = false; + i = 0; + + while (LNetGetId(i++, &id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { + matched = true; + break; + } + } + if (matched) + sbi->ll_flags |= LL_SBI_NOROOTSQUASH; + else + sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; + } + up_write(&squash->rsi_sem); +} + +/** + * Parse linkea content to extract information about a given hardlink + * + * \param[in] ldata - Initialized linkea data + * \param[in] linkno - Link identifier + * \param[out] parent_fid - The entry's parent FID + * \param[in] size - Entry name destination buffer + * + * \retval 0 on success + * \retval Appropriate negative error code on failure + */ +static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno, + struct lu_fid *parent_fid, struct lu_name *ln) +{ + unsigned int idx; + int rc; + + rc = linkea_init(ldata); + if (rc < 0) + return rc; + + if (linkno >= ldata->ld_leh->leh_reccount) + /* beyond last link */ + return -ENODATA; + + linkea_first_entry(ldata); + for (idx = 0; ldata->ld_lee; idx++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, + parent_fid); + if (idx == linkno) + break; + + linkea_next_entry(ldata); + } + + if (idx < linkno) + return -ENODATA; + + return 0; +} + +/** + * Get parent FID and name of an identified link. Operation is performed for + * a given link number, letting the caller iterate over linkno to list one or + * all links of an entry. + * + * \param[in] file - File descriptor against which to perform the operation + * \param[in,out] arg - User-filled structure containing the linkno to operate + * on and the available size. It is eventually filled with + * the requested information or left untouched on error + * + * \retval - 0 on success + * \retval - Appropriate negative error code on failure + */ +int ll_getparent(struct file *file, struct getparent __user *arg) +{ + struct inode *inode = file_inode(file); + struct linkea_data *ldata; + struct lu_fid parent_fid; + struct lu_buf buf = { + .lb_buf = NULL, + .lb_len = 0 + }; + struct lu_name ln; + u32 name_size; + u32 linkno; + int rc; + + if (!capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + return -EPERM; + + if (get_user(name_size, &arg->gp_name_size)) + return -EFAULT; + + if (get_user(linkno, &arg->gp_linkno)) + return -EFAULT; + + if (name_size > PATH_MAX) + return -EINVAL; + + ldata = kzalloc(sizeof(*ldata), GFP_NOFS); + if (!ldata) + return -ENOMEM; + + rc = linkea_data_new(ldata, &buf); + if (rc < 0) + goto ldata_free; + + rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, + buf.lb_len, OBD_MD_FLXATTR); + if (rc < 0) + goto lb_free; + + rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); + if (rc < 0) + goto lb_free; + + if (ln.ln_namelen >= name_size) { + rc = -EOVERFLOW; + goto lb_free; + } + + if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) { + rc = -EFAULT; + goto lb_free; + } + + if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) { + rc = -EFAULT; + goto lb_free; + } + + if (put_user('\0', arg->gp_name + ln.ln_namelen)) { + rc = -EFAULT; + goto lb_free; + } + +lb_free: + lu_buf_free(&buf); +ldata_free: + kfree(ldata); + return rc; +} diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c index 66ee5db5fce8..436691814a5e 100644 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -43,9 +43,7 @@ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "llite_internal.h" -#include "../include/linux/lustre_compat25.h" static const struct vm_operations_struct ll_file_vm_ops; @@ -126,7 +124,7 @@ restart: fio = &io->u.ci_fault; fio->ft_index = index; - fio->ft_executable = vma->vm_flags&VM_EXEC; + fio->ft_executable = vma->vm_flags & VM_EXEC; /* * disable VM_SEQ_READ and use VM_RAND_READ to make sure that @@ -134,7 +132,7 @@ restart: * filemap_nopage. we do our readahead in ll_readpage. */ if (ra_flags) - *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + *ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ); vma->vm_flags &= ~VM_SEQ_READ; vma->vm_flags |= VM_RAND_READ; @@ -429,7 +427,6 @@ static void ll_vm_open(struct vm_area_struct *vma) struct inode *inode = file_inode(vma->vm_file); struct vvp_object *vob = cl_inode2vvp(inode); - LASSERT(vma->vm_file); LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); atomic_inc(&vob->vob_mmap_cnt); } @@ -442,7 +439,6 @@ static void ll_vm_close(struct vm_area_struct *vma) struct inode *inode = file_inode(vma->vm_file); struct vvp_object *vob = cl_inode2vvp(inode); - LASSERT(vma->vm_file); atomic_dec(&vob->vob_mmap_cnt); LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); } diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index 65972c892731..709230571b4b 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -38,7 +38,6 @@ */ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "llite_internal.h" #include <linux/exportfs.h> @@ -73,11 +72,6 @@ void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid) fsid->val[1] = key >> 32; } -static int ll_nfs_test_inode(struct inode *inode, void *opaque) -{ - return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); -} - struct inode *search_inode_for_lustre(struct super_block *sb, const struct lu_fid *fid) { @@ -92,7 +86,7 @@ struct inode *search_inode_for_lustre(struct super_block *sb, CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); - inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid); + inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); if (inode) return inode; @@ -153,12 +147,18 @@ ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *paren return ERR_PTR(-ESTALE); } + result = d_obtain_alias(inode); + if (IS_ERR(result)) { + iput(inode); + return result; + } + /** - * It is an anonymous dentry without OST objects created yet. - * We have to find the parent to tell MDS how to init lov objects. + * In case d_obtain_alias() found a disconnected dentry, always update + * lli_pfid to allow later operation (normally open) have parent fid, + * which may be used by MDS to create data. */ - if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd && - parent && !fid_is_zero(parent)) { + if (parent) { struct ll_inode_info *lli = ll_i2info(inode); spin_lock(&lli->lli_lock); @@ -255,6 +255,8 @@ static int ll_get_name(struct dentry *dentry, char *name, .lgd_fid = ll_i2info(d_inode(child))->lli_fid, .ctx.actor = ll_nfs_get_name_filldir, }; + struct md_op_data *op_data; + __u64 pos = 0; if (!dir || !S_ISDIR(dir->i_mode)) { rc = -ENOTDIR; @@ -266,9 +268,18 @@ static int ll_get_name(struct dentry *dentry, char *name, goto out; } + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out; + } + + op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; inode_lock(dir); - rc = ll_dir_read(dir, &lgd.ctx); + rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx); inode_unlock(dir); + ll_finish_md_op_data(op_data); if (!rc && !lgd.lgd_found) rc = -ENOENT; out: @@ -297,14 +308,12 @@ static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL); } -static struct dentry *ll_get_parent(struct dentry *dchild) +int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) { struct ptlrpc_request *req = NULL; - struct inode *dir = d_inode(dchild); struct ll_sb_info *sbi; - struct dentry *result = NULL; struct mdt_body *body; - static char dotdot[] = ".."; + static const char dotdot[] = ".."; struct md_op_data *op_data; int rc; int lmmsize; @@ -319,13 +328,13 @@ static struct dentry *ll_get_parent(struct dentry *dchild) rc = ll_get_default_mdsize(sbi, &lmmsize); if (rc != 0) - return ERR_PTR(rc); + return rc; op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, strlen(dotdot), lmmsize, LUSTRE_OPC_ANY, NULL); if (IS_ERR(op_data)) - return (void *)op_data; + return PTR_ERR(op_data); rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); ll_finish_md_op_data(op_data); @@ -333,21 +342,36 @@ static struct dentry *ll_get_parent(struct dentry *dchild) CERROR("%s: failure inode "DFID" get parent: rc = %d\n", ll_get_fsname(dir->i_sb, NULL, 0), PFID(ll_inode2fid(dir)), rc); - return ERR_PTR(rc); + return rc; } body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); /* * LU-3952: MDT may lost the FID of its parent, we should not crash * the NFS server, ll_iget_for_nfs() will handle the error. */ - if (body->valid & OBD_MD_FLID) { + if (body->mbo_valid & OBD_MD_FLID) { CDEBUG(D_INFO, "parent for " DFID " is " DFID "\n", - PFID(ll_inode2fid(dir)), PFID(&body->fid1)); + PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); + *parent_fid = body->mbo_fid1; } - result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL); ptlrpc_req_finished(req); - return result; + return 0; +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct lu_fid parent_fid = { 0 }; + struct dentry *dentry; + int rc; + + rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); + if (rc) + return ERR_PTR(rc); + + dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); + + return dentry; } const struct export_operations lustre_export_operations = { diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c index e86bf3c53be3..6eae60595905 100644 --- a/drivers/staging/lustre/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -31,7 +31,6 @@ */ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "../include/lprocfs_status.h" #include <linux/seq_file.h> #include "../include/obd_support.h" @@ -358,16 +357,16 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) struct ll_sb_info *sbi = ll_s2sbi(sb); struct cl_client_cache *cache = sbi->ll_cache; int shift = 20 - PAGE_SHIFT; - int max_cached_mb; - int unused_mb; + long max_cached_mb; + long unused_mb; max_cached_mb = cache->ccc_lru_max >> shift; - unused_mb = atomic_read(&cache->ccc_lru_left) >> shift; + unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; seq_printf(m, "users: %d\n" - "max_cached_mb: %d\n" - "used_mb: %d\n" - "unused_mb: %d\n" + "max_cached_mb: %ld\n" + "used_mb: %ld\n" + "unused_mb: %ld\n" "reclaim_count: %u\n", atomic_read(&cache->ccc_users), max_cached_mb, @@ -385,10 +384,13 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, struct ll_sb_info *sbi = ll_s2sbi(sb); struct cl_client_cache *cache = sbi->ll_cache; struct lu_env *env; + long diff = 0; + long nrpages = 0; int refcheck; - int mult, rc, pages_number; - int diff = 0; - int nrpages = 0; + long pages_number; + int mult; + long rc; + u64 val; char kernbuf[128]; if (count >= sizeof(kernbuf)) @@ -401,10 +403,14 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, mult = 1 << (20 - PAGE_SHIFT); buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - kernbuf; - rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); if (rc) return rc; + if (val > LONG_MAX) + return -ERANGE; + pages_number = (long)val; + if (pages_number < 0 || pages_number > totalram_pages) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), @@ -418,7 +424,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, /* easy - add more LRU slots. */ if (diff >= 0) { - atomic_add(diff, &cache->ccc_lru_left); + atomic_long_add(diff, &cache->ccc_lru_left); rc = 0; goto out; } @@ -429,18 +435,18 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, diff = -diff; while (diff > 0) { - int tmp; + long tmp; /* reduce LRU budget from free slots. */ do { - int ov, nv; + long ov, nv; - ov = atomic_read(&cache->ccc_lru_left); + ov = atomic_long_read(&cache->ccc_lru_left); if (ov == 0) break; nv = ov > diff ? ov - diff : 0; - rc = atomic_cmpxchg(&cache->ccc_lru_left, ov, nv); + rc = atomic_long_cmpxchg(&cache->ccc_lru_left, ov, nv); if (likely(ov == rc)) { diff -= ov - nv; nrpages += ov - nv; @@ -474,7 +480,7 @@ out: spin_unlock(&sbi->ll_lock); rc = count; } else { - atomic_add(nrpages, &cache->ccc_lru_left); + atomic_long_add(nrpages, &cache->ccc_lru_left); } return rc; } @@ -738,6 +744,18 @@ static ssize_t max_easize_show(struct kobject *kobj, } LUSTRE_RO_ATTR(max_easize); +/** + * Get default_easize. + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] kobj kernel object for sysfs tree + * \param[in] attr attribute of this kernel object + * \param[in] buf buffer to write data into + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ static ssize_t default_easize_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -753,7 +771,44 @@ static ssize_t default_easize_show(struct kobject *kobj, return sprintf(buf, "%u\n", ealen); } -LUSTRE_RO_ATTR(default_easize); + +/** + * Set default_easize. + * + * Range checking on the passed value is handled by + * ll_set_default_mdsize(). + * + * \see client_obd::cl_default_mds_easize + * + * \param[in] kobj kernel object for sysfs tree + * \param[in] attr attribute of this kernel object + * \param[in] buffer string passed from user space + * \param[in] count \a buffer length + * + * \retval positive \a count on success + * \retval negative negated errno on failure + */ +static ssize_t default_easize_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kobj); + unsigned long val; + int rc; + + rc = kstrtoul(buffer, 10, &val); + if (rc) + return rc; + + rc = ll_set_default_mdsize(sbi, val); + if (rc) + return rc; + + return count; +} +LUSTRE_RW_ATTR(default_easize); static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) { @@ -774,7 +829,7 @@ static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) flags >>= 1; ++i; } - seq_printf(m, "\b\n"); + seq_puts(m, "\b\n"); return 0; } @@ -823,15 +878,116 @@ static ssize_t unstable_stats_show(struct kobject *kobj, struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, ll_kobj); struct cl_client_cache *cache = sbi->ll_cache; - int pages, mb; + long pages; + int mb; - pages = atomic_read(&cache->ccc_unstable_nr); + pages = atomic_long_read(&cache->ccc_unstable_nr); mb = (pages * PAGE_SIZE) >> 20; - return sprintf(buf, "unstable_pages: %8d\n" - "unstable_mb: %8d\n", pages, mb); + return sprintf(buf, "unstable_check: %8d\n" + "unstable_pages: %12ld\n" + "unstable_mb: %8d\n", + cache->ccc_unstable_check, pages, mb); } -LUSTRE_RO_ATTR(unstable_stats); + +static ssize_t unstable_stats_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kobj); + char kernbuf[128]; + int val, rc; + + if (!count) + return 0; + if (count >= sizeof(kernbuf)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + kernbuf[count] = 0; + + buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - + kernbuf; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + /* borrow lru lock to set the value */ + spin_lock(&sbi->ll_cache->ccc_lru_lock); + sbi->ll_cache->ccc_unstable_check = !!val; + spin_unlock(&sbi->ll_cache->ccc_lru_lock); + + return count; +} +LUSTRE_RW_ATTR(unstable_stats); + +static ssize_t root_squash_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kobj); + struct root_squash_info *squash = &sbi->ll_squash; + + return sprintf(buf, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); +} + +static ssize_t root_squash_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kobj); + struct root_squash_info *squash = &sbi->ll_squash; + + return lprocfs_wr_root_squash(buffer, count, squash, + ll_get_fsname(sbi->ll_sb, NULL, 0)); +} +LUSTRE_RW_ATTR(root_squash); + +static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int len; + + down_read(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) { + len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, + &squash->rsi_nosquash_nids); + m->count += len; + seq_puts(m, "\n"); + } else { + seq_puts(m, "NONE\n"); + } + up_read(&squash->rsi_sem); + + return 0; +} + +static ssize_t ll_nosquash_nids_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct root_squash_info *squash = &sbi->ll_squash; + int rc; + + rc = lprocfs_wr_nosquash_nids(buffer, count, squash, + ll_get_fsname(sb, NULL, 0)); + if (rc < 0) + return rc; + + ll_compute_rootsquash_state(sbi); + + return rc; +} + +LPROC_SEQ_FOPS(ll_nosquash_nids); static struct lprocfs_vars lprocfs_llite_obd_vars[] = { /* { "mntpt_path", ll_rd_path, 0, 0 }, */ @@ -840,6 +996,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "max_cached_mb", &ll_max_cached_mb_fops, NULL }, { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 }, { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 }, + { .name = "nosquash_nids", + .fops = &ll_nosquash_nids_fops }, { NULL } }; @@ -869,6 +1027,7 @@ static struct attribute *llite_attrs[] = { &lustre_attr_default_easize.attr, &lustre_attr_xattr_cache.attr, &lustre_attr_unstable_stats.attr, + &lustre_attr_root_squash.attr, NULL, }; @@ -893,17 +1052,17 @@ static const struct llite_file_opcode { /* file operation */ { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, - { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, "read_bytes" }, - { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, "write_bytes" }, - { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, "brw_read" }, - { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, "brw_write" }, - { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, "osc_read" }, - { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, "osc_write" }, { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, @@ -1150,7 +1309,7 @@ static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, r, pct(r, read_tot), pct(read_cum, read_tot), w, pct(w, write_tot), pct(write_cum, write_tot)); start = end; - if (start == 1<<10) { + if (start == 1 << 10) { start = 1; units += 10; unitp++; diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c index 2c4dc69731e8..dfa36d34c645 100644 --- a/drivers/staging/lustre/lustre/llite/namei.c +++ b/drivers/staging/lustre/lustre/llite/namei.c @@ -42,13 +42,12 @@ #include "../include/obd_support.h" #include "../include/lustre_fid.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "../include/lustre_ver.h" #include "llite_internal.h" -static int ll_create_it(struct inode *, struct dentry *, - int, struct lookup_intent *); +static int ll_create_it(struct inode *dir, struct dentry *dentry, + struct lookup_intent *it); /* called from iget5_locked->find_inode() under inode_hash_lock spinlock */ static int ll_test_inode(struct inode *inode, void *opaque) @@ -56,12 +55,12 @@ static int ll_test_inode(struct inode *inode, void *opaque) struct ll_inode_info *lli = ll_i2info(inode); struct lustre_md *md = opaque; - if (unlikely(!(md->body->valid & OBD_MD_FLID))) { + if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) { CERROR("MDS body missing FID\n"); return 0; } - if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1)) + if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1)) return 0; return 1; @@ -72,20 +71,20 @@ static int ll_set_inode(struct inode *inode, void *opaque) struct ll_inode_info *lli = ll_i2info(inode); struct mdt_body *body = ((struct lustre_md *)opaque)->body; - if (unlikely(!(body->valid & OBD_MD_FLID))) { + if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) { CERROR("MDS body missing FID\n"); return -EINVAL; } - lli->lli_fid = body->fid1; - if (unlikely(!(body->valid & OBD_MD_FLTYPE))) { + lli->lli_fid = body->mbo_fid1; + if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) { CERROR("Can not initialize inode " DFID " without object type: valid = %#llx\n", - PFID(&lli->lli_fid), body->valid); + PFID(&lli->lli_fid), body->mbo_valid); return -EINVAL; } - inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT); + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT); if (unlikely(inode->i_mode == 0)) { CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); return -EINVAL; @@ -96,41 +95,45 @@ static int ll_set_inode(struct inode *inode, void *opaque) return 0; } -/* - * Get an inode by inode number (already instantiated by the intent lookup). - * Returns inode or NULL +/** + * Get an inode by inode number(@hash), which is already instantiated by + * the intent lookup). */ struct inode *ll_iget(struct super_block *sb, ino_t hash, struct lustre_md *md) { struct inode *inode; + int rc = 0; LASSERT(hash != 0); inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); - - if (inode) { - if (inode->i_state & I_NEW) { - int rc = 0; - - ll_read_inode2(inode, md); - if (S_ISREG(inode->i_mode) && - !ll_i2info(inode)->lli_clob) { - CDEBUG(D_INODE, - "%s: apply lsm %p to inode " DFID ".\n", - ll_get_fsname(sb, NULL, 0), md->lsm, - PFID(ll_inode2fid(inode))); - rc = cl_file_inode_init(inode, md); - } - if (rc != 0) { - iget_failed(inode); - inode = NULL; - } else { - unlock_new_inode(inode); - } - } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - ll_update_inode(inode, md); - CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p)\n", - PFID(&md->body->fid1), inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + rc = ll_read_inode2(inode, md); + if (!rc && S_ISREG(inode->i_mode) && + !ll_i2info(inode)->lli_clob) { + CDEBUG(D_INODE, "%s: apply lsm %p to inode "DFID"\n", + ll_get_fsname(sb, NULL, 0), md->lsm, + PFID(ll_inode2fid(inode))); + rc = cl_file_inode_init(inode, md); + } + if (rc) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else { + unlock_new_inode(inode); + } + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + rc = ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n", + PFID(&md->body->mbo_fid1), inode, rc); + if (rc) { + iput(inode); + inode = ERR_PTR(rc); } } return inode; @@ -158,6 +161,11 @@ static void ll_invalidate_negative_children(struct inode *dir) spin_unlock(&dir->i_lock); } +int ll_test_inode_by_fid(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); +} + int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag) { @@ -196,6 +204,8 @@ int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, } if (bits & MDS_INODELOCK_XATTR) { + if (S_ISDIR(inode->i_mode)) + ll_i2info(inode)->lli_def_stripe_offset = -1; ll_xattr_cache_destroy(inode); bits &= ~MDS_INODELOCK_XATTR; } @@ -253,10 +263,41 @@ int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, } if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - CDEBUG(D_INODE, "invalidating inode "DFID"\n", - PFID(ll_inode2fid(inode))); + struct ll_inode_info *lli = ll_i2info(inode); + + CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, pfid = "DFID"\n", + PFID(ll_inode2fid(inode)), lli, + PFID(&lli->lli_pfid)); + truncate_inode_pages(inode->i_mapping, 0); - ll_invalidate_negative_children(inode); + + if (unlikely(!fid_is_zero(&lli->lli_pfid))) { + struct inode *master_inode = NULL; + unsigned long hash; + + /* + * This is slave inode, since all of the child + * dentry is connected on the master inode, so + * we have to invalidate the negative children + * on master inode + */ + CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n", + PFID(ll_inode2fid(inode)), + PFID(&lli->lli_pfid)); + + hash = cl_fid_build_ino(&lli->lli_pfid, + ll_need_32bit_api(ll_i2sbi(inode))); + + master_inode = ilookup5(inode->i_sb, hash, + ll_test_inode_by_fid, + (void *)&lli->lli_pfid); + if (master_inode && !IS_ERR(master_inode)) { + ll_invalidate_negative_children(master_inode); + iput(master_inode); + } + } else { + ll_invalidate_negative_children(inode); + } } if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && @@ -322,7 +363,8 @@ static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) LASSERT(alias != dentry); spin_lock(&alias->d_lock); - if (alias->d_flags & DCACHE_DISCONNECTED) + if ((alias->d_flags & DCACHE_DISCONNECTED) && + S_ISDIR(inode->i_mode)) /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ discon_alias = alias; else if (alias->d_parent == dentry->d_parent && @@ -433,9 +475,20 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lookup_intent parent_it = { .it_op = IT_GETATTR, .it_lock_handle = 0 }; + struct lu_fid fid = ll_i2info(parent)->lli_fid; + + /* If it is striped directory, get the real stripe parent */ + if (unlikely(ll_i2info(parent)->lli_lsm_md)) { + rc = md_get_fid_from_lsm(ll_i2mdexp(parent), + ll_i2info(parent)->lli_lsm_md, + (*de)->d_name.name, + (*de)->d_name.len, &fid); + if (rc) + return rc; + } - if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, - &ll_i2info(parent)->lli_fid, NULL)) { + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, + NULL)) { d_lustre_revalidate(*de); ll_intent_release(&parent_it); } @@ -449,13 +502,13 @@ out: } static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, - struct lookup_intent *it, int lookup_flags) + struct lookup_intent *it) { struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct dentry *save = dentry, *retval; struct ptlrpc_request *req = NULL; + struct md_op_data *op_data = NULL; struct inode *inode; - struct md_op_data *op_data; __u32 opc; int rc; @@ -471,8 +524,8 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (!it || it->it_op == IT_GETXATTR) it = &lookup_it; - if (it->it_op == IT_GETATTR) { - rc = ll_statahead_enter(parent, &dentry, 0); + if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { + rc = ll_statahead(parent, &dentry, 0); if (rc == 1) { if (dentry == save) retval = NULL; @@ -488,8 +541,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, opc = LUSTRE_OPC_ANY; op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, - dentry->d_name.len, lookup_flags, opc, - NULL); + dentry->d_name.len, 0, opc, NULL); if (IS_ERR(op_data)) return (void *)op_data; @@ -497,9 +549,38 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) it->it_create_mode &= ~current_umask(); - rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it, - lookup_flags, &req, ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + &ll_md_blocking_ast, 0); + /* + * If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the + * client does not know which suppgid should be sent to the MDS, or + * some other(s) changed the target file's GID after this RPC sent + * to the MDS with the suppgid as the original GID, then we should + * try again with right suppgid. + */ + if (rc == -EACCES && it->it_op & IT_OPEN && + it_disposition(it, DISP_OPEN_DENY)) { + struct mdt_body *body; + + LASSERT(req); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (op_data->op_suppgids[0] == body->mbo_gid || + op_data->op_suppgids[1] == body->mbo_gid || + !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) { + retval = ERR_PTR(-EACCES); + goto out; + } + + fid_zero(&op_data->op_fid2); + op_data->op_suppgids[1] = body->mbo_gid; + ptlrpc_req_finished(req); + req = NULL; + ll_intent_release(it); + rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, + ll_md_blocking_ast, 0); + } + if (rc < 0) { retval = ERR_PTR(rc); goto out; @@ -524,11 +605,11 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, retval = NULL; else retval = dentry; - out: - if (req) - ptlrpc_req_finished(req); - if (it->it_op == IT_GETATTR && (!retval || retval == dentry)) - ll_statahead_mark(parent, dentry); +out: + if (op_data && !IS_ERR(op_data)) + ll_finish_md_op_data(op_data); + + ptlrpc_req_finished(req); return retval; } @@ -541,15 +622,19 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),flags=%u\n", dentry, PFID(ll_inode2fid(parent)), parent, flags); - /* Optimize away (CREATE && !OPEN). Let .create handle the race. */ - if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN)) + /* Optimize away (CREATE && !OPEN). Let .create handle the race. + * but only if we have write permissions there, otherwise we need + * to proceed with lookup. LU-4185 + */ + if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && + (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0)) return NULL; - if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + if (flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) itp = NULL; else itp = ⁢ - de = ll_lookup_it(parent, dentry, itp, 0); + de = ll_lookup_it(parent, dentry, itp); if (itp) ll_intent_release(itp); @@ -567,7 +652,6 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, { struct lookup_intent *it; struct dentry *de; - long long lookup_flags = LOOKUP_OPEN; int rc = 0; CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),file %p,open_flags %x,mode %x opened %d\n", @@ -597,15 +681,14 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, return -ENOMEM; it->it_op = IT_OPEN; - if (open_flags & O_CREAT) { + if (open_flags & O_CREAT) it->it_op |= IT_CREAT; - lookup_flags |= LOOKUP_CREATE; - } it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + it->it_flags &= ~MDS_OPEN_FL_INTERNAL; /* Dentry added to dcache tree in ll_lookup_it */ - de = ll_lookup_it(dir, dentry, it, lookup_flags); + de = ll_lookup_it(dir, dentry, it); if (IS_ERR(de)) rc = PTR_ERR(de); else if (de) @@ -614,7 +697,7 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, if (!rc) { if (it_disposition(it, DISP_OPEN_CREATE)) { /* Dentry instantiated in ll_create_it. */ - rc = ll_create_it(dir, dentry, mode, it); + rc = ll_create_it(dir, dentry, it); if (rc) { /* We dget in ll_splice_alias. */ if (de) @@ -700,7 +783,7 @@ static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, +static int ll_create_it(struct inode *dir, struct dentry *dentry, struct lookup_intent *it) { struct inode *inode; @@ -721,27 +804,26 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, return 0; } -static void ll_update_times(struct ptlrpc_request *request, - struct inode *inode) +void ll_update_times(struct ptlrpc_request *request, struct inode *inode) { struct mdt_body *body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); LASSERT(body); - if (body->valid & OBD_MD_FLMTIME && - body->mtime > LTIME_S(inode->i_mtime)) { + if (body->mbo_valid & OBD_MD_FLMTIME && + body->mbo_mtime > LTIME_S(inode->i_mtime)) { CDEBUG(D_INODE, "setting fid "DFID" mtime from %lu to %llu\n", PFID(ll_inode2fid(inode)), LTIME_S(inode->i_mtime), - body->mtime); - LTIME_S(inode->i_mtime) = body->mtime; + body->mbo_mtime); + LTIME_S(inode->i_mtime) = body->mbo_mtime; } - if (body->valid & OBD_MD_FLCTIME && - body->ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->ctime; + if (body->mbo_valid & OBD_MD_FLCTIME && + body->mbo_ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->mbo_ctime; } static int ll_new_node(struct inode *dir, struct dentry *dentry, - const char *tgt, int mode, int rdev, + const char *tgt, umode_t mode, int rdev, __u32 opc) { struct ptlrpc_request *request = NULL; @@ -753,7 +835,7 @@ static int ll_new_node(struct inode *dir, struct dentry *dentry, if (unlikely(tgt)) tgt_len = strlen(tgt) + 1; - +again: op_data = ll_prep_md_op_data(NULL, dir, NULL, dentry->d_name.name, dentry->d_name.len, @@ -768,9 +850,45 @@ static int ll_new_node(struct inode *dir, struct dentry *dentry, from_kgid(&init_user_ns, current_fsgid()), cfs_curproc_cap_pack(), rdev, &request); ll_finish_md_op_data(op_data); - if (err) + if (err < 0 && err != -EREMOTE) goto err_exit; + /* + * If the client doesn't know where to create a subdirectory (or + * in case of a race that sends the RPC to the wrong MDS), the + * MDS will return -EREMOTE and the client will fetch the layout + * of the directory, then create the directory on the right MDT. + */ + if (unlikely(err == -EREMOTE)) { + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_user_md *lum; + int lumsize, err2; + + ptlrpc_req_finished(request); + request = NULL; + + err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, + OBD_MD_DEFAULT_MEA); + if (!err2) { + /* Update stripe_offset and retry */ + lli->lli_def_stripe_offset = lum->lum_stripe_offset; + } else if (err2 == -ENODATA && + lli->lli_def_stripe_offset != -1) { + /* + * If there are no default stripe EA on the MDT, but the + * client has default stripe, then it probably means + * default stripe EA has just been deleted. + */ + lli->lli_def_stripe_offset = -1; + } else { + goto err_exit; + } + + ptlrpc_req_finished(request); + request = NULL; + goto again; + } + ll_update_times(request, dir); err = ll_prep_inode(&inode, request, dir->i_sb, NULL); @@ -779,7 +897,8 @@ static int ll_new_node(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); err_exit: - ptlrpc_req_finished(request); + if (request) + ptlrpc_req_finished(request); return err; } @@ -842,77 +961,6 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry, return rc; } -int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) -{ - struct mdt_body *body; - struct lov_mds_md *eadata; - struct lov_stripe_md *lsm = NULL; - struct obd_trans_info oti = { 0 }; - struct obdo *oa; - int rc; - - /* req is swabbed so this is safe */ - body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); - if (!(body->valid & OBD_MD_FLEASIZE)) - return 0; - - if (body->eadatasize == 0) { - CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); - rc = -EPROTO; - goto out; - } - - /* The MDS sent back the EA because we unlinked the last reference - * to this file. Use this EA to unlink the objects on the OST. - * It's opaque so we don't swab here; we leave it to obd_unpackmd() to - * check it is complete and sensible. - */ - eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD, - body->eadatasize); - LASSERT(eadata); - - rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize); - if (rc < 0) { - CERROR("obd_unpackmd: %d\n", rc); - goto out; - } - LASSERT(rc >= sizeof(*lsm)); - - oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!oa) { - rc = -ENOMEM; - goto out_free_memmd; - } - - oa->o_oi = lsm->lsm_oi; - oa->o_mode = body->mode & S_IFMT; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP; - - if (body->valid & OBD_MD_FLCOOKIE) { - oa->o_valid |= OBD_MD_FLCOOKIE; - oti.oti_logcookies = - req_capsule_server_sized_get(&request->rq_pill, - &RMF_LOGCOOKIES, - sizeof(struct llog_cookie) * - lsm->lsm_stripe_count); - if (!oti.oti_logcookies) { - oa->o_valid &= ~OBD_MD_FLCOOKIE; - body->valid &= ~OBD_MD_FLCOOKIE; - } - } - - rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti, - ll_i2mdexp(dir)); - if (rc) - CERROR("obd destroy objid "DOSTID" error %d\n", - POSTID(&lsm->lsm_oi), rc); -out_free_memmd: - obd_free_memmd(ll_i2dtexp(dir), &lsm); - kmem_cache_free(obdo_cachep, oa); -out: - return rc; -} - /* ll_unlink() doesn't update the inode with the new link count. * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there * is any lock existing. They will recycle dentries and inodes based upon locks @@ -934,7 +982,7 @@ static int ll_unlink(struct inode *dir, struct dentry *dchild) if (IS_ERR(op_data)) return PTR_ERR(op_data); - if (dchild && dchild->d_inode) + if (dchild->d_inode) op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); op_data->op_fid2 = op_data->op_fid3; @@ -946,7 +994,6 @@ static int ll_unlink(struct inode *dir, struct dentry *dchild) ll_update_times(request, dir); ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); - rc = ll_objects_destroy(request, dir); out: ptlrpc_req_finished(request); return rc; @@ -961,9 +1008,9 @@ static int ll_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) mode &= ~current_umask(); - mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; - err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); + mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; + err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); if (!err) ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); @@ -986,7 +1033,7 @@ static int ll_rmdir(struct inode *dir, struct dentry *dchild) if (IS_ERR(op_data)) return PTR_ERR(op_data); - if (dchild && dchild->d_inode) + if (dchild->d_inode) op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); op_data->op_fid2 = op_data->op_fid3; @@ -1067,9 +1114,9 @@ static int ll_rename(struct inode *src, struct dentry *src_dchild, if (IS_ERR(op_data)) return PTR_ERR(op_data); - if (src_dchild && src_dchild->d_inode) + if (src_dchild->d_inode) op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); - if (tgt_dchild && tgt_dchild->d_inode) + if (tgt_dchild->d_inode) op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); err = md_rename(sbi->ll_md_exp, op_data, @@ -1082,7 +1129,6 @@ static int ll_rename(struct inode *src, struct dentry *src_dchild, ll_update_times(request, src); ll_update_times(request, tgt); ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); - err = ll_objects_destroy(request, src); } ptlrpc_req_finished(request); @@ -1106,10 +1152,10 @@ const struct inode_operations ll_dir_inode_operations = { .setattr = ll_setattr, .getattr = ll_getattr, .permission = ll_inode_permission, - .setxattr = ll_setxattr, - .getxattr = ll_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ll_listxattr, - .removexattr = ll_removexattr, + .removexattr = generic_removexattr, .get_acl = ll_get_acl, }; @@ -1117,9 +1163,9 @@ const struct inode_operations ll_special_inode_operations = { .setattr = ll_setattr, .getattr = ll_getattr, .permission = ll_inode_permission, - .setxattr = ll_setxattr, - .getxattr = ll_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ll_listxattr, - .removexattr = ll_removexattr, + .removexattr = generic_removexattr, .get_acl = ll_get_acl, }; diff --git a/drivers/staging/lustre/lustre/llite/range_lock.c b/drivers/staging/lustre/lustre/llite/range_lock.c new file mode 100644 index 000000000000..94c818f1478b --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/range_lock.c @@ -0,0 +1,233 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara <jack@suse.cz>: https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya <surya1@llnl.gov> + * Author: Bobi Jam <bobijam.xu@intel.com> + */ +#include "range_lock.h" +#include "../include/lustre/lustre_user.h" + +/** + * Initialize a range lock tree + * + * \param tree [in] an empty range lock tree + * + * Pre: Caller should have allocated the range lock tree. + * Post: The range lock tree is ready to function. + */ +void range_lock_tree_init(struct range_lock_tree *tree) +{ + tree->rlt_root = NULL; + tree->rlt_sequence = 0; + spin_lock_init(&tree->rlt_lock); +} + +/** + * Initialize a range lock node + * + * \param lock [in] an empty range lock node + * \param start [in] start of the covering region + * \param end [in] end of the covering region + * + * Pre: Caller should have allocated the range lock node. + * Post: The range lock node is meant to cover [start, end] region + */ +void range_lock_init(struct range_lock *lock, __u64 start, __u64 end) +{ + memset(&lock->rl_node, 0, sizeof(lock->rl_node)); + if (end != LUSTRE_EOF) + end >>= PAGE_SHIFT; + interval_set(&lock->rl_node, start >> PAGE_SHIFT, end); + INIT_LIST_HEAD(&lock->rl_next_lock); + lock->rl_task = NULL; + lock->rl_lock_count = 0; + lock->rl_blocking_ranges = 0; + lock->rl_sequence = 0; +} + +static inline struct range_lock *next_lock(struct range_lock *lock) +{ + return list_entry(lock->rl_next_lock.next, typeof(*lock), rl_next_lock); +} + +/** + * Helper function of range_unlock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_unlock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = arg; + struct range_lock *overlap = node2rangelock(node); + struct range_lock *iter; + + list_for_each_entry(iter, &overlap->rl_next_lock, rl_next_lock) { + if (iter->rl_sequence > lock->rl_sequence) { + --iter->rl_blocking_ranges; + LASSERT(iter->rl_blocking_ranges > 0); + } + } + if (overlap->rl_sequence > lock->rl_sequence) { + --overlap->rl_blocking_ranges; + if (overlap->rl_blocking_ranges == 0) + wake_up_process(overlap->rl_task); + } + return INTERVAL_ITER_CONT; +} + +/** + * Unlock a range lock, wake up locks blocked by this lock. + * + * \param tree [in] range lock tree + * \param lock [in] range lock to be deleted + * + * If this lock has been granted, relase it; if not, just delete it from + * the tree or the same region lock list. Wake up those locks only blocked + * by this lock through range_unlock_cb(). + */ +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) +{ + spin_lock(&tree->rlt_lock); + if (!list_empty(&lock->rl_next_lock)) { + struct range_lock *next; + + if (interval_is_intree(&lock->rl_node)) { /* first lock */ + /* Insert the next same range lock into the tree */ + next = next_lock(lock); + next->rl_lock_count = lock->rl_lock_count - 1; + interval_erase(&lock->rl_node, &tree->rlt_root); + interval_insert(&next->rl_node, &tree->rlt_root); + } else { + /* find the first lock in tree */ + list_for_each_entry(next, &lock->rl_next_lock, + rl_next_lock) { + if (!interval_is_intree(&next->rl_node)) + continue; + + LASSERT(next->rl_lock_count > 0); + next->rl_lock_count--; + break; + } + } + list_del_init(&lock->rl_next_lock); + } else { + LASSERT(interval_is_intree(&lock->rl_node)); + interval_erase(&lock->rl_node, &tree->rlt_root); + } + + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_unlock_cb, lock); + spin_unlock(&tree->rlt_lock); +} + +/** + * Helper function of range_lock() + * + * \param node [in] a range lock found overlapped during interval node + * search + * \param arg [in] the range lock to be tested + * + * \retval INTERVAL_ITER_CONT indicate to continue the search for next + * overlapping range node + * \retval INTERVAL_ITER_STOP indicate to stop the search + */ +static enum interval_iter range_lock_cb(struct interval_node *node, void *arg) +{ + struct range_lock *lock = (struct range_lock *)arg; + struct range_lock *overlap = node2rangelock(node); + + lock->rl_blocking_ranges += overlap->rl_lock_count + 1; + return INTERVAL_ITER_CONT; +} + +/** + * Lock a region + * + * \param tree [in] range lock tree + * \param lock [in] range lock node containing the region span + * + * \retval 0 get the range lock + * \retval <0 error code while not getting the range lock + * + * If there exists overlapping range lock, the new lock will wait and + * retry, if later it find that it is not the chosen one to wake up, + * it wait again. + */ +int range_lock(struct range_lock_tree *tree, struct range_lock *lock) +{ + struct interval_node *node; + int rc = 0; + + spin_lock(&tree->rlt_lock); + /* + * We need to check for all conflicting intervals + * already in the tree. + */ + interval_search(tree->rlt_root, &lock->rl_node.in_extent, + range_lock_cb, lock); + /* + * Insert to the tree if I am unique, otherwise I've been linked to + * the rl_next_lock of another lock which has the same range as mine + * in range_lock_cb(). + */ + node = interval_insert(&lock->rl_node, &tree->rlt_root); + if (node) { + struct range_lock *tmp = node2rangelock(node); + + list_add_tail(&lock->rl_next_lock, &tmp->rl_next_lock); + tmp->rl_lock_count++; + } + lock->rl_sequence = ++tree->rlt_sequence; + + while (lock->rl_blocking_ranges > 0) { + lock->rl_task = current; + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&tree->rlt_lock); + schedule(); + + if (signal_pending(current)) { + range_unlock(tree, lock); + rc = -EINTR; + goto out; + } + spin_lock(&tree->rlt_lock); + } + spin_unlock(&tree->rlt_lock); +out: + return rc; +} diff --git a/drivers/staging/lustre/lustre/llite/range_lock.h b/drivers/staging/lustre/lustre/llite/range_lock.h new file mode 100644 index 000000000000..c6d04a6f99fd --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/range_lock.h @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Range lock is used to allow multiple threads writing a single shared + * file given each thread is writing to a non-overlapping portion of the + * file. + * + * Refer to the possible upstream kernel version of range lock by + * Jan Kara <jack@suse.cz>: https://lkml.org/lkml/2013/1/31/480 + * + * This file could later replaced by the upstream kernel version. + */ +/* + * Author: Prakash Surya <surya1@llnl.gov> + * Author: Bobi Jam <bobijam.xu@intel.com> + */ +#ifndef _RANGE_LOCK_H +#define _RANGE_LOCK_H + +#include "../../include/linux/libcfs/libcfs.h" +#include "../include/interval_tree.h" + +struct range_lock { + struct interval_node rl_node; + /** + * Process to enqueue this lock. + */ + struct task_struct *rl_task; + /** + * List of locks with the same range. + */ + struct list_head rl_next_lock; + /** + * Number of locks in the list rl_next_lock + */ + unsigned int rl_lock_count; + /** + * Number of ranges which are blocking acquisition of the lock + */ + unsigned int rl_blocking_ranges; + /** + * Sequence number of range lock. This number is used to get to know + * the order the locks are queued; this is required for range_cancel(). + */ + __u64 rl_sequence; +}; + +static inline struct range_lock *node2rangelock(const struct interval_node *n) +{ + return container_of(n, struct range_lock, rl_node); +} + +struct range_lock_tree { + struct interval_node *rlt_root; + spinlock_t rlt_lock; /* protect range lock tree */ + __u64 rlt_sequence; +}; + +void range_lock_tree_init(struct range_lock_tree *tree); +void range_lock_init(struct range_lock *lock, __u64 start, __u64 end); +int range_lock(struct range_lock_tree *tree, struct range_lock *lock); +void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); +#endif diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c index 87393c4bd51e..50c0152ba022 100644 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -50,10 +50,8 @@ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "../include/obd_cksum.h" #include "llite_internal.h" -#include "../include/linux/lustre_compat25.h" static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); @@ -413,7 +411,7 @@ static int ll_read_ahead_pages(const struct lu_env *env, * forward read-ahead, it will be fixed when backward * read-ahead is implemented */ - LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", + LASSERTF(page_idx >= ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx, ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length, ria->ria_pages); @@ -474,10 +472,22 @@ int ll_readahead(const struct lu_env *env, struct cl_io *io, } /* Reserve a part of the read-ahead window that we'll be issuing */ - if (ras->ras_window_len) { - start = ras->ras_next_readahead; + if (ras->ras_window_len > 0) { + /* + * Note: other thread might rollback the ras_next_readahead, + * if it can not get the full size of prepared pages, see the + * end of this function. For stride read ahead, it needs to + * make sure the offset is no less than ras_stride_offset, + * so that stride read ahead can work correctly. + */ + if (stride_io_mode(ras)) + start = max(ras->ras_next_readahead, + ras->ras_stride_offset); + else + start = ras->ras_next_readahead; end = ras->ras_window_start + ras->ras_window_len - 1; } + if (end != 0) { unsigned long rpc_boundary; /* @@ -648,10 +658,11 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - if (!stride_io_mode(ras) && (stride_gap != 0 || - ras->ras_consecutive_stride_requests == 0)) { + if ((stride_gap != 0 || ras->ras_consecutive_stride_requests == 0) && + !stride_io_mode(ras)) { ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = stride_gap+ras->ras_consecutive_pages; + ras->ras_stride_length = ras->ras_consecutive_pages + + stride_gap; } LASSERT(ras->ras_request_index == 0); LASSERT(ras->ras_consecutive_stride_requests == 0); @@ -663,10 +674,9 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, } ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = stride_gap+ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages; RAS_CDEBUG(ras); - return; } /* Stride Read-ahead window will be increased inc_len according to @@ -882,7 +892,6 @@ out_unlock: RAS_CDEBUG(ras); ras->ras_request_index++; spin_unlock(&ras->ras_lock); - return; } int ll_writepage(struct page *vmpage, struct writeback_control *wbc) @@ -1015,11 +1024,15 @@ int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) * is called later on. */ ignore_layout = 1; + + if (!ll_i2info(inode)->lli_clob) + return 0; + result = cl_sync_file_range(inode, start, end, mode, ignore_layout); if (result > 0) { wbc->nr_to_write -= result; result = 0; - } + } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { if (end == OBD_OBJECT_EOF) diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index d98c7acc0832..26f3a37873a7 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -51,9 +51,7 @@ #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "llite_internal.h" -#include "../include/linux/lustre_compat25.h" /** * Implements Linux VM address_space::invalidatepage() method. This method is @@ -161,7 +159,7 @@ static int ll_releasepage(struct page *vmpage, gfp_t gfp_mask) return result; } -#define MAX_DIRECTIO_SIZE (2*1024*1024*1024UL) +#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL) static inline int ll_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages, @@ -214,10 +212,10 @@ ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, int i; ssize_t rc = 0; loff_t file_offset = pv->ldp_start_offset; - long size = pv->ldp_size; + size_t size = pv->ldp_size; int page_count = pv->ldp_nr; struct page **pages = pv->ldp_pages; - long page_size = cl_page_size(obj); + size_t page_size = cl_page_size(obj); bool do_io; int io_pages = 0; @@ -346,7 +344,6 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) struct cl_io *io; struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct vvp_object *obj = cl_inode2vvp(inode); loff_t file_offset = iocb->ki_pos; ssize_t count = iov_iter_count(iter); ssize_t tot_bytes = 0, result = 0; @@ -375,14 +372,6 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) io = vvp_env_io(env)->vui_cl.cis_io; LASSERT(io); - /* 0. Need locking between buffered and direct access. and race with - * size changing by concurrent truncates and writes. - * 1. Need inode mutex to operate transient pages. - */ - if (iov_iter_rw(iter) == READ) - inode_lock(inode); - - LASSERT(obj->vob_transient_pages == 0); while (iov_iter_count(iter)) { struct page **pages; size_t offs; @@ -430,10 +419,6 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) file_offset += result; } out: - LASSERT(obj->vob_transient_pages == 0); - if (iov_iter_rw(iter) == READ) - inode_unlock(inode); - if (tot_bytes > 0) { struct vvp_io *vio = vvp_env_io(env); @@ -616,6 +601,13 @@ static int ll_write_end(struct file *file, struct address_space *mapping, LASSERT(from == 0); vio->u.write.vui_to = from + copied; + /* + * To address the deadlock in balance_dirty_pages() where + * this dirty page may be written back in the same thread. + */ + if (PageDirty(vmpage)) + unplug = true; + /* We may have one full RPC, commit it soon */ if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) unplug = true; diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c index c1cb6b19e724..0677513476ec 100644 --- a/drivers/staging/lustre/lustre/llite/statahead.c +++ b/drivers/staging/lustre/lustre/llite/statahead.c @@ -39,7 +39,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "llite_internal.h" @@ -50,24 +49,26 @@ enum se_stat { SA_ENTRY_INIT = 0, /** init entry */ SA_ENTRY_SUCC = 1, /** stat succeed */ SA_ENTRY_INVA = 2, /** invalid entry */ - SA_ENTRY_DEST = 3, /** entry to be destroyed */ }; -struct ll_sa_entry { - /* link into sai->sai_entries */ - struct list_head se_link; - /* link into sai->sai_entries_{received,stated} */ +/* + * sa_entry is not refcounted: statahead thread allocates it and do async stat, + * and in async stat callback ll_statahead_interpret() will add it into + * sai_interim_entries, later statahead thread will call sa_handle_callback() to + * instantiate entry and move it into sai_entries, and then only scanner process + * can access and free it. + */ +struct sa_entry { + /* link into sai_interim_entries or sai_entries */ struct list_head se_list; /* link into sai hash table locally */ struct list_head se_hash; - /* entry reference count */ - atomic_t se_refcount; /* entry index in the sai */ __u64 se_index; /* low layer ldlm lock handle */ __u64 se_handle; /* entry status */ - enum se_stat se_stat; + enum se_stat se_state; /* entry size, contains name */ int se_size; /* pointer to async getattr enqueue info */ @@ -83,27 +84,24 @@ struct ll_sa_entry { static unsigned int sai_generation; static DEFINE_SPINLOCK(sai_generation_lock); -/* - * The entry only can be released by the caller, it is necessary to hold lock. - */ -static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) +/* sa_entry is ready to use */ +static inline int sa_ready(struct sa_entry *entry) { smp_rmb(); - return (entry->se_stat != SA_ENTRY_INIT); + return (entry->se_state != SA_ENTRY_INIT); } -static inline int ll_sa_entry_hash(int val) +/* hash value to put in sai_cache */ +static inline int sa_hash(int val) { return val & LL_SA_CACHE_MASK; } -/* - * Insert entry to hash SA table. - */ +/* hash entry into sai_cache */ static inline void -ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) { - int i = ll_sa_entry_hash(entry->se_qstr.hash); + int i = sa_hash(entry->se_qstr.hash); spin_lock(&sai->sai_cache_lock[i]); list_add_tail(&entry->se_hash, &sai->sai_cache[i]); @@ -114,9 +112,9 @@ ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) * Remove entry from SA table. */ static inline void -ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) { - int i = ll_sa_entry_hash(entry->se_qstr.hash); + int i = sa_hash(entry->se_qstr.hash); spin_lock(&sai->sai_cache_lock[i]); list_del_init(&entry->se_hash); @@ -129,19 +127,21 @@ static inline int agl_should_run(struct ll_statahead_info *sai, return (inode && S_ISREG(inode->i_mode) && sai->sai_agl_valid); } +/* statahead window is full */ static inline int sa_sent_full(struct ll_statahead_info *sai) { return atomic_read(&sai->sai_cache_count) >= sai->sai_max; } -static inline int sa_received_empty(struct ll_statahead_info *sai) +/* got async stat replies */ +static inline int sa_has_callback(struct ll_statahead_info *sai) { - return list_empty(&sai->sai_entries_received); + return !list_empty(&sai->sai_interim_entries); } static inline int agl_list_empty(struct ll_statahead_info *sai) { - return list_empty(&sai->sai_entries_agl); + return list_empty(&sai->sai_agls); } /** @@ -157,7 +157,7 @@ static inline int sa_low_hit(struct ll_statahead_info *sai) } /* - * If the given index is behind of statahead window more than + * if the given index is behind of statahead window more than * SA_OMITTED_ENTRY_MAX, then it is old. */ static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) @@ -166,20 +166,17 @@ static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) sai->sai_index); } -/* - * Insert it into sai_entries tail when init. - */ -static struct ll_sa_entry * -ll_sa_entry_alloc(struct dentry *parent, - struct ll_statahead_info *sai, __u64 index, - const char *name, int len) +/* allocate sa_entry and hash it to allow scanner process to find it */ +static struct sa_entry * +sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, + const char *name, int len) { struct ll_inode_info *lli; - struct ll_sa_entry *entry; + struct sa_entry *entry; int entry_size; char *dname; - entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; + entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; entry = kzalloc(entry_size, GFP_NOFS); if (unlikely(!entry)) return ERR_PTR(-ENOMEM); @@ -188,34 +185,9 @@ ll_sa_entry_alloc(struct dentry *parent, len, name, entry, index); entry->se_index = index; - - /* - * Statahead entry reference rules: - * - * 1) When statahead entry is initialized, its reference is set as 2. - * One reference is used by the directory scanner. When the scanner - * searches the statahead cache for the given name, it can perform - * lockless hash lookup (only the scanner can remove entry from hash - * list), and once found, it needn't to call "atomic_inc()" for the - * entry reference. So the performance is improved. After using the - * statahead entry, the scanner will call "atomic_dec()" to drop the - * reference held when initialization. If it is the last reference, - * the statahead entry will be freed. - * - * 2) All other threads, including statahead thread and ptlrpcd thread, - * when they process the statahead entry, the reference for target - * should be held to guarantee the entry will not be released by the - * directory scanner. After processing the entry, these threads will - * drop the entry reference. If it is the last reference, the entry - * will be freed. - * - * The second reference when initializes the statahead entry is used - * by the statahead thread, following the rule 2). - */ - atomic_set(&entry->se_refcount, 2); - entry->se_stat = SA_ENTRY_INIT; + entry->se_state = SA_ENTRY_INIT; entry->se_size = entry_size; - dname = (char *)entry + sizeof(struct ll_sa_entry); + dname = (char *)entry + sizeof(struct sa_entry); memcpy(dname, name, len); dname[len] = 0; @@ -223,11 +195,10 @@ ll_sa_entry_alloc(struct dentry *parent, entry->se_qstr.len = len; entry->se_qstr.name = dname; - lli = ll_i2info(sai->sai_inode); + lli = ll_i2info(sai->sai_dentry->d_inode); spin_lock(&lli->lli_sa_lock); - list_add_tail(&entry->se_link, &sai->sai_entries); INIT_LIST_HEAD(&entry->se_list); - ll_sa_entry_enhash(sai, entry); + sa_rehash(sai, entry); spin_unlock(&lli->lli_sa_lock); atomic_inc(&sai->sai_cache_count); @@ -235,18 +206,29 @@ ll_sa_entry_alloc(struct dentry *parent, return entry; } +/* free sa_entry, which should have been unhashed and not in any list */ +static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) +{ + CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_list)); + LASSERT(list_empty(&entry->se_hash)); + + kfree(entry); + atomic_dec(&sai->sai_cache_count); +} + /* - * Used by the directory scanner to search entry with name. - * - * Only the caller can remove the entry from hash, so it is unnecessary to hold - * hash lock. It is caller's duty to release the init refcount on the entry, so - * it is also unnecessary to increase refcount on the entry. + * find sa_entry by name, used by directory scanner, lock is not needed because + * only scanner can remove the entry from cache. */ -static struct ll_sa_entry * -ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) +static struct sa_entry * +sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) { - struct ll_sa_entry *entry; - int i = ll_sa_entry_hash(qstr->hash); + struct sa_entry *entry; + int i = sa_hash(qstr->hash); list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { if (entry->se_qstr.hash == qstr->hash && @@ -257,164 +239,126 @@ ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) return NULL; } -/* - * Used by the async getattr request callback to find entry with index. - * - * Inside lli_sa_lock to prevent others to change the list during the search. - * It needs to increase entry refcount before returning to guarantee that the - * entry cannot be freed by others. - */ -static struct ll_sa_entry * -ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) -{ - struct ll_sa_entry *entry; - - list_for_each_entry(entry, &sai->sai_entries, se_link) { - if (entry->se_index == index) { - LASSERT(atomic_read(&entry->se_refcount) > 0); - atomic_inc(&entry->se_refcount); - return entry; - } - if (entry->se_index > index) - break; - } - return NULL; -} - -static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, - struct ll_sa_entry *entry) -{ - struct md_enqueue_info *minfo = entry->se_minfo; - struct ptlrpc_request *req = entry->se_req; - - if (minfo) { - entry->se_minfo = NULL; - ll_intent_release(&minfo->mi_it); - iput(minfo->mi_dir); - kfree(minfo); - } - - if (req) { - entry->se_req = NULL; - ptlrpc_req_finished(req); - } -} - -static void ll_sa_entry_put(struct ll_statahead_info *sai, - struct ll_sa_entry *entry) -{ - if (atomic_dec_and_test(&entry->se_refcount)) { - CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", - entry->se_qstr.len, entry->se_qstr.name, entry, - entry->se_index); - - LASSERT(list_empty(&entry->se_link)); - LASSERT(list_empty(&entry->se_list)); - LASSERT(list_empty(&entry->se_hash)); - - ll_sa_entry_cleanup(sai, entry); - iput(entry->se_inode); - - kfree(entry); - atomic_dec(&sai->sai_cache_count); - } -} - +/* unhash and unlink sa_entry, and then free it */ static inline void -do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) { - struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); LASSERT(!list_empty(&entry->se_hash)); - LASSERT(!list_empty(&entry->se_link)); + LASSERT(!list_empty(&entry->se_list)); + LASSERT(sa_ready(entry)); - ll_sa_entry_unhash(sai, entry); + sa_unhash(sai, entry); spin_lock(&lli->lli_sa_lock); - entry->se_stat = SA_ENTRY_DEST; - list_del_init(&entry->se_link); - if (likely(!list_empty(&entry->se_list))) - list_del_init(&entry->se_list); + list_del_init(&entry->se_list); spin_unlock(&lli->lli_sa_lock); - ll_sa_entry_put(sai, entry); + if (entry->se_inode) + iput(entry->se_inode); + + sa_free(sai, entry); } -/* - * Delete it from sai_entries_stated list when fini. - */ +/* called by scanner after use, sa_entry will be killed */ static void -ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +sa_put(struct ll_statahead_info *sai, struct sa_entry *entry) { - struct ll_sa_entry *pos, *next; + struct sa_entry *tmp, *next; + + if (entry && entry->se_state == SA_ENTRY_SUCC) { + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + sai->sai_miss++; + sai->sai_consecutive_miss++; + } if (entry) - do_sa_entry_fini(sai, entry); + sa_kill(sai, entry); - /* drop old entry, only 'scanner' process does this, no need to lock */ - list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { - if (!is_omitted_entry(sai, pos->se_index)) + /* + * kill old completed entries, only scanner process does this, no need + * to lock + */ + list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { + if (!is_omitted_entry(sai, tmp->se_index)) break; - do_sa_entry_fini(sai, pos); + sa_kill(sai, tmp); } + + wake_up(&sai->sai_thread.t_ctl_waitq); } /* - * Inside lli_sa_lock. + * update state and sort add entry to sai_entries by index, return true if + * scanner is waiting on this entry. */ -static void -do_sa_entry_to_stated(struct ll_statahead_info *sai, - struct ll_sa_entry *entry, enum se_stat stat) +static bool +__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) { - struct ll_sa_entry *se; - struct list_head *pos = &sai->sai_entries_stated; + struct list_head *pos = &sai->sai_entries; + __u64 index = entry->se_index; + struct sa_entry *se; - if (!list_empty(&entry->se_list)) - list_del_init(&entry->se_list); + LASSERT(!sa_ready(entry)); + LASSERT(list_empty(&entry->se_list)); - list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { + list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { if (se->se_index < entry->se_index) { pos = &se->se_list; break; } } - list_add(&entry->se_list, pos); - entry->se_stat = stat; + entry->se_state = ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC; + + return (index == sai->sai_index_wait); } /* - * Move entry to sai_entries_stated and sort with the index. - * \retval 1 -- entry to be destroyed. - * \retval 0 -- entry is inserted into stated list. + * release resources used in async stat RPC, update entry state and wakeup if + * scanner process it waiting on this entry. */ -static int -ll_sa_entry_to_stated(struct ll_statahead_info *sai, - struct ll_sa_entry *entry, enum se_stat stat) +static void +sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) { - struct ll_inode_info *lli = ll_i2info(sai->sai_inode); - int ret = 1; + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + bool wakeup; + + /* release resources used in RPC */ + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + iput(minfo->mi_dir); + kfree(minfo); + } - ll_sa_entry_cleanup(sai, entry); + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } spin_lock(&lli->lli_sa_lock); - if (likely(entry->se_stat != SA_ENTRY_DEST)) { - do_sa_entry_to_stated(sai, entry, stat); - ret = 0; - } + wakeup = __sa_make_ready(sai, entry, ret); spin_unlock(&lli->lli_sa_lock); - return ret; + if (wakeup) + wake_up(&sai->sai_waitq); } -/* - * Insert inode into the list of sai_entries_agl. - */ +/* Insert inode into the list of sai_agls. */ static void ll_agl_add(struct ll_statahead_info *sai, struct inode *inode, int index) { struct ll_inode_info *child = ll_i2info(inode); - struct ll_inode_info *parent = ll_i2info(sai->sai_inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); int added = 0; spin_lock(&child->lli_agl_lock); @@ -426,9 +370,9 @@ static void ll_agl_add(struct ll_statahead_info *sai, igrab(inode); spin_lock(&parent->lli_agl_lock); - if (list_empty(&sai->sai_entries_agl)) + if (list_empty(&sai->sai_agls)) added = 1; - list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); + list_add_tail(&child->lli_agl_list, &sai->sai_agls); spin_unlock(&parent->lli_agl_lock); } else { spin_unlock(&child->lli_agl_lock); @@ -438,8 +382,10 @@ static void ll_agl_add(struct ll_statahead_info *sai, wake_up(&sai->sai_agl_thread.t_ctl_waitq); } -static struct ll_statahead_info *ll_sai_alloc(void) +/* allocate sai */ +static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) { + struct ll_inode_info *lli = ll_i2info(dentry->d_inode); struct ll_statahead_info *sai; int i; @@ -447,24 +393,18 @@ static struct ll_statahead_info *ll_sai_alloc(void) if (!sai) return NULL; + sai->sai_dentry = dget(dentry); atomic_set(&sai->sai_refcount, 1); - spin_lock(&sai_generation_lock); - sai->sai_generation = ++sai_generation; - if (unlikely(sai_generation == 0)) - sai->sai_generation = ++sai_generation; - spin_unlock(&sai_generation_lock); - sai->sai_max = LL_SA_RPC_MIN; sai->sai_index = 1; init_waitqueue_head(&sai->sai_waitq); init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + INIT_LIST_HEAD(&sai->sai_interim_entries); INIT_LIST_HEAD(&sai->sai_entries); - INIT_LIST_HEAD(&sai->sai_entries_received); - INIT_LIST_HEAD(&sai->sai_entries_stated); - INIT_LIST_HEAD(&sai->sai_entries_agl); + INIT_LIST_HEAD(&sai->sai_agls); for (i = 0; i < LL_SA_CACHE_SIZE; i++) { INIT_LIST_HEAD(&sai->sai_cache[i]); @@ -472,63 +412,74 @@ static struct ll_statahead_info *ll_sai_alloc(void) } atomic_set(&sai->sai_cache_count, 0); + spin_lock(&sai_generation_lock); + lli->lli_sa_generation = ++sai_generation; + if (unlikely(!sai_generation)) + lli->lli_sa_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + return sai; } -static inline struct ll_statahead_info * -ll_sai_get(struct ll_statahead_info *sai) +/* free sai */ +static inline void ll_sai_free(struct ll_statahead_info *sai) { - atomic_inc(&sai->sai_refcount); + LASSERT(sai->sai_dentry); + dput(sai->sai_dentry); + kfree(sai); +} + +/* + * take refcount of sai if sai for @dir exists, which means statahead is on for + * this directory. + */ +static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + + spin_lock(&lli->lli_sa_lock); + sai = lli->lli_sai; + if (sai) + atomic_inc(&sai->sai_refcount); + spin_unlock(&lli->lli_sa_lock); + return sai; } +/* + * put sai refcount after use, if refcount reaches zero, free sai and sa_entries + * attached to it. + */ static void ll_sai_put(struct ll_statahead_info *sai) { - struct inode *inode = sai->sai_inode; - struct ll_inode_info *lli = ll_i2info(inode); + struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { - struct ll_sa_entry *entry, *next; - - if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { - /* It is race case, the interpret callback just hold - * a reference count - */ - spin_unlock(&lli->lli_sa_lock); - return; - } - - LASSERT(!lli->lli_opendir_key); - LASSERT(thread_is_stopped(&sai->sai_thread)); - LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); + struct sa_entry *entry, *next; lli->lli_sai = NULL; - lli->lli_opendir_pid = 0; spin_unlock(&lli->lli_sa_lock); - if (sai->sai_sent > sai->sai_replied) - CDEBUG(D_READA, "statahead for dir "DFID - " does not finish: [sent:%llu] [replied:%llu]\n", - PFID(&lli->lli_fid), - sai->sai_sent, sai->sai_replied); + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + LASSERT(sai->sai_sent == sai->sai_replied); + LASSERT(!sa_has_callback(sai)); list_for_each_entry_safe(entry, next, &sai->sai_entries, - se_link) - do_sa_entry_fini(sai, entry); - - LASSERT(list_empty(&sai->sai_entries)); - LASSERT(list_empty(&sai->sai_entries_received)); - LASSERT(list_empty(&sai->sai_entries_stated)); + se_list) + sa_kill(sai, entry); LASSERT(atomic_read(&sai->sai_cache_count) == 0); - LASSERT(list_empty(&sai->sai_entries_agl)); + LASSERT(list_empty(&sai->sai_agls)); - iput(inode); - kfree(sai); + ll_sai_free(sai); + atomic_dec(&sbi->ll_sa_running); } } -/* Do NOT forget to drop inode refcount when into sai_entries_agl. */ +/* Do NOT forget to drop inode refcount when into sai_agls. */ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) { struct ll_inode_info *lli = ll_i2info(inode); @@ -588,29 +539,21 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) iput(inode); } -static void ll_post_statahead(struct ll_statahead_info *sai) +/* + * prepare inode for sa entry, add it into agl list, now sa_entry is ready + * to be used by scanner process. + */ +static void sa_instantiate(struct ll_statahead_info *sai, + struct sa_entry *entry) { - struct inode *dir = sai->sai_inode; + struct inode *dir = sai->sai_dentry->d_inode; struct inode *child; - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_sa_entry *entry; struct md_enqueue_info *minfo; struct lookup_intent *it; struct ptlrpc_request *req; struct mdt_body *body; int rc = 0; - spin_lock(&lli->lli_sa_lock); - if (unlikely(list_empty(&sai->sai_entries_received))) { - spin_unlock(&lli->lli_sa_lock); - return; - } - entry = list_entry(sai->sai_entries_received.next, - struct ll_sa_entry, se_list); - atomic_inc(&entry->se_refcount); - list_del_init(&entry->se_list); - spin_unlock(&lli->lli_sa_lock); - LASSERT(entry->se_handle != 0); minfo = entry->se_minfo; @@ -632,7 +575,7 @@ static void ll_post_statahead(struct ll_statahead_info *sai) /* XXX: No fid in reply, this is probably cross-ref case. * SA can't handle it yet. */ - if (body->valid & OBD_MD_MDS) { + if (body->mbo_valid & OBD_MD_MDS) { rc = -EAGAIN; goto out; } @@ -641,7 +584,7 @@ static void ll_post_statahead(struct ll_statahead_info *sai) * revalidate. */ /* unlinked and re-created with the same name */ - if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))) { + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { entry->se_inode = NULL; iput(child); child = NULL; @@ -659,8 +602,9 @@ static void ll_post_statahead(struct ll_statahead_info *sai) if (rc) goto out; - CDEBUG(D_DLMTRACE, "%s: setting l_data to inode "DFID"%p\n", + CDEBUG(D_READA, "%s: setting %.*s" DFID " l_data to inode %p\n", ll_get_fsname(child->i_sb, NULL, 0), + entry->se_qstr.len, entry->se_qstr.name, PFID(ll_inode2fid(child)), child); ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); @@ -670,34 +614,75 @@ static void ll_post_statahead(struct ll_statahead_info *sai) ll_agl_add(sai, child, entry->se_index); out: - /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock - * reference count by calling "ll_intent_drop_lock()" in spite of the - * above operations failed or not. Do not worry about calling - * "ll_intent_drop_lock()" more than once. + /* + * sa_make_ready() will drop ldlm ibits lock refcount by calling + * ll_intent_drop_lock() in spite of failures. Do not worry about + * calling ll_intent_drop_lock() more than once. */ - rc = ll_sa_entry_to_stated(sai, entry, - rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); - if (rc == 0 && entry->se_index == sai->sai_index_wait) - wake_up(&sai->sai_waitq); - ll_sa_entry_put(sai, entry); + sa_make_ready(sai, entry, rc); } +/* once there are async stat replies, instantiate sa_entry from replies */ +static void sa_handle_callback(struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(sai->sai_dentry->d_inode); + + while (sa_has_callback(sai)) { + struct sa_entry *entry; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(!sa_has_callback(sai))) { + spin_unlock(&lli->lli_sa_lock); + break; + } + entry = list_entry(sai->sai_interim_entries.next, + struct sa_entry, se_list); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + sa_instantiate(sai, entry); + } +} + +/* + * callback for async stat, because this is called in ptlrpcd context, we only + * put sa_entry in sai_cb_entries list, and let sa_handle_callback() to really + * prepare inode and instantiate sa_entry later. + */ static int ll_statahead_interpret(struct ptlrpc_request *req, struct md_enqueue_info *minfo, int rc) { struct lookup_intent *it = &minfo->mi_it; struct inode *dir = minfo->mi_dir; struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = NULL; - struct ll_sa_entry *entry; - __u64 handle = 0; - int wakeup; + struct ll_statahead_info *sai = lli->lli_sai; + struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; + __u64 handle = 0; + bool wakeup; if (it_disposition(it, DISP_LOOKUP_NEG)) rc = -ENOENT; - if (rc == 0) { - /* release ibits lock ASAP to avoid deadlock when statahead + /* + * because statahead thread will wait for all inflight RPC to finish, + * sai should be always valid, no need to refcount + */ + LASSERT(sai); + LASSERT(!thread_is_stopped(&sai->sai_thread)); + LASSERT(entry); + + CDEBUG(D_READA, "sa_entry %.*s rc %d\n", + entry->se_qstr.len, entry->se_qstr.name, rc); + + if (rc) { + ll_intent_release(it); + iput(dir); + kfree(minfo); + } else { + /* + * release ibits lock ASAP to avoid deadlock when statahead * thread enqueues lock on parent in readdir and another * process enqueues lock on child with parent lock held, eg. * unlink. @@ -707,65 +692,32 @@ static int ll_statahead_interpret(struct ptlrpc_request *req, } spin_lock(&lli->lli_sa_lock); - /* stale entry */ - if (unlikely(!lli->lli_sai || - lli->lli_sai->sai_generation != minfo->mi_generation)) { - spin_unlock(&lli->lli_sa_lock); - rc = -ESTALE; - goto out; + if (rc) { + wakeup = __sa_make_ready(sai, entry, rc); } else { - sai = ll_sai_get(lli->lli_sai); - if (unlikely(!thread_is_running(&sai->sai_thread))) { - sai->sai_replied++; - spin_unlock(&lli->lli_sa_lock); - rc = -EBADFD; - goto out; - } - - entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); - if (!entry) { - sai->sai_replied++; - spin_unlock(&lli->lli_sa_lock); - rc = -EIDRM; - goto out; - } - - if (rc != 0) { - do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); - wakeup = (entry->se_index == sai->sai_index_wait); - } else { - entry->se_minfo = minfo; - entry->se_req = ptlrpc_request_addref(req); - /* Release the async ibits lock ASAP to avoid deadlock - * when statahead thread tries to enqueue lock on parent - * for readpage and other tries to enqueue lock on child - * with parent's lock held, for example: unlink. - */ - entry->se_handle = handle; - wakeup = list_empty(&sai->sai_entries_received); - list_add_tail(&entry->se_list, - &sai->sai_entries_received); - } - sai->sai_replied++; - spin_unlock(&lli->lli_sa_lock); - - ll_sa_entry_put(sai, entry); - if (wakeup) - wake_up(&sai->sai_thread.t_ctl_waitq); + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* + * Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. + */ + entry->se_handle = handle; + wakeup = !sa_has_callback(sai); + list_add_tail(&entry->se_list, &sai->sai_interim_entries); } + sai->sai_replied++; + + if (wakeup) + wake_up(&sai->sai_thread.t_ctl_waitq); + spin_unlock(&lli->lli_sa_lock); -out: - if (rc != 0) { - ll_intent_release(it); - iput(dir); - kfree(minfo); - } - if (sai) - ll_sai_put(sai); return rc; } -static void sa_args_fini(struct md_enqueue_info *minfo, +/* finish async stat RPC arguments */ +static void sa_fini_data(struct md_enqueue_info *minfo, struct ldlm_enqueue_info *einfo) { LASSERT(minfo && einfo); @@ -777,12 +729,11 @@ static void sa_args_fini(struct md_enqueue_info *minfo, /** * prepare arguments for async stat RPC. */ -static int sa_args_init(struct inode *dir, struct inode *child, - struct ll_sa_entry *entry, struct md_enqueue_info **pmi, +static int sa_prep_data(struct inode *dir, struct inode *child, + struct sa_entry *entry, struct md_enqueue_info **pmi, struct ldlm_enqueue_info **pei) { const struct qstr *qstr = &entry->se_qstr; - struct ll_inode_info *lli = ll_i2info(dir); struct md_enqueue_info *minfo; struct ldlm_enqueue_info *einfo; struct md_op_data *op_data; @@ -808,8 +759,7 @@ static int sa_args_init(struct inode *dir, struct inode *child, minfo->mi_it.it_op = IT_GETATTR; minfo->mi_dir = igrab(dir); minfo->mi_cb = ll_statahead_interpret; - minfo->mi_generation = lli->lli_sai->sai_generation; - minfo->mi_cbdata = entry->se_index; + minfo->mi_cbdata = entry; einfo->ei_type = LDLM_IBITS; einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); @@ -824,31 +774,33 @@ static int sa_args_init(struct inode *dir, struct inode *child, return 0; } -static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) +/* async stat for file not found in dcache */ +static int sa_lookup(struct inode *dir, struct sa_entry *entry) { struct md_enqueue_info *minfo; struct ldlm_enqueue_info *einfo; int rc; - rc = sa_args_init(dir, NULL, entry, &minfo, &einfo); + rc = sa_prep_data(dir, NULL, entry, &minfo, &einfo); if (rc) return rc; rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); - if (rc < 0) - sa_args_fini(minfo, einfo); + if (rc) + sa_fini_data(minfo, einfo); return rc; } /** - * similar to ll_revalidate_it(). - * \retval 1 -- dentry valid - * \retval 0 -- will send stat-ahead request - * \retval others -- prepare stat-ahead request failed + * async stat for file found in dcache, similar to .revalidate + * + * \retval 1 dentry valid, no RPC sent + * \retval 0 dentry invalid, will send async stat RPC + * \retval negative number upon error */ -static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, - struct dentry *dentry) +static int sa_revalidate(struct inode *dir, struct sa_entry *entry, + struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct lookup_intent it = { .it_op = IT_GETATTR, @@ -872,7 +824,7 @@ static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, return 1; } - rc = sa_args_init(dir, inode, entry, &minfo, &einfo); + rc = sa_prep_data(dir, inode, entry, &minfo, &einfo); if (rc) { entry->se_inode = NULL; iput(inode); @@ -880,56 +832,50 @@ static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, } rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); - if (rc < 0) { + if (rc) { entry->se_inode = NULL; iput(inode); - sa_args_fini(minfo, einfo); + sa_fini_data(minfo, einfo); } return rc; } -static void ll_statahead_one(struct dentry *parent, const char *entry_name, - int entry_name_len) +/* async stat for file with @name */ +static void sa_statahead(struct dentry *parent, const char *name, int len) { struct inode *dir = d_inode(parent); struct ll_inode_info *lli = ll_i2info(dir); struct ll_statahead_info *sai = lli->lli_sai; struct dentry *dentry = NULL; - struct ll_sa_entry *entry; + struct sa_entry *entry; int rc; - int rc1; - entry = ll_sa_entry_alloc(parent, sai, sai->sai_index, entry_name, - entry_name_len); + entry = sa_alloc(parent, sai, sai->sai_index, name, len); if (IS_ERR(entry)) return; dentry = d_lookup(parent, &entry->se_qstr); if (!dentry) { - rc = do_sa_lookup(dir, entry); + rc = sa_lookup(dir, entry); } else { - rc = do_sa_revalidate(dir, entry, dentry); + rc = sa_revalidate(dir, entry, dentry); if (rc == 1 && agl_should_run(sai, d_inode(dentry))) ll_agl_add(sai, d_inode(dentry), entry->se_index); + } + if (dentry) dput(dentry); - } - if (rc) { - rc1 = ll_sa_entry_to_stated(sai, entry, - rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); - if (rc1 == 0 && entry->se_index == sai->sai_index_wait) - wake_up(&sai->sai_waitq); - } else { + if (rc) + sa_make_ready(sai, entry, rc); + else sai->sai_sent++; - } sai->sai_index++; - /* drop one refcount on entry by ll_sa_entry_alloc */ - ll_sa_entry_put(sai, entry); } +/* async glimpse (agl) thread main function */ static int ll_agl_thread(void *arg) { struct dentry *parent = arg; @@ -937,10 +883,12 @@ static int ll_agl_thread(void *arg) struct ll_inode_info *plli = ll_i2info(dir); struct ll_inode_info *clli; struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); - struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct ll_statahead_info *sai; + struct ptlrpc_thread *thread; struct l_wait_info lwi = { 0 }; + sai = ll_sai_get(dir); + thread = &sai->sai_agl_thread; thread->t_pid = current_pid(); CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", sai, parent); @@ -959,7 +907,7 @@ static int ll_agl_thread(void *arg) while (1) { l_wait_event(thread->t_ctl_waitq, - !list_empty(&sai->sai_entries_agl) || + !list_empty(&sai->sai_agls) || !thread_is_running(thread), &lwi); @@ -970,8 +918,8 @@ static int ll_agl_thread(void *arg) /* The statahead thread maybe help to process AGL entries, * so check whether list empty again. */ - if (!list_empty(&sai->sai_entries_agl)) { - clli = list_entry(sai->sai_entries_agl.next, + if (!list_empty(&sai->sai_agls)) { + clli = list_entry(sai->sai_agls.next, struct ll_inode_info, lli_agl_list); list_del_init(&clli->lli_agl_list); spin_unlock(&plli->lli_agl_lock); @@ -983,8 +931,8 @@ static int ll_agl_thread(void *arg) spin_lock(&plli->lli_agl_lock); sai->sai_agl_valid = 0; - while (!list_empty(&sai->sai_entries_agl)) { - clli = list_entry(sai->sai_entries_agl.next, + while (!list_empty(&sai->sai_agls)) { + clli = list_entry(sai->sai_agls.next, struct ll_inode_info, lli_agl_list); list_del_init(&clli->lli_agl_list); spin_unlock(&plli->lli_agl_lock); @@ -1001,6 +949,7 @@ static int ll_agl_thread(void *arg) return 0; } +/* start agl thread */ static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) { struct ptlrpc_thread *thread = &sai->sai_agl_thread; @@ -1025,58 +974,71 @@ static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) &lwi); } +/* statahead thread main function */ static int ll_statahead_thread(void *arg) { struct dentry *parent = arg; struct inode *dir = d_inode(parent); - struct ll_inode_info *plli = ll_i2info(dir); - struct ll_inode_info *clli; + struct ll_inode_info *lli = ll_i2info(dir); struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); - struct ptlrpc_thread *thread = &sai->sai_thread; - struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; - struct page *page; + struct ll_statahead_info *sai; + struct ptlrpc_thread *sa_thread; + struct ptlrpc_thread *agl_thread; + struct page *page = NULL; __u64 pos = 0; int first = 0; int rc = 0; - struct ll_dir_chain chain; + struct md_op_data *op_data; struct l_wait_info lwi = { 0 }; - thread->t_pid = current_pid(); + sai = ll_sai_get(dir); + sa_thread = &sai->sai_thread; + agl_thread = &sai->sai_agl_thread; + sa_thread->t_pid = current_pid(); CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", sai, parent); + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) { + rc = PTR_ERR(op_data); + goto out; + } + + op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) ll_start_agl(parent, sai); atomic_inc(&sbi->ll_sa_total); - spin_lock(&plli->lli_sa_lock); - if (thread_is_init(thread)) + spin_lock(&lli->lli_sa_lock); + if (thread_is_init(sa_thread)) /* If someone else has changed the thread state * (e.g. already changed to SVC_STOPPING), we can't just * blindly overwrite that setting. */ - thread_set_flags(thread, SVC_RUNNING); - spin_unlock(&plli->lli_sa_lock); - wake_up(&thread->t_ctl_waitq); - - ll_dir_chain_init(&chain); - page = ll_get_dir_page(dir, pos, &chain); + thread_set_flags(sa_thread, SVC_RUNNING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&sa_thread->t_ctl_waitq); - while (1) { + while (pos != MDS_DIR_END_OFF && thread_is_running(sa_thread)) { struct lu_dirpage *dp; struct lu_dirent *ent; + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, op_data, pos); + sai->sai_in_readpage = 0; if (IS_ERR(page)) { rc = PTR_ERR(page); - CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n", + CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: opendir_pid = %u: rc = %d\n", PFID(ll_inode2fid(dir)), pos, sai->sai_index, - rc, plli->lli_opendir_pid); - goto out; + lli->lli_opendir_pid, rc); + break; } dp = page_address(page); - for (ent = lu_dirent_start(dp); ent; + for (ent = lu_dirent_start(dp); + ent && thread_is_running(sa_thread) && !sa_low_hit(sai); ent = lu_dirent_next(ent)) { __u64 hash; int namelen; @@ -1123,123 +1085,79 @@ static int ll_statahead_thread(void *arg) if (unlikely(++first == 1)) continue; -keep_it: - l_wait_event(thread->t_ctl_waitq, - !sa_sent_full(sai) || - !list_empty(&sai->sai_entries_received) || - !list_empty(&sai->sai_entries_agl) || - !thread_is_running(thread), - &lwi); - -interpret_it: - while (!list_empty(&sai->sai_entries_received)) - ll_post_statahead(sai); - - if (unlikely(!thread_is_running(thread))) { - ll_release_page(page, 0); - rc = 0; - goto out; - } + /* wait for spare statahead window */ + do { + l_wait_event(sa_thread->t_ctl_waitq, + !sa_sent_full(sai) || + sa_has_callback(sai) || + !list_empty(&sai->sai_agls) || + !thread_is_running(sa_thread), + &lwi); + sa_handle_callback(sai); - /* If no window for metadata statahead, but there are - * some AGL entries to be triggered, then try to help - * to process the AGL entries. - */ - if (sa_sent_full(sai)) { - spin_lock(&plli->lli_agl_lock); - while (!list_empty(&sai->sai_entries_agl)) { - clli = list_entry(sai->sai_entries_agl.next, + spin_lock(&lli->lli_agl_lock); + while (sa_sent_full(sai) && + !agl_list_empty(sai)) { + struct ll_inode_info *clli; + + clli = list_entry(sai->sai_agls.next, struct ll_inode_info, lli_agl_list); list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); + spin_unlock(&lli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); - if (!list_empty(&sai->sai_entries_received)) - goto interpret_it; - - if (unlikely( - !thread_is_running(thread))) { - ll_release_page(page, 0); - rc = 0; - goto out; - } - - if (!sa_sent_full(sai)) - goto do_it; - - spin_lock(&plli->lli_agl_lock); + spin_lock(&lli->lli_agl_lock); } - spin_unlock(&plli->lli_agl_lock); + spin_unlock(&lli->lli_agl_lock); + } while (sa_sent_full(sai) && + thread_is_running(sa_thread)); - goto keep_it; - } - -do_it: - ll_statahead_one(parent, name, namelen); + sa_statahead(parent, name, namelen); } - pos = le64_to_cpu(dp->ldp_hash_end); - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - ll_release_page(page, 0); - while (1) { - l_wait_event(thread->t_ctl_waitq, - !list_empty(&sai->sai_entries_received) || - sai->sai_sent == sai->sai_replied || - !thread_is_running(thread), - &lwi); - while (!list_empty(&sai->sai_entries_received)) - ll_post_statahead(sai); + pos = le64_to_cpu(dp->ldp_hash_end); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - if (unlikely(!thread_is_running(thread))) { - rc = 0; - goto out; - } + if (sa_low_hit(sai)) { + rc = -EFAULT; + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir "DFID" hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current_pid()); + break; + } + } + ll_finish_md_op_data(op_data); - if (sai->sai_sent == sai->sai_replied && - list_empty(&sai->sai_entries_received)) - break; - } + if (rc < 0) { + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPING); + lli->lli_sa_enabled = 0; + spin_unlock(&lli->lli_sa_lock); + } - spin_lock(&plli->lli_agl_lock); - while (!list_empty(&sai->sai_entries_agl) && - thread_is_running(thread)) { - clli = list_entry(sai->sai_entries_agl.next, - struct ll_inode_info, lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); - ll_agl_trigger(&clli->lli_vfs_inode, sai); - spin_lock(&plli->lli_agl_lock); - } - spin_unlock(&plli->lli_agl_lock); + /* + * statahead is finished, but statahead entries need to be cached, wait + * for file release to stop me. + */ + while (thread_is_running(sa_thread)) { + l_wait_event(sa_thread->t_ctl_waitq, + sa_has_callback(sai) || + !agl_list_empty(sai) || + !thread_is_running(sa_thread), + &lwi); - rc = 0; - goto out; - } else if (1) { - /* - * chain is exhausted. - * Normal case: continue to the next page. - */ - ll_release_page(page, le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - page = ll_get_dir_page(dir, pos, &chain); - } else { - LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - ll_release_page(page, 1); - /* - * go into overflow page. - */ - } + sa_handle_callback(sai); } - out: if (sai->sai_agl_valid) { - spin_lock(&plli->lli_agl_lock); + spin_lock(&lli->lli_agl_lock); thread_set_flags(agl_thread, SVC_STOPPING); - spin_unlock(&plli->lli_agl_lock); + spin_unlock(&lli->lli_agl_lock); wake_up(&agl_thread->t_ctl_waitq); CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", @@ -1249,84 +1167,93 @@ out: &lwi); } else { /* Set agl_thread flags anyway. */ - thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + thread_set_flags(agl_thread, SVC_STOPPED); } - ll_dir_chain_fini(&chain); - spin_lock(&plli->lli_sa_lock); - if (!list_empty(&sai->sai_entries_received)) { - thread_set_flags(thread, SVC_STOPPING); - spin_unlock(&plli->lli_sa_lock); - - /* To release the resources held by received entries. */ - while (!list_empty(&sai->sai_entries_received)) - ll_post_statahead(sai); - spin_lock(&plli->lli_sa_lock); + /* + * wait for inflight statahead RPCs to finish, and then we can free sai + * safely because statahead RPC will access sai data + */ + while (sai->sai_sent != sai->sai_replied) { + /* in case we're not woken up, timeout wait */ + lwi = LWI_TIMEOUT(msecs_to_jiffies(MSEC_PER_SEC >> 3), + NULL, NULL); + l_wait_event(sa_thread->t_ctl_waitq, + sai->sai_sent == sai->sai_replied, &lwi); } - thread_set_flags(thread, SVC_STOPPED); - spin_unlock(&plli->lli_sa_lock); - wake_up(&sai->sai_waitq); - wake_up(&thread->t_ctl_waitq); - ll_sai_put(sai); - dput(parent); + + /* release resources held by statahead RPCs */ + sa_handle_callback(sai); + + spin_lock(&lli->lli_sa_lock); + thread_set_flags(sa_thread, SVC_STOPPED); + spin_unlock(&lli->lli_sa_lock); + CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", sai, parent); + + wake_up(&sai->sai_waitq); + wake_up(&sa_thread->t_ctl_waitq); + ll_sai_put(sai); + return rc; } -/** - * called in ll_file_release(). - */ -void ll_stop_statahead(struct inode *dir, void *key) +/* authorize opened dir handle @key to statahead */ +void ll_authorize_statahead(struct inode *dir, void *key) { struct ll_inode_info *lli = ll_i2info(dir); - if (unlikely(!key)) - return; - spin_lock(&lli->lli_sa_lock); - if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { - spin_unlock(&lli->lli_sa_lock); - return; + if (!lli->lli_opendir_key && !lli->lli_sai) { + /* + * if lli_sai is not NULL, it means previous statahead is not + * finished yet, we'd better not start a new statahead for now. + */ + LASSERT(!lli->lli_opendir_pid); + lli->lli_opendir_key = key; + lli->lli_opendir_pid = current_pid(); + lli->lli_sa_enabled = 1; } + spin_unlock(&lli->lli_sa_lock); +} - lli->lli_opendir_key = NULL; - - if (lli->lli_sai) { - struct l_wait_info lwi = { 0 }; - struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; +/* + * deauthorize opened dir handle @key to statahead, but statahead thread may + * still be running, notify it to quit. + */ +void ll_deauthorize_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai; - if (!thread_is_stopped(thread)) { - thread_set_flags(thread, SVC_STOPPING); - spin_unlock(&lli->lli_sa_lock); - wake_up(&thread->t_ctl_waitq); + LASSERT(lli->lli_opendir_key == key); + LASSERT(lli->lli_opendir_pid); - CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n", - lli->lli_sai, (unsigned int)thread->t_pid); - l_wait_event(thread->t_ctl_waitq, - thread_is_stopped(thread), - &lwi); - } else { - spin_unlock(&lli->lli_sa_lock); - } + CDEBUG(D_READA, "deauthorize statahead for "DFID"\n", + PFID(&lli->lli_fid)); + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + sai = lli->lli_sai; + if (sai && thread_is_running(&sai->sai_thread)) { /* - * Put the ref which was held when first statahead_enter. - * It maybe not the last ref for some statahead requests - * maybe inflight. + * statahead thread may not quit yet because it needs to cache + * entries, now it's time to tell it to quit. */ - ll_sai_put(lli->lli_sai); - } else { - lli->lli_opendir_pid = 0; - spin_unlock(&lli->lli_sa_lock); + thread_set_flags(&sai->sai_thread, SVC_STOPPING); + wake_up(&sai->sai_thread.t_ctl_waitq); } + spin_unlock(&lli->lli_sa_lock); } enum { /** * not first dirent, or is "." */ - LS_NONE_FIRST_DE = 0, + LS_NOT_FIRST_DE = 0, /** * the first non-hidden dirent */ @@ -1337,17 +1264,26 @@ enum { LS_FIRST_DOT_DE }; +/* file is first dirent under @dir */ static int is_first_dirent(struct inode *dir, struct dentry *dentry) { - struct ll_dir_chain chain; const struct qstr *target = &dentry->d_name; + struct md_op_data *op_data; struct page *page; __u64 pos = 0; int dot_de; - int rc = LS_NONE_FIRST_DE; + int rc = LS_NOT_FIRST_DE; - ll_dir_chain_init(&chain); - page = ll_get_dir_page(dir, pos, &chain); + op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, + LUSTRE_OPC_ANY, dir); + if (IS_ERR(op_data)) + return PTR_ERR(op_data); + /** + * FIXME choose the start offset of the readdir + */ + op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; + + page = ll_get_dir_page(dir, op_data, pos); while (1) { struct lu_dirpage *dp; @@ -1357,9 +1293,10 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) struct ll_inode_info *lli = ll_i2info(dir); rc = PTR_ERR(page); - CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n", + CERROR("%s: error reading dir "DFID" at %llu: opendir_pid = %u : rc = %d\n", + ll_get_fsname(dir->i_sb, NULL, 0), PFID(ll_inode2fid(dir)), pos, - rc, lli->lli_opendir_pid); + lli->lli_opendir_pid, rc); break; } @@ -1411,13 +1348,13 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) if (target->len != namelen || memcmp(target->name, name, namelen) != 0) - rc = LS_NONE_FIRST_DE; + rc = LS_NOT_FIRST_DE; else if (!dot_de) rc = LS_FIRST_DE; else rc = LS_FIRST_DOT_DE; - ll_release_page(page, 0); + ll_release_page(dir, page, false); goto out; } pos = le64_to_cpu(dp->ldp_hash_end); @@ -1425,261 +1362,228 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) /* * End of directory reached. */ - ll_release_page(page, 0); - break; - } else if (1) { + ll_release_page(dir, page, false); + goto out; + } else { /* * chain is exhausted * Normal case: continue to the next page. */ - ll_release_page(page, le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - page = ll_get_dir_page(dir, pos, &chain); - } else { - /* - * go into overflow page. - */ - LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - ll_release_page(page, 1); + ll_release_page(dir, page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, op_data, pos); } } - out: - ll_dir_chain_fini(&chain); + ll_finish_md_op_data(op_data); return rc; } -static void -ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) -{ - struct ptlrpc_thread *thread = &sai->sai_thread; - struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); - int hit; - - if (entry && entry->se_stat == SA_ENTRY_SUCC) - hit = 1; - else - hit = 0; - - ll_sa_entry_fini(sai, entry); - if (hit) { - sai->sai_hit++; - sai->sai_consecutive_miss = 0; - sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); - } else { - struct ll_inode_info *lli = ll_i2info(sai->sai_inode); - - sai->sai_miss++; - sai->sai_consecutive_miss++; - if (sa_low_hit(sai) && thread_is_running(thread)) { - atomic_inc(&sbi->ll_sa_wrong); - CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n", - PFID(&lli->lli_fid), sai->sai_hit, - sai->sai_miss, sai->sai_sent, - sai->sai_replied); - spin_lock(&lli->lli_sa_lock); - if (!thread_is_stopped(thread)) - thread_set_flags(thread, SVC_STOPPING); - spin_unlock(&lli->lli_sa_lock); - } - } - - if (!thread_is_stopped(thread)) - wake_up(&thread->t_ctl_waitq); -} - /** - * Start statahead thread if this is the first dir entry. - * Otherwise if a thread is started already, wait it until it is ahead of me. - * \retval 1 -- find entry with lock in cache, the caller needs to do - * nothing. - * \retval 0 -- find entry in cache, but without lock, the caller needs - * refresh from MDS. - * \retval others -- the caller need to process as non-statahead. + * revalidate @dentryp from statahead cache + * + * \param[in] dir parent directory + * \param[in] sai sai structure + * \param[out] dentryp pointer to dentry which will be revalidated + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success, dentry is saved in @dentryp + * \retval 0 if revalidation failed (no proper lock on client) + * \retval negative number upon error */ -int do_statahead_enter(struct inode *dir, struct dentry **dentryp, - int only_unplug) +static int revalidate_statahead_dentry(struct inode *dir, + struct ll_statahead_info *sai, + struct dentry **dentryp, + bool unplug) { - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct dentry *parent; - struct ll_sa_entry *entry; - struct ptlrpc_thread *thread; - struct l_wait_info lwi = { 0 }; - struct task_struct *task; - int rc = 0; - struct ll_inode_info *plli; + struct sa_entry *entry = NULL; + struct l_wait_info lwi = { 0 }; + struct ll_dentry_data *ldd; + struct ll_inode_info *lli; + int rc = 0; - LASSERT(lli->lli_opendir_pid == current_pid()); + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. + */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; - if (sai) { - thread = &sai->sai_thread; - if (unlikely(thread_is_stopped(thread) && - list_empty(&sai->sai_entries_stated))) { - /* to release resource */ - ll_stop_statahead(dir, lli->lli_opendir_key); + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; return -EAGAIN; } + } - if ((*dentryp)->d_name.name[0] == '.') { - if (sai->sai_ls_all || - sai->sai_miss_hidden >= sai->sai_skip_hidden) { - /* - * Hidden dentry is the first one, or statahead - * thread does not skip so many hidden dentries - * before "sai_ls_all" enabled as below. - */ - } else { - if (!sai->sai_ls_all) - /* - * It maybe because hidden dentry is not - * the first one, "sai_ls_all" was not - * set, then "ls -al" missed. Enable - * "sai_ls_all" for such case. - */ - sai->sai_ls_all = 1; + if (unplug) { + rc = 1; + goto out_unplug; + } - /* - * Such "getattr" has been skipped before - * "sai_ls_all" enabled as above. - */ - sai->sai_miss_hidden++; - return -EAGAIN; - } - } + entry = sa_get(sai, &(*dentryp)->d_name); + if (!entry) { + rc = -EAGAIN; + goto out_unplug; + } - entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); - if (!entry || only_unplug) { - ll_sai_unplug(sai, entry); - return entry ? 1 : -EAGAIN; - } + /* if statahead is busy in readdir, help it do post-work */ + if (!sa_ready(entry) && sai->sai_in_readpage) + sa_handle_callback(sai); - if (!ll_sa_entry_stated(entry)) { - sai->sai_index_wait = entry->se_index; - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, - LWI_ON_SIGNAL_NOOP, NULL); - rc = l_wait_event(sai->sai_waitq, - ll_sa_entry_stated(entry) || - thread_is_stopped(thread), - &lwi); - if (rc < 0) { - ll_sai_unplug(sai, entry); - return -EAGAIN; - } + if (!sa_ready(entry)) { + sai->sai_index_wait = entry->se_index; + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, sa_ready(entry), &lwi); + if (rc < 0) { + /* + * entry may not be ready, so it may be used by inflight + * statahead RPC, don't free it. + */ + entry = NULL; + rc = -EAGAIN; + goto out_unplug; } + } - if (entry->se_stat == SA_ENTRY_SUCC && entry->se_inode) { - struct inode *inode = entry->se_inode; - struct lookup_intent it = { .it_op = IT_GETATTR, - .it_lock_handle = - entry->se_handle }; - __u64 bits; - - rc = md_revalidate_lock(ll_i2mdexp(dir), &it, - ll_inode2fid(inode), &bits); - if (rc == 1) { - if (!d_inode(*dentryp)) { - struct dentry *alias; - - alias = ll_splice_alias(inode, - *dentryp); - if (IS_ERR(alias)) { - ll_sai_unplug(sai, entry); - return PTR_ERR(alias); - } - *dentryp = alias; - } else if (d_inode(*dentryp) != inode) { - /* revalidate, but inode is recreated */ - CDEBUG(D_READA, "%s: stale dentry %pd inode "DFID", statahead inode "DFID"\n", - ll_get_fsname(d_inode(*dentryp)->i_sb, NULL, 0), - *dentryp, - PFID(ll_inode2fid(d_inode(*dentryp))), - PFID(ll_inode2fid(inode))); - ll_sai_unplug(sai, entry); - return -ESTALE; - } else { - iput(inode); + if (entry->se_state == SA_ENTRY_SUCC && entry->se_inode) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .it_lock_handle = entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if (!(*dentryp)->d_inode) { + struct dentry *alias; + + alias = ll_splice_alias(inode, *dentryp); + if (IS_ERR(alias)) { + rc = PTR_ERR(alias); + goto out_unplug; } + *dentryp = alias; + /** + * statahead prepared this inode, transfer inode + * refcount from sa_entry to dentry + */ entry->se_inode = NULL; - - if ((bits & MDS_INODELOCK_LOOKUP) && - d_lustre_invalid(*dentryp)) - d_lustre_revalidate(*dentryp); - ll_intent_release(&it); + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "%s: stale dentry %pd inode "DFID", statahead inode "DFID"\n", + ll_get_fsname((*dentryp)->d_inode->i_sb, + NULL, 0), + *dentryp, + PFID(ll_inode2fid((*dentryp)->d_inode)), + PFID(ll_inode2fid(inode))); + rc = -ESTALE; + goto out_unplug; } - } - ll_sai_unplug(sai, entry); - return rc; + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) + d_lustre_revalidate(*dentryp); + ll_intent_release(&it); + } } +out_unplug: + /* + * statahead cached sa_entry can be used only once, and will be killed + * right after use, so if lookup/revalidate accessed statahead cache, + * set dentry ldd_sa_generation to parent lli_sa_generation, later if we + * stat this file again, we know we've done statahead before, see + * dentry_may_statahead(). + */ + ldd = ll_d2d(*dentryp); + lli = ll_i2info(dir); + /* ldd can be NULL if llite lookup failed. */ + if (ldd) + ldd->lld_sa_generation = lli->lli_sa_generation; + sa_put(sai, entry); + return rc; +} + +/** + * start statahead thread + * + * \param[in] dir parent directory + * \param[in] dentry dentry that triggers statahead, normally the first + * dirent under @dir + * \retval -EAGAIN on success, because when this function is + * called, it's already in lookup call, so client should + * do it itself instead of waiting for statahead thread + * to do it asynchronously. + * \retval negative number upon error + */ +static int start_statahead_thread(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct task_struct *task; + struct dentry *parent = dentry->d_parent; + int rc; /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ - rc = is_first_dirent(dir, *dentryp); - if (rc == LS_NONE_FIRST_DE) { + rc = is_first_dirent(dir, dentry); + if (rc == LS_NOT_FIRST_DE) { /* It is not "ls -{a}l" operation, no need statahead for it. */ - rc = -EAGAIN; + rc = -EFAULT; goto out; } - sai = ll_sai_alloc(); + sai = ll_sai_alloc(parent); if (!sai) { rc = -ENOMEM; goto out; } sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); - sai->sai_inode = igrab(dir); - if (unlikely(!sai->sai_inode)) { - CWARN("Do not start stat ahead on dying inode "DFID"\n", - PFID(&lli->lli_fid)); - rc = -ESTALE; - goto out; - } - - /* get parent reference count here, and put it in ll_statahead_thread */ - parent = dget((*dentryp)->d_parent); - if (unlikely(sai->sai_inode != d_inode(parent))) { - struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); - - CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", - *dentryp, - PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); - dput(parent); - iput(sai->sai_inode); - rc = -EAGAIN; + /* + * if current lli_opendir_key was deauthorized, or dir re-opened by + * another process, don't start statahead, otherwise the newly spawned + * statahead thread won't be notified to quit. + */ + spin_lock(&lli->lli_sa_lock); + if (unlikely(lli->lli_sai || lli->lli_opendir_key || + lli->lli_opendir_pid != current->pid)) { + spin_unlock(&lli->lli_sa_lock); + rc = -EPERM; goto out; } + lli->lli_sai = sai; + spin_unlock(&lli->lli_sa_lock); - CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n", - sai, parent); + atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_running); - /* The sai buffer already has one reference taken at allocation time, - * but as soon as we expose the sai by attaching it to the lli that - * default reference can be dropped by another thread calling - * ll_stop_statahead. We need to take a local reference to protect - * the sai buffer while we intend to access it. - */ - ll_sai_get(sai); - lli->lli_sai = sai; + CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n", + current_pid(), parent); - plli = ll_i2info(d_inode(parent)); task = kthread_run(ll_statahead_thread, parent, "ll_sa_%u", - plli->lli_opendir_pid); + lli->lli_opendir_pid); thread = &sai->sai_thread; if (IS_ERR(task)) { rc = PTR_ERR(task); - CERROR("can't start ll_sa thread, rc: %d\n", rc); - dput(parent); - lli->lli_opendir_key = NULL; - thread_set_flags(thread, SVC_STOPPED); - thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); - /* Drop both our own local reference and the default - * reference from allocation time. - */ - ll_sai_put(sai); - ll_sai_put(sai); - LASSERT(!lli->lli_sai); - return -EAGAIN; + CERROR("can't start ll_sa thread, rc : %d\n", rc); + goto out; } l_wait_event(thread->t_ctl_waitq, @@ -1694,10 +1598,47 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, return -EAGAIN; out: - kfree(sai); + /* + * once we start statahead thread failed, disable statahead so + * that subsequent stat won't waste time to try it. + */ spin_lock(&lli->lli_sa_lock); - lli->lli_opendir_key = NULL; - lli->lli_opendir_pid = 0; + lli->lli_sa_enabled = 0; + lli->lli_sai = NULL; spin_unlock(&lli->lli_sa_lock); + if (sai) + ll_sai_free(sai); return rc; } + +/** + * statahead entry function, this is called when client getattr on a file, it + * will start statahead thread if this is the first dir entry, else revalidate + * dentry from statahead cache. + * + * \param[in] dir parent directory + * \param[out] dentryp dentry to getattr + * \param[in] unplug unplug statahead window only (normally for negative + * dentry) + * \retval 1 on success + * \retval 0 revalidation from statahead cache failed, caller needs + * to getattr from server directly + * \retval negative number on error, caller often ignores this and + * then getattr from server + */ +int ll_statahead(struct inode *dir, struct dentry **dentryp, bool unplug) +{ + struct ll_statahead_info *sai; + + sai = ll_sai_get(dir); + if (sai) { + int rc; + + rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); + CDEBUG(D_READA, "revalidate statahead %pd: %d.\n", + *dentryp, rc); + ll_sai_put(sai); + return rc; + } + return start_statahead_thread(dir, *dentryp); +} diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c index 3dd7e0eb0b54..106cd00910a7 100644 --- a/drivers/staging/lustre/lustre/llite/super25.c +++ b/drivers/staging/lustre/lustre/llite/super25.c @@ -34,7 +34,6 @@ #include <linux/module.h> #include <linux/types.h> -#include "../include/lustre_lite.h" #include "../include/lustre_ha.h" #include "../include/lustre_dlm.h" #include <linux/init.h> @@ -83,8 +82,6 @@ struct super_operations lustre_super_operations = { }; MODULE_ALIAS_FS("lustre"); -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); - static int __init lustre_init(void) { lnet_process_id_t lnet_id; @@ -102,8 +99,8 @@ static int __init lustre_init(void) rc = -ENOMEM; ll_inode_cachep = kmem_cache_create("lustre_inode_cache", - sizeof(struct ll_inode_info), - 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, + sizeof(struct ll_inode_info), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ll_inode_cachep) goto out_cache; diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index 8c8bdfe1ad71..f8bc7ed59646 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -35,7 +35,6 @@ #include <linux/stat.h> #define DEBUG_SUBSYSTEM S_LLITE -#include "../include/lustre_lite.h" #include "llite_internal.h" static int ll_readlink_internal(struct inode *inode, @@ -80,17 +79,17 @@ static int ll_readlink_internal(struct inode *inode, } body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); - if ((body->valid & OBD_MD_LINKNAME) == 0) { + if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { CERROR("OBD_MD_LINKNAME not set on reply\n"); rc = -EPROTO; goto failed; } LASSERT(symlen != 0); - if (body->eadatasize != symlen) { + if (body->mbo_eadatasize != symlen) { CERROR("%s: inode "DFID": symlink length %d not expected %d\n", ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), body->eadatasize - 1, + PFID(ll_inode2fid(inode)), body->mbo_eadatasize - 1, symlen - 1); rc = -EPROTO; goto failed; @@ -155,8 +154,8 @@ const struct inode_operations ll_fast_symlink_inode_operations = { .get_link = ll_get_link, .getattr = ll_getattr, .permission = ll_inode_permission, - .setxattr = ll_setxattr, - .getxattr = ll_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = ll_listxattr, - .removexattr = ll_removexattr, + .removexattr = generic_removexattr, }; diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c index e623216e962d..8aa8ecc09a48 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_dev.c +++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c @@ -38,7 +38,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd.h" -#include "../include/lustre_lite.h" #include "llite_internal.h" #include "vvp_internal.h" @@ -368,12 +367,6 @@ int cl_sb_fini(struct super_block *sb) CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); result = PTR_ERR(env); } - /* - * If mount failed (sbi->ll_cl == NULL), and this there are no other - * mounts, stop device types manually (this usually happens - * automatically when last device is destroyed). - */ - lu_types_stop(); return result; } diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h index 79fc428461ed..5802da81cd0e 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -217,11 +217,12 @@ struct vvp_object { struct list_head vob_pending_list; /** - * Access this counter is protected by inode->i_sem. Now that - * the lifetime of transient pages must be covered by inode sem, - * we don't need to hold any lock.. + * Number of transient pages. This is no longer protected by i_sem, + * and needs to be atomic. This is not actually used for anything, + * and can probably be removed. */ - int vob_transient_pages; + atomic_t vob_transient_pages; + /** * Number of outstanding mmaps on this file. * @@ -247,9 +248,9 @@ struct vvp_object { */ struct vvp_page { struct cl_page_slice vpg_cl; - int vpg_defer_uptodate; - int vpg_ra_used; - int vpg_write_queued; + unsigned int vpg_defer_uptodate:1, + vpg_ra_used:1, + vpg_write_queued:1; /** * Non-empty iff this page is already counted in * vvp_object::vob_pending_list. This list is only used as a flag, diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 94916dcc6caa..2ab450359b6d 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -38,7 +38,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd.h" -#include "../include/lustre_lite.h" #include "llite_internal.h" #include "vvp_internal.h" @@ -628,7 +627,7 @@ static int vvp_io_setattr_time(const struct lu_env *env, attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; valid |= CAT_MTIME; } - result = cl_object_attr_set(env, obj, attr, valid); + result = cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); return result; @@ -821,7 +820,7 @@ static void write_commit_callback(const struct lu_env *env, struct cl_io *io, cl_page_disown(env, io, page); /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", io); + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); cl_page_put(env, page); } @@ -959,10 +958,30 @@ static int vvp_io_write_start(const struct lu_env *env, CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); - if (!vio->vui_iter) /* from a temp io in ll_cl_init(). */ + if (!vio->vui_iter) { + /* from a temp io in ll_cl_init(). */ result = 0; - else - result = generic_file_write_iter(vio->vui_iocb, vio->vui_iter); + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * trucates, etc. is handled in the higher layers of lustre. + */ + bool lock_node = !IS_NOSEC(inode); + + if (lock_node) + inode_lock(inode); + result = __generic_file_write_iter(vio->vui_iocb, + vio->vui_iter); + if (lock_node) + inode_unlock(inode); + + if (result > 0 || result == -EIOCBQUEUED) + result = generic_write_sync(vio->vui_iocb, result); + } if (result > 0) { result = vvp_io_write_commit(env, io); diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c index 64be0c9df35b..07eb26cc43f5 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_lock.c +++ b/drivers/staging/lustre/lustre/llite/vvp_lock.c @@ -37,7 +37,6 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "vvp_internal.h" diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c index 2c520b0bf6ca..b57195d15674 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_object.c +++ b/drivers/staging/lustre/lustre/llite/vvp_object.c @@ -39,7 +39,6 @@ #include "../../include/linux/libcfs/libcfs.h" #include "../include/obd.h" -#include "../include/lustre_lite.h" #include "llite_internal.h" #include "vvp_internal.h" @@ -68,8 +67,8 @@ static int vvp_object_print(const struct lu_env *env, void *cookie, (*p)(env, cookie, "(%s %d %d) inode: %p ", list_empty(&obj->vob_pending_list) ? "-" : "+", - obj->vob_transient_pages, atomic_read(&obj->vob_mmap_cnt), - inode); + atomic_read(&obj->vob_transient_pages), + atomic_read(&obj->vob_mmap_cnt), inode); if (inode) { lli = ll_i2info(inode); (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, @@ -102,8 +101,8 @@ static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, return 0; /* layers below have to fill in the rest */ } -static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid) { struct inode *inode = vvp_object_inode(obj); @@ -120,7 +119,7 @@ static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, if (0 && valid & CAT_SIZE) i_size_write(inode, attr->cat_size); /* not currently necessary */ - if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE)) + if (0 && valid & (CAT_UID | CAT_GID | CAT_SIZE)) mark_inode_dirty(inode); return 0; } @@ -210,7 +209,7 @@ static const struct cl_object_operations vvp_ops = { .coo_lock_init = vvp_lock_init, .coo_io_init = vvp_io_init, .coo_attr_get = vvp_attr_get, - .coo_attr_set = vvp_attr_set, + .coo_attr_update = vvp_attr_update, .coo_conf_set = vvp_conf_set, .coo_prune = vvp_prune, .coo_glimpse = vvp_object_glimpse @@ -221,7 +220,7 @@ static int vvp_object_init0(const struct lu_env *env, const struct cl_object_conf *conf) { vob->vob_inode = conf->coc_inode; - vob->vob_transient_pages = 0; + atomic_set(&vob->vob_transient_pages, 0); cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); return 0; } diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c index 2e566d90bb94..5d79efc1aafe 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -44,8 +44,6 @@ #include <linux/page-flags.h> #include <linux/pagemap.h> -#include "../include/lustre_lite.h" - #include "llite_internal.h" #include "vvp_internal.h" @@ -249,7 +247,7 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret set_bit(AS_EIO, &inode->i_mapping->flags); if ((ioret == -ESHUTDOWN || ioret == -EINTR) && - obj->vob_discard_page_warned == 0) { + obj->vob_discard_page_warned == 0) { obj->vob_discard_page_warned = 1; ll_dirty_page_discard_warn(vmpage, ioret); } @@ -444,18 +442,10 @@ static int vvp_transient_page_prep(const struct lu_env *env, return 0; } -static void vvp_transient_page_verify(const struct cl_page *page) -{ - struct inode *inode = vvp_object_inode(page->cp_obj); - - LASSERT(!inode_trylock(inode)); -} - static int vvp_transient_page_own(const struct lu_env *env, const struct cl_page_slice *slice, struct cl_io *unused, int nonblock) { - vvp_transient_page_verify(slice->cpl_page); return 0; } @@ -463,21 +453,18 @@ static void vvp_transient_page_assume(const struct lu_env *env, const struct cl_page_slice *slice, struct cl_io *unused) { - vvp_transient_page_verify(slice->cpl_page); } static void vvp_transient_page_unassume(const struct lu_env *env, const struct cl_page_slice *slice, struct cl_io *unused) { - vvp_transient_page_verify(slice->cpl_page); } static void vvp_transient_page_disown(const struct lu_env *env, const struct cl_page_slice *slice, struct cl_io *unused) { - vvp_transient_page_verify(slice->cpl_page); } static void vvp_transient_page_discard(const struct lu_env *env, @@ -486,8 +473,6 @@ static void vvp_transient_page_discard(const struct lu_env *env, { struct cl_page *page = slice->cpl_page; - vvp_transient_page_verify(slice->cpl_page); - /* * For transient pages, remove it from the radix tree. */ @@ -511,7 +496,6 @@ vvp_transient_page_completion(const struct lu_env *env, const struct cl_page_slice *slice, int ioret) { - vvp_transient_page_verify(slice->cpl_page); } static void vvp_transient_page_fini(const struct lu_env *env, @@ -522,8 +506,7 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct vvp_object *clobj = cl2vvp(clp->cp_obj); vvp_page_fini_common(vpg); - LASSERT(!inode_trylock(clobj->vob_inode)); - clobj->vob_transient_pages--; + atomic_dec(&clobj->vob_transient_pages); } static const struct cl_page_operations vvp_transient_page_ops = { @@ -549,7 +532,7 @@ static const struct cl_page_operations vvp_transient_page_ops = { }; int vvp_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) + struct cl_page *page, pgoff_t index) { struct vvp_page *vpg = cl_object_page_slice(obj, page); struct page *vmpage = page->cp_vmpage; @@ -570,10 +553,9 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct vvp_object *clobj = cl2vvp(obj); - LASSERT(!inode_trylock(clobj->vob_inode)); cl_page_slice_add(page, &vpg->vpg_cl, obj, index, &vvp_transient_page_ops); - clobj->vob_transient_pages++; + atomic_inc(&clobj->vob_transient_pages); } return 0; } diff --git a/drivers/staging/lustre/lustre/llite/vvp_req.c b/drivers/staging/lustre/lustre/llite/vvp_req.c index 9fe9d6c0a7d1..e3f4c790d646 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_req.c +++ b/drivers/staging/lustre/lustre/llite/vvp_req.c @@ -32,7 +32,6 @@ #include "../include/cl_object.h" #include "../include/obd.h" #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "llite_internal.h" #include "vvp_internal.h" @@ -83,8 +82,10 @@ static void vvp_req_attr_set(const struct lu_env *env, } obdo_from_inode(oa, inode, valid_flags & flags); obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) + oa->o_parent_oid++; memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, - JOBSTATS_JOBID_SIZE); + LUSTRE_JOBID_SIZE); } static void vvp_req_completion(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c index 98303cf85815..e070adb7a3cc 100644 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ b/drivers/staging/lustre/lustre/llite/xattr.c @@ -38,21 +38,12 @@ #define DEBUG_SUBSYSTEM S_LLITE #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "../include/lustre_ver.h" #include "../include/lustre_eacl.h" #include "llite_internal.h" -#define XATTR_USER_T (1) -#define XATTR_TRUSTED_T (2) -#define XATTR_SECURITY_T (3) -#define XATTR_ACL_ACCESS_T (4) -#define XATTR_ACL_DEFAULT_T (5) -#define XATTR_LUSTRE_T (6) -#define XATTR_OTHER_T (7) - static int get_xattr_type(const char *name) { @@ -99,46 +90,57 @@ int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) return 0; } -static -int ll_setxattr_common(struct inode *inode, const char *name, - const void *value, size_t size, - int flags, __u64 valid) +static int +ll_xattr_set_common(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) { + char fullname[strlen(handler->prefix) + strlen(name) + 1]; struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; - int xattr_type, rc; const char *pv = value; + __u64 valid; + int rc; + + if (flags == XATTR_REPLACE) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + valid = OBD_MD_FLXATTRRM; + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + valid = OBD_MD_FLXATTR; + } - xattr_type = get_xattr_type(name); - rc = xattr_type_filter(sbi, xattr_type); + rc = xattr_type_filter(sbi, handler->flags); if (rc) return rc; - if ((xattr_type == XATTR_ACL_ACCESS_T || - xattr_type == XATTR_ACL_DEFAULT_T) && + if ((handler->flags == XATTR_ACL_ACCESS_T || + handler->flags == XATTR_ACL_DEFAULT_T) && !inode_owner_or_capable(inode)) return -EPERM; /* b10667: ignore lustre special xattr for now */ - if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) || - (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0)) + if ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || + (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov"))) return 0; /* b15587: ignore security.capability xattr for now */ - if ((xattr_type == XATTR_SECURITY_T && - strcmp(name, "security.capability") == 0)) + if ((handler->flags == XATTR_SECURITY_T && + !strcmp(name, "capability"))) return 0; /* LU-549: Disable security.selinux when selinux is disabled */ - if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && - strcmp(name, "security.selinux") == 0) + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "selinux") == 0) return -EOPNOTSUPP; + sprintf(fullname, "%s%s\n", handler->prefix, name); rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, pv, size, 0, flags, + valid, fullname, pv, size, 0, flags, ll_i2suppgid(inode), &req); if (rc) { - if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); sbi->ll_flags &= ~LL_SBI_USER_XATTR; } @@ -149,8 +151,10 @@ int ll_setxattr_common(struct inode *inode, const char *name, return 0; } -int ll_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, int flags) +static int ll_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) { LASSERT(inode); LASSERT(name); @@ -158,20 +162,24 @@ int ll_setxattr(struct dentry *dentry, struct inode *inode, CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", PFID(ll_inode2fid(inode)), inode, name); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); - - if ((strncmp(name, XATTR_TRUSTED_PREFIX, - sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && - strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || - (strncmp(name, XATTR_LUSTRE_PREFIX, - sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && - strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + if (!strcmp(name, "lov")) { struct lov_user_md *lump = (struct lov_user_md *)value; + int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : + LPROC_LL_SETXATTR; int rc = 0; + ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); + if (size != 0 && size < sizeof(struct lov_user_md)) return -EINVAL; + /* + * It is possible to set an xattr to a "" value of zero size. + * For this case we are going to treat it as a removal. + */ + if (!size && lump) + lump = NULL; + /* Attributes that are saved via getxattr will always have * the stripe_offset as 0. Instead, the MDS should be * allowed to pick the starting OST index. b=17846 @@ -181,12 +189,15 @@ int ll_setxattr(struct dentry *dentry, struct inode *inode, if (lump && S_ISREG(inode->i_mode)) { __u64 it_flags = FMODE_WRITE; - int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ? - sizeof(*lump) : sizeof(struct lov_user_md_v3); + int lum_size; + + lum_size = ll_lov_user_md_size(lump); + if (lum_size < 0 || size < lum_size) + return 0; /* b=10667: ignore error */ rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, lum_size); - /* b10667: rc always be 0 here for now */ + /* b=10667: rc always be 0 here for now */ rc = 0; } else if (S_ISDIR(inode->i_mode)) { rc = ll_dir_setstripe(inode, lump, 0); @@ -194,92 +205,27 @@ int ll_setxattr(struct dentry *dentry, struct inode *inode, return rc; - } else if (strcmp(name, XATTR_NAME_LMA) == 0 || - strcmp(name, XATTR_NAME_LINK) == 0) + } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); return 0; + } - return ll_setxattr_common(inode, name, value, size, flags, - OBD_MD_FLXATTR); -} - -int ll_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = d_inode(dentry); - - LASSERT(inode); - LASSERT(name); - - CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", - PFID(ll_inode2fid(inode)), inode, name); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); - return ll_setxattr_common(inode, name, NULL, 0, 0, - OBD_MD_FLXATTRRM); + return ll_xattr_set_common(handler, dentry, inode, name, value, size, + flags); } -static -int ll_getxattr_common(struct inode *inode, const char *name, - void *buffer, size_t size, __u64 valid) +int +ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, + size_t size, __u64 valid) { + struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *req = NULL; struct mdt_body *body; - int xattr_type, rc; void *xdata; - struct ll_inode_info *lli = ll_i2info(inode); - - CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - /* listxattr have slightly different behavior from of ext3: - * without 'user_xattr' ext3 will list all xattr names but - * filtered out "^user..*"; we list them all for simplicity. - */ - if (!name) { - xattr_type = XATTR_OTHER_T; - goto do_getxattr; - } + int rc; - xattr_type = get_xattr_type(name); - rc = xattr_type_filter(sbi, xattr_type); - if (rc) - return rc; - - /* b15587: ignore security.capability xattr for now */ - if ((xattr_type == XATTR_SECURITY_T && - strcmp(name, "security.capability") == 0)) - return -ENODATA; - - /* LU-549: Disable security.selinux when selinux is disabled */ - if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && - strcmp(name, "security.selinux") == 0) - return -EOPNOTSUPP; - -#ifdef CONFIG_FS_POSIX_ACL - /* posix acl is under protection of LOOKUP lock. when calling to this, - * we just have path resolution to the target inode, so we have great - * chance that cached ACL is uptodate. - */ - if (xattr_type == XATTR_ACL_ACCESS_T) { - struct posix_acl *acl; - - spin_lock(&lli->lli_lock); - acl = posix_acl_dup(lli->lli_posix_acl); - spin_unlock(&lli->lli_lock); - - if (!acl) - return -ENODATA; - - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return rc; - } - if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) - return -ENODATA; -#endif - -do_getxattr: - if (sbi->ll_xattr_cache_enabled && xattr_type != XATTR_ACL_ACCESS_T) { + if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T) { rc = ll_xattr_cache_get(inode, name, buffer, size, valid); if (rc == -EAGAIN) goto getxattr_nocache; @@ -311,36 +257,36 @@ getxattr_nocache: /* only detect the xattr size */ if (size == 0) { - rc = body->eadatasize; + rc = body->mbo_eadatasize; goto out; } - if (size < body->eadatasize) { + if (size < body->mbo_eadatasize) { CERROR("server bug: replied size %u > %u\n", - body->eadatasize, (int)size); + body->mbo_eadatasize, (int)size); rc = -ERANGE; goto out; } - if (body->eadatasize == 0) { + if (body->mbo_eadatasize == 0) { rc = -ENODATA; goto out; } /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->eadatasize); + body->mbo_eadatasize); if (!xdata) { rc = -EFAULT; goto out; } - memcpy(buffer, xdata, body->eadatasize); - rc = body->eadatasize; + memcpy(buffer, xdata, body->mbo_eadatasize); + rc = body->mbo_eadatasize; } out_xattr: - if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { LCONSOLE_INFO( "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n", ll_get_fsname(inode->i_sb, NULL, 0), rc); @@ -351,8 +297,65 @@ out: return rc; } -ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) +static int ll_xattr_get_common(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + char fullname[strlen(handler->prefix) + strlen(name) + 1]; + struct ll_sb_info *sbi = ll_i2sbi(inode); +#ifdef CONFIG_FS_POSIX_ACL + struct ll_inode_info *lli = ll_i2info(inode); +#endif + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(ll_inode2fid(inode)), inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + rc = xattr_type_filter(sbi, handler->flags); + if (rc) + return rc; + + /* b15587: ignore security.capability xattr for now */ + if ((handler->flags == XATTR_SECURITY_T && !strcmp(name, "capability"))) + return -ENODATA; + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && + !strcmp(name, "selinux")) + return -EOPNOTSUPP; + +#ifdef CONFIG_FS_POSIX_ACL + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. + */ + if (handler->flags == XATTR_ACL_ACCESS_T) { + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + return -ENODATA; + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + return rc; + } + if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + return -ENODATA; +#endif + sprintf(fullname, "%s%s\n", handler->prefix, name); + return ll_xattr_list(inode, fullname, handler->flags, buffer, size, + OBD_MD_FLXATTR); +} + +static int ll_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) { LASSERT(inode); LASSERT(name); @@ -360,36 +363,23 @@ ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode, CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), xattr %s\n", PFID(ll_inode2fid(inode)), inode, name); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); - - if ((strncmp(name, XATTR_TRUSTED_PREFIX, - sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && - strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || - (strncmp(name, XATTR_LUSTRE_PREFIX, - sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && - strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + if (!strcmp(name, "lov")) { struct lov_stripe_md *lsm; struct lov_user_md *lump; struct lov_mds_md *lmm = NULL; struct ptlrpc_request *request = NULL; int rc = 0, lmmsize = 0; + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -ENODATA; - if (size == 0 && S_ISDIR(inode->i_mode)) { - /* XXX directory EA is fix for now, optimize to save - * RPC transfer - */ - rc = sizeof(struct lov_user_md); - goto out; - } - lsm = ccc_inode_lsm_get(inode); if (!lsm) { if (S_ISDIR(inode->i_mode)) { - rc = ll_dir_getstripe(inode, &lmm, - &lmmsize, &request); + rc = ll_dir_getstripe(inode, (void **)&lmm, + &lmmsize, &request, 0); } else { rc = -ENODATA; } @@ -439,7 +429,7 @@ out: return rc; } - return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR); + return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); } ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -457,7 +447,8 @@ ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); - rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS); + rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, + OBD_MD_FLXATTRLS); if (rc < 0) goto out; @@ -488,7 +479,8 @@ ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) if (!ll_i2info(inode)->lli_has_smd) rc2 = -1; } else if (S_ISDIR(inode->i_mode)) { - rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + rc2 = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, + &request, 0); } if (rc2 < 0) { @@ -518,3 +510,57 @@ out: return rc; } + +static const struct xattr_handler ll_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = XATTR_USER_T, + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +}; + +static const struct xattr_handler ll_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = XATTR_TRUSTED_T, + .get = ll_xattr_get, + .set = ll_xattr_set, +}; + +static const struct xattr_handler ll_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = XATTR_SECURITY_T, + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +}; + +static const struct xattr_handler ll_acl_access_xattr_handler = { + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = XATTR_ACL_ACCESS_T, + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +}; + +static const struct xattr_handler ll_acl_default_xattr_handler = { + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = XATTR_ACL_DEFAULT_T, + .get = ll_xattr_get_common, + .set = ll_xattr_set_common, +}; + +static const struct xattr_handler ll_lustre_xattr_handler = { + .prefix = XATTR_LUSTRE_PREFIX, + .flags = XATTR_LUSTRE_T, + .get = ll_xattr_get, + .set = ll_xattr_set, +}; + +const struct xattr_handler *ll_xattr_handlers[] = { + &ll_user_xattr_handler, + &ll_trusted_xattr_handler, + &ll_security_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &ll_acl_access_xattr_handler, + &ll_acl_default_xattr_handler, +#endif + &ll_lustre_xattr_handler, + NULL, +}; diff --git a/drivers/staging/lustre/lustre/llite/xattr_cache.c b/drivers/staging/lustre/lustre/llite/xattr_cache.c index 8089da8143d9..50a19a40bd4e 100644 --- a/drivers/staging/lustre/lustre/llite/xattr_cache.c +++ b/drivers/staging/lustre/lustre/llite/xattr_cache.c @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/mm.h> #include "../include/obd_support.h" -#include "../include/lustre_lite.h" #include "../include/lustre_dlm.h" #include "../include/lustre_ver.h" #include "llite_internal.h" @@ -270,10 +269,12 @@ static int ll_xattr_find_get_lock(struct inode *inode, struct lustre_handle lockh = { 0 }; struct md_op_data *op_data; struct ll_inode_info *lli = ll_i2info(inode); - struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, - .ei_mode = it_to_lock_mode(oit), - .ei_cb_bl = ll_md_blocking_ast, - .ei_cb_cp = ldlm_completion_ast }; + struct ldlm_enqueue_info einfo = { + .ei_type = LDLM_IBITS, + .ei_mode = it_to_lock_mode(oit), + .ei_cb_bl = &ll_md_blocking_ast, + .ei_cb_cp = &ldlm_completion_ast, + }; struct ll_sb_info *sbi = ll_i2sbi(inode); struct obd_export *exp = sbi->ll_md_exp; int rc; @@ -304,7 +305,7 @@ static int ll_xattr_find_get_lock(struct inode *inode, op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; - rc = md_enqueue(exp, &einfo, oit, op_data, &lockh, NULL, 0, NULL, 0); + rc = md_enqueue(exp, &einfo, NULL, oit, op_data, &lockh, 0); ll_finish_md_op_data(op_data); if (rc < 0) { @@ -380,25 +381,25 @@ static int ll_xattr_cache_refill(struct inode *inode, struct lookup_intent *oit) } /* do not need swab xattr data */ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->eadatasize); + body->mbo_eadatasize); xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, - body->aclsize); + body->mbo_aclsize); xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, - body->max_mdsize * sizeof(__u32)); + body->mbo_max_mdsize * sizeof(__u32)); if (!xdata || !xval || !xsizes) { CERROR("wrong setxattr reply\n"); rc = -EPROTO; goto out_destroy; } - xtail = xdata + body->eadatasize; - xvtail = xval + body->aclsize; + xtail = xdata + body->mbo_eadatasize; + xvtail = xval + body->mbo_aclsize; CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); ll_xattr_cache_init(lli); - for (i = 0; i < body->max_mdsize; i++) { + for (i = 0; i < body->mbo_max_mdsize; i++) { CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); /* Perform consistency checks: attr names and vals in pill */ if (!memchr(xdata, 0, xtail - xdata)) { diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c index a3d170aa6fd2..a5265f9b5797 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_fld.c +++ b/drivers/staging/lustre/lustre/lmv/lmv_fld.c @@ -47,18 +47,20 @@ #include "../include/lprocfs_status.h" #include "lmv_internal.h" -int lmv_fld_lookup(struct lmv_obd *lmv, - const struct lu_fid *fid, - u32 *mds) +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) { + struct obd_device *obd = lmv2obd_dev(lmv); int rc; - /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + /* + * FIXME: Currently ZFS still use local seq for ROOT unfortunately, and * this fid_is_local check should be removed once LU-2240 is fixed */ - LASSERTF((fid_seq_in_fldb(fid_seq(fid)) || - fid_seq_is_local_file(fid_seq(fid))) && - fid_is_sane(fid), DFID" is insane!\n", PFID(fid)); + if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid)))) { + CERROR("%s: invalid FID " DFID "\n", obd->obd_name, PFID(fid)); + return -EINVAL; + } rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, LU_SEQ_RANGE_MDT, NULL); diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c index 2f58fdab8d1e..9f4e826bb0af 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_intent.c +++ b/drivers/staging/lustre/lustre/lmv/lmv_intent.c @@ -43,13 +43,13 @@ #include "../include/lustre_lib.h" #include "../include/lustre_net.h" #include "../include/lustre_dlm.h" +#include "../include/lustre_mdc.h" #include "../include/obd_class.h" #include "../include/lprocfs_status.h" #include "lmv_internal.h" -static int lmv_intent_remote(struct obd_export *exp, void *lmm, - int lmmsize, struct lookup_intent *it, - const struct lu_fid *parent_fid, int flags, +static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it, + const struct lu_fid *parent_fid, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) @@ -68,7 +68,7 @@ static int lmv_intent_remote(struct obd_export *exp, void *lmm, if (!body) return -EPROTO; - LASSERT((body->valid & OBD_MD_MDS)); + LASSERT((body->mbo_valid & OBD_MD_MDS)); /* * Unfortunately, we have to lie to MDC/MDS to retrieve @@ -87,9 +87,9 @@ static int lmv_intent_remote(struct obd_export *exp, void *lmm, it->it_request = NULL; } - LASSERT(fid_is_sane(&body->fid1)); + LASSERT(fid_is_sane(&body->mbo_fid1)); - tgt = lmv_find_target(lmv, &body->fid1); + tgt = lmv_find_target(lmv, &body->mbo_fid1); if (IS_ERR(tgt)) { rc = PTR_ERR(tgt); goto out; @@ -101,7 +101,7 @@ static int lmv_intent_remote(struct obd_export *exp, void *lmm, goto out; } - op_data->op_fid1 = body->fid1; + op_data->op_fid1 = body->mbo_fid1; /* Sent the parent FID to the remote MDT */ if (parent_fid) { /* The parent fid is only for remote open to @@ -110,18 +110,14 @@ static int lmv_intent_remote(struct obd_export *exp, void *lmm, */ LASSERT(it->it_op & IT_OPEN); op_data->op_fid2 = *parent_fid; - /* Add object FID to op_fid3, in case it needs to check stale - * (M_CHECK_STALE), see mdc_finish_intent_lock - */ - op_data->op_fid3 = body->fid1; } op_data->op_bias = MDS_CROSS_REF; - CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n", - PFID(&body->fid1), tgt->ltd_idx); + CDEBUG(D_INODE, "REMOTE_INTENT with fid=" DFID " -> mds #%u\n", + PFID(&body->mbo_fid1), tgt->ltd_idx); - rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, - flags, &req, cb_blocking, extra_lock_flags); + rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking, + extra_lock_flags); if (rc) goto out_free_op_data; @@ -136,8 +132,10 @@ static int lmv_intent_remote(struct obd_export *exp, void *lmm, it->it_remote_lock_mode = it->it_lock_mode; } - it->it_lock_handle = plock.cookie; - it->it_lock_mode = pmode; + if (pmode) { + it->it_lock_handle = plock.cookie; + it->it_lock_mode = pmode; + } out_free_op_data: kfree(op_data); @@ -150,13 +148,126 @@ out: return rc; } +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + struct md_op_data *op_data; + int rc = 0, i; + + /** + * revalidate slaves has some problems, temporarily return, + * we may not need that + */ + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) + return -ENOMEM; + + /** + * Loop over the stripe information, check validity and update them + * from MDS if needed. + */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lookup_intent it = { .it_op = IT_GETATTR }; + struct lustre_handle *lockh = NULL; + struct lmv_tgt_desc *tgt = NULL; + struct inode *inode; + struct lu_fid fid; + + fid = lsm->lsm_md_oinfo[i].lmo_fid; + inode = lsm->lsm_md_oinfo[i].lmo_root; + + /* + * Prepare op_data for revalidating. Note that @fid2 shluld be + * defined otherwise it will go to server and take new lock + * which is not needed here. + */ + memset(op_data, 0, sizeof(*op_data)); + op_data->op_fid1 = fid; + op_data->op_fid2 = fid; + + tgt = lmv_locate_mds(lmv, op_data, &fid); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto cleanup; + } + + CDEBUG(D_INODE, "Revalidate slave " DFID " -> mds #%u\n", + PFID(&fid), tgt->ltd_idx); + + if (req) { + ptlrpc_req_finished(req); + req = NULL; + } + + rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req, + cb_blocking, extra_lock_flags); + if (rc < 0) + goto cleanup; + + lockh = (struct lustre_handle *)&it.it_lock_handle; + if (rc > 0 && !req) { + /* slave inode is still valid */ + CDEBUG(D_INODE, "slave "DFID" is still valid.\n", + PFID(&fid)); + rc = 0; + } else { + /* refresh slave from server */ + body = req_capsule_server_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(body); + + if (unlikely(body->mbo_nlink < 2)) { + CERROR("%s: nlink %d < 2 corrupt stripe %d "DFID":" DFID"\n", + obd->obd_name, body->mbo_nlink, i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + PFID(&lsm->lsm_md_oinfo[0].lmo_fid)); + + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + + rc = -EIO; + goto cleanup; + } + + i_size_write(inode, body->mbo_size); + inode->i_blocks = body->mbo_blocks; + set_nlink(inode, body->mbo_nlink); + LTIME_S(inode->i_atime) = body->mbo_atime; + LTIME_S(inode->i_ctime) = body->mbo_ctime; + LTIME_S(inode->i_mtime) = body->mbo_mtime; + } + + md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL); + + if (it.it_lock_mode && lockh) { + ldlm_lock_decref(lockh, it.it_lock_mode); + it.it_lock_mode = 0; + } + } + +cleanup: + if (req) + ptlrpc_req_finished(req); + + kfree(op_data); + return rc; +} + /* * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) * may be split dir. */ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, - void *lmm, int lmmsize, struct lookup_intent *it, - int flags, struct ptlrpc_request **reqp, + struct lookup_intent *it, + struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { @@ -166,35 +277,55 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, struct mdt_body *body; int rc; - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); + if (it->it_flags & MDS_OPEN_BY_FID) { + LASSERT(fid_is_sane(&op_data->op_fid2)); + + /* + * for striped directory, we can't know parent stripe fid + * without name, but we can set it to child fid, and MDT + * will obtain it from linkea in open in such case. + */ + if (op_data->op_mea1) + op_data->op_fid1 = op_data->op_fid2; + + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + op_data->op_mds = tgt->ltd_idx; + } else { + LASSERT(fid_is_sane(&op_data->op_fid1)); + LASSERT(fid_is_zero(&op_data->op_fid2)); + LASSERT(op_data->op_name); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + } /* If it is ready to open the file by FID, do not need * allocate FID at all, otherwise it will confuse MDT */ - if ((it->it_op & IT_CREAT) && - !(it->it_flags & MDS_OPEN_BY_FID)) { + if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) { /* - * For open with IT_CREATE and for IT_CREATE cases allocate new - * fid and setup FLD for it. + * For lookup(IT_CREATE) cases allocate new fid and setup FLD + * for it. */ - op_data->op_fid3 = op_data->op_fid2; - rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc != 0) return rc; } - CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%d\n", + CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); - rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags, - reqp, cb_blocking, extra_lock_flags); + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); if (rc != 0) return rc; /* - * Nothing is found, do not access body->fid1 as it is zero and thus + * Nothing is found, do not access body->mbo_fid1 as it is zero and thus * pointless. */ if ((it->it_disposition & DISP_LOOKUP_NEG) && @@ -205,31 +336,17 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); if (!body) return -EPROTO; - /* - * Not cross-ref case, just get out of here. - */ - if (likely(!(body->valid & OBD_MD_MDS))) - return 0; - /* - * Okay, MDS has returned success. Probably name has been resolved in - * remote inode. - */ - rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags, - reqp, cb_blocking, extra_lock_flags); - if (rc != 0) { - LASSERT(rc < 0); - /* - * This is possible, that some userspace application will try to - * open file as directory and we will have -ENOTDIR here. As - * this is normal situation, we should not print error here, - * only debug info. - */ - CDEBUG(D_INODE, "Can't handle remote %s: dir " DFID "(" DFID "):%*s: %d\n", - LL_IT2STR(it), PFID(&op_data->op_fid2), - PFID(&op_data->op_fid1), op_data->op_namelen, - op_data->op_name, rc); - return rc; + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, + cb_blocking, extra_lock_flags); + if (rc != 0) + return rc; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (!body) + return -EPROTO; } return rc; @@ -240,37 +357,102 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, */ static int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, - void *lmm, int lmmsize, struct lookup_intent *it, - int flags, struct ptlrpc_request **reqp, + struct lookup_intent *it, + struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { + struct lmv_stripe_md *lsm = op_data->op_mea1; struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt = NULL; struct mdt_body *body; int rc = 0; + /* + * If it returns ERR_PTR(-EBADFD) then it is an unknown hash type + * it will try all stripes to locate the object + */ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) + if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) return PTR_ERR(tgt); + /* + * Both migrating dir and unknown hash dir need to try + * all of sub-stripes + */ + if (lsm && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo = &lsm->lsm_md_oinfo[0]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + } + if (!fid_is_sane(&op_data->op_fid2)) fid_zero(&op_data->op_fid2); - CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID - ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1), - PFID(&op_data->op_fid2), + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), op_data->op_name ? op_data->op_name : "<NULL>", - tgt->ltd_idx); + tgt->ltd_idx, lsm, !lsm ? -1 : lsm->lsm_md_magic); op_data->op_bias &= ~MDS_CROSS_REF; - rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, - flags, reqp, cb_blocking, extra_lock_flags); + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, + extra_lock_flags); + if (rc < 0) + return rc; - if (rc < 0 || !*reqp) + if (!*reqp) { + /* + * If RPC happens, lsm information will be revalidated + * during update_inode process (see ll_update_lsm_md) + */ + if (op_data->op_mea2) { + rc = lmv_revalidate_slaves(exp, op_data->op_mea2, + cb_blocking, + extra_lock_flags); + if (rc != 0) + return rc; + } return rc; + } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm && + lmv_need_try_all_stripes(lsm)) { + /* + * For migrating and unknown hash type directory, it will + * try to target the entry on other stripes + */ + int stripe_index; + + for (stripe_index = 1; + stripe_index < lsm->lsm_md_stripe_count && + it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { + struct lmv_oinfo *oinfo; + + /* release the previous request */ + ptlrpc_req_finished(*reqp); + it->it_request = NULL; + *reqp = NULL; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + tgt = lmv_find_target(lmv, &oinfo->lmo_fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + CDEBUG(D_INODE, "Try other stripes " DFID"\n", + PFID(&oinfo->lmo_fid)); + + op_data->op_fid1 = oinfo->lmo_fid; + it->it_disposition &= ~DISP_ENQ_COMPLETE; + rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, + cb_blocking, extra_lock_flags); + if (rc) + return rc; + } + } /* * MDS has returned success. Probably name has been resolved in @@ -279,19 +461,23 @@ static int lmv_intent_lookup(struct obd_export *exp, body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); if (!body) return -EPROTO; - /* Not cross-ref case, just get out of here. */ - if (likely(!(body->valid & OBD_MD_MDS))) - return 0; - rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp, - cb_blocking, extra_lock_flags); + /* Not cross-ref case, just get out of here. */ + if (unlikely((body->mbo_valid & OBD_MD_MDS))) { + rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, + extra_lock_flags); + if (rc != 0) + return rc; + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (!body) + return -EPROTO; + } return rc; } int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - void *lmm, int lmmsize, struct lookup_intent *it, - int flags, struct ptlrpc_request **reqp, + struct lookup_intent *it, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { @@ -300,8 +486,9 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, LASSERT(fid_is_sane(&op_data->op_fid1)); - CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n", - LL_IT2STR(it), op_data->op_namelen, op_data->op_name, + CDEBUG(D_INODE, "INTENT LOCK '%s' for "DFID" '%*s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid2), + (int)op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1)); rc = lmv_check_connect(obd); @@ -309,14 +496,34 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, return rc; if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT)) - rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it, - flags, reqp, cb_blocking, + rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, extra_lock_flags); else if (it->it_op & IT_OPEN) - rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it, - flags, reqp, cb_blocking, + rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, extra_lock_flags); else LBUG(); + + if (rc < 0) { + struct lustre_handle lock_handle; + + if (it->it_lock_mode) { + lock_handle.cookie = it->it_lock_handle; + ldlm_lock_decref(&lock_handle, it->it_lock_mode); + } + + it->it_lock_handle = 0; + it->it_lock_mode = 0; + + if (it->it_remote_lock_mode) { + lock_handle.cookie = it->it_remote_lock_handle; + ldlm_lock_decref(&lock_handle, + it->it_remote_lock_mode); + } + + it->it_remote_lock_handle = 0; + it->it_remote_lock_mode = 0; + } + return rc; } diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h index 0beafc49b8d2..52b03745ac19 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_internal.h +++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -35,6 +35,7 @@ #include "../include/lustre/lustre_idl.h" #include "../include/obd.h" +#include "../include/lustre_lmv.h" #define LMV_MAX_TGT_COUNT 128 @@ -44,77 +45,116 @@ int lmv_check_connect(struct obd_device *obd); int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - void *lmm, int lmmsize, struct lookup_intent *it, - int flags, struct ptlrpc_request **reqp, + struct lookup_intent *it, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags); int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); -int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, - struct md_op_data *op_data); +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); -static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req) -{ - struct mdt_body *body; - struct lmv_stripe_md *mea; - - LASSERT(req); - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - if (!body || !S_ISDIR(body->mode) || !body->eadatasize) - return NULL; +int lmv_unpack_md(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, int stripe_count); - mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, - body->eadatasize); - if (mea->mea_count == 0) - return NULL; - if (mea->mea_magic != MEA_MAGIC_LAST_CHAR && - mea->mea_magic != MEA_MAGIC_ALL_CHARS && - mea->mea_magic != MEA_MAGIC_HASH_SEGMENT) - return NULL; +int lmv_revalidate_slaves(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + ldlm_blocking_callback cb_blocking, + int extra_lock_flags); - return mea; -} - -static inline int lmv_get_easize(struct lmv_obd *lmv) +static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) { - return sizeof(struct lmv_stripe_md) + - lmv->desc.ld_tgt_count * - sizeof(struct lu_fid); + return container_of0(lmv, struct obd_device, u.lmv); } static inline struct lmv_tgt_desc * -lmv_get_target(struct lmv_obd *lmv, u32 mds) +lmv_get_target(struct lmv_obd *lmv, u32 mdt_idx, int *index) { - int count = lmv->desc.ld_tgt_count; int i; - for (i = 0; i < count; i++) { + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { if (!lmv->tgts[i]) continue; - if (lmv->tgts[i]->ltd_idx == mds) + if (lmv->tgts[i]->ltd_idx == mdt_idx) { + if (index) + *index = i; return lmv->tgts[i]; + } } return ERR_PTR(-ENODEV); } -static inline struct lmv_tgt_desc * -lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +static inline int +lmv_find_target_index(struct lmv_obd *lmv, const struct lu_fid *fid) { - u32 mds = 0; - int rc; + struct lmv_tgt_desc *ltd; + u32 mdt_idx = 0; + int index = 0; if (lmv->desc.ld_tgt_count > 1) { - rc = lmv_fld_lookup(lmv, fid, &mds); - if (rc) - return ERR_PTR(rc); + int rc; + + rc = lmv_fld_lookup(lmv, fid, &mdt_idx); + if (rc < 0) + return rc; } - return lmv_get_target(lmv, mds); + ltd = lmv_get_target(lmv, mdt_idx, &index); + if (IS_ERR(ltd)) + return PTR_ERR(ltd); + + return index; +} + +static inline struct lmv_tgt_desc * +lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + int index; + + index = lmv_find_target_index(lmv, fid); + if (index < 0) + return ERR_PTR(index); + + return lmv->tgts[index]; +} + +static inline int lmv_stripe_md_size(int stripe_count) +{ + struct lmv_stripe_md *lsm; + + return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); +} + +int lmv_name_to_stripe_index(enum lmv_hash_type hashtype, + unsigned int max_mdt_index, + const char *name, int namelen); + +static inline const struct lmv_oinfo * +lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, + int namelen) +{ + int stripe_index; + + stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, + lsm->lsm_md_stripe_count, + name, namelen); + if (stripe_index < 0) + return ERR_PTR(stripe_index); + + LASSERTF(stripe_index < lsm->lsm_md_stripe_count, + "stripe_index = %d, stripe_count = %d hash_type = %x name = %.*s\n", + stripe_index, lsm->lsm_md_stripe_count, + lsm->lsm_md_hash_type, namelen, name); + + return &lsm->lsm_md_oinfo[stripe_index]; +} + +static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) +{ + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; } struct lmv_tgt_desc @@ -123,6 +163,6 @@ struct lmv_tgt_desc /* lproc_lmv.c */ void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars); -extern struct file_operations lmv_proc_target_fops; +extern const struct file_operations lmv_proc_target_fops; #endif diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c index 0e1588a43187..7dbb2b946acf 100644 --- a/drivers/staging/lustre/lustre/lmv/lmv_obd.c +++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c @@ -43,12 +43,13 @@ #include "../include/lustre/lustre_idl.h" #include "../include/obd_support.h" -#include "../include/lustre_lib.h" #include "../include/lustre_net.h" #include "../include/obd_class.h" +#include "../include/lustre_lmv.h" #include "../include/lprocfs_status.h" -#include "../include/lustre_lite.h" +#include "../include/cl_object.h" #include "../include/lustre_fid.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_kernelcomm.h" #include "lmv_internal.h" @@ -70,12 +71,12 @@ static void lmv_activate_target(struct lmv_obd *lmv, * -ENOTCONN: The UUID is found, but the target connection is bad (!) * -EBADF : The UUID is found, but the OBD of the wrong type (!) */ -static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid, +static int lmv_set_mdc_active(struct lmv_obd *lmv, const struct obd_uuid *uuid, int activate) { struct lmv_tgt_desc *uninitialized_var(tgt); struct obd_device *obd; - int i; + u32 i; int rc = 0; CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", @@ -244,36 +245,12 @@ static int lmv_connect(const struct lu_env *env, return rc; } -static void lmv_set_timeouts(struct obd_device *obd) -{ - struct lmv_obd *lmv; - int i; - - lmv = &obd->u.lmv; - if (lmv->server_timeout == 0) - return; - - if (lmv->connected == 0) - return; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - tgt = lmv->tgts[i]; - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) - continue; - - obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS), - KEY_INTERMDS, 0, NULL, NULL); - } -} - -static int lmv_init_ea_size(struct obd_export *exp, int easize, - int def_easize, int cookiesize, int def_cookiesize) +static int lmv_init_ea_size(struct obd_export *exp, u32 easize, u32 def_easize, + u32 cookiesize, u32 def_cookiesize) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; - int i; + u32 i; int rc = 0; int change = 0; @@ -420,6 +397,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, { struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt; + int orig_tgt_count = 0; int rc = 0; CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); @@ -489,14 +467,17 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, tgt->ltd_uuid = *uuidp; tgt->ltd_active = 0; lmv->tgts[index] = tgt; - if (index >= lmv->desc.ld_tgt_count) + if (index >= lmv->desc.ld_tgt_count) { + orig_tgt_count = lmv->desc.ld_tgt_count; lmv->desc.ld_tgt_count = index + 1; + } if (lmv->connected) { rc = lmv_connect_mdc(obd, tgt); if (rc) { spin_lock(&lmv->lmv_lock); - lmv->desc.ld_tgt_count--; + if (lmv->desc.ld_tgt_count == index + 1) + lmv->desc.ld_tgt_count = orig_tgt_count; memset(tgt, 0, sizeof(*tgt)); spin_unlock(&lmv->lmv_lock); } else { @@ -514,7 +495,7 @@ int lmv_check_connect(struct obd_device *obd) { struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt; - int i; + u32 i; int rc; int easize; @@ -554,10 +535,9 @@ int lmv_check_connect(struct obd_device *obd) goto out_disc; } - lmv_set_timeouts(obd); class_export_put(lmv->exp); lmv->connected = 1; - easize = lmv_get_easize(lmv); + easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC); lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0); mutex_unlock(&lmv->lmv_init_mutex); return 0; @@ -629,7 +609,7 @@ static int lmv_disconnect(struct obd_export *exp) struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; int rc; - int i; + u32 i; if (!lmv->tgts) goto out_local; @@ -758,7 +738,7 @@ static int lmv_hsm_req_count(struct lmv_obd *lmv, const struct hsm_user_request *hur, const struct lmv_tgt_desc *tgt_mds) { - int i, nr = 0; + u32 i, nr = 0; struct lmv_tgt_desc *curr_tgt; /* count how many requests must be sent to the given target */ @@ -885,10 +865,8 @@ static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len, rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, &kcd, sizeof(kcd)); - if (rc) { - if (filp) - fput(filp); - } + if (rc) + fput(filp); return rc; } @@ -899,10 +877,10 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, struct obd_device *obddev = class_exp2obd(exp); struct lmv_obd *lmv = &obddev->u.lmv; struct lmv_tgt_desc *tgt = NULL; - int i = 0; + u32 i = 0; int rc = 0; int set = 0; - int count = lmv->desc.ld_tgt_count; + u32 count = lmv->desc.ld_tgt_count; if (count == 0) return -ENOTTY; @@ -1011,6 +989,21 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); break; } + case LL_IOC_FID2MDTIDX: { + struct lu_fid *fid = karg; + int mdt_index; + + rc = lmv_fld_lookup(lmv, fid, &mdt_index); + if (rc) + return rc; + + /* + * Note: this is from llite(see ll_dir_ioctl()), @uarg does not + * point to user space memory for FID2MDTIDX. + */ + *(__u32 *)uarg = mdt_index; + break; + } case OBD_IOC_FID2PATH: { rc = lmv_fid2path(exp, len, karg, uarg); break; @@ -1169,32 +1162,37 @@ static int lmv_placement_policy(struct obd_device *obd, return 0; } + if (op_data->op_default_stripe_offset != -1) { + *mds = op_data->op_default_stripe_offset; + return 0; + } + /** * If stripe_offset is provided during setdirstripe * (setdirstripe -i xx), xx MDS will be chosen. */ - if (op_data->op_cli_flags & CLI_SET_MEA) { + if (op_data->op_cli_flags & CLI_SET_MEA && op_data->op_data) { struct lmv_user_md *lum; - lum = (struct lmv_user_md *)op_data->op_data; - if (lum->lum_type == LMV_STRIPE_TYPE && - lum->lum_stripe_offset != -1) { - if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) { - CERROR("%s: Stripe_offset %d > MDT count %d: rc = %d\n", - obd->obd_name, - lum->lum_stripe_offset, - lmv->desc.ld_tgt_count, -ERANGE); - return -ERANGE; - } - *mds = lum->lum_stripe_offset; - return 0; + lum = op_data->op_data; + if (le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { + *mds = le32_to_cpu(lum->lum_stripe_offset); + } else { + /* + * -1 means default, which will be in the same MDT with + * the stripe + */ + *mds = op_data->op_mds; + lum->lum_stripe_offset = cpu_to_le32(op_data->op_mds); } + } else { + /* + * Allocate new fid on target according to operation type and + * parent home mds. + */ + *mds = op_data->op_mds; } - /* Allocate new fid on target according to operation type and parent - * home mds. - */ - *mds = op_data->op_mds; return 0; } @@ -1203,7 +1201,7 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) struct lmv_tgt_desc *tgt; int rc; - tgt = lmv_get_target(lmv, mds); + tgt = lmv_get_target(lmv, mds, NULL); if (IS_ERR(tgt)) return PTR_ERR(tgt); @@ -1221,7 +1219,7 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) /* * Asking underlaying tgt layer to allocate new fid. */ - rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL); + rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); if (rc > 0) { LASSERT(fid_is_sane(fid)); rc = 0; @@ -1232,8 +1230,8 @@ out: return rc; } -int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, - struct md_op_data *op_data) +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) { struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; @@ -1278,10 +1276,10 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) return -EINVAL; } - lmv->tgts = kcalloc(32, sizeof(*lmv->tgts), GFP_NOFS); + lmv->tgts_size = 32U; + lmv->tgts = kcalloc(lmv->tgts_size, sizeof(*lmv->tgts), GFP_NOFS); if (!lmv->tgts) return -ENOMEM; - lmv->tgts_size = 32; obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); lmv->desc.ld_tgt_count = 0; @@ -1354,7 +1352,7 @@ static int lmv_process_config(struct obd_device *obd, u32 len, void *buf) obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) { + if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) { rc = -EINVAL; goto out; } @@ -1380,7 +1378,7 @@ static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, struct lmv_obd *lmv = &obd->u.lmv; struct obd_statfs *temp; int rc = 0; - int i; + u32 i; rc = lmv_check_connect(obd); if (rc) @@ -1522,7 +1520,7 @@ static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; - int i; + u32 i; int rc; rc = lmv_check_connect(obd); @@ -1545,36 +1543,6 @@ static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) return 0; } -static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, - ldlm_iterator_t it, void *data) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - int i; - int rc; - - rc = lmv_check_connect(obd); - if (rc) - return rc; - - CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); - - /* - * With DNE every object can have two locks in different namespaces: - * lookup lock in space of MDT storing direntry and update/open lock in - * space of MDT storing inode. - */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp) - continue; - rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data); - if (rc) - return rc; - } - - return rc; -} - static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, struct md_open_data *mod, struct ptlrpc_request **request) { @@ -1596,25 +1564,116 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, return rc; } -struct lmv_tgt_desc -*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid) +/** + * Choosing the MDT by name or FID in @op_data. + * For non-striped directory, it will locate MDT by fid. + * For striped-directory, it will locate MDT by name. And also + * it will reset op_fid1 with the FID of the chosen stripe. + **/ +static struct lmv_tgt_desc * +lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + u32 *mds) +{ + const struct lmv_oinfo *oinfo; + struct lmv_tgt_desc *tgt; + + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { + if (cfs_fail_val >= lsm->lsm_md_stripe_count) + return ERR_PTR(-EBADF); + oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; + } else { + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + if (IS_ERR(oinfo)) + return ERR_CAST(oinfo); + } + + if (fid) + *fid = oinfo->lmo_fid; + if (mds) + *mds = oinfo->lmo_mds; + + tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + + CDEBUG(D_INFO, "locate on mds %u " DFID "\n", oinfo->lmo_mds, + PFID(&oinfo->lmo_fid)); + return tgt; +} + +/** + * Locate mds by fid or name + * + * For striped directory (lsm != NULL), it will locate the stripe + * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type + * is unknown, it will return -EBADFD, and lmv_intent_lookup might need + * walk through all of stripes to locate the entry. + * + * For normal direcotry, it will locate MDS by FID directly. + * \param[in] lmv LMV device + * \param[in] op_data client MD stack parameters, name, namelen + * mds_num etc. + * \param[in] fid object FID used to locate MDS. + * + * retval pointer to the lmv_tgt_desc if succeed. + * ERR_PTR(errno) if failed. + */ +struct lmv_tgt_desc* +lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid) { + struct lmv_stripe_md *lsm = op_data->op_mea1; struct lmv_tgt_desc *tgt; - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) + /* + * During creating VOLATILE file, it should honor the mdt + * index if the file under striped dir is being restored, see + * ct_restore(). + */ + if (op_data->op_bias & MDS_CREATE_VOLATILE && + (int)op_data->op_mds != -1 && lsm) { + int i; + + tgt = lmv_get_target(lmv, op_data->op_mds, NULL); + if (IS_ERR(tgt)) + return tgt; + + /* refill the right parent fid */ + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[i]; + if (oinfo->lmo_mds == op_data->op_mds) { + *fid = oinfo->lmo_fid; + break; + } + } + + /* Hmm, can not find the stripe by mdt_index(op_mds) */ + if (i == lsm->lsm_md_stripe_count) + tgt = ERR_PTR(-EINVAL); + return tgt; + } - op_data->op_mds = tgt->ltd_idx; + if (!lsm || !op_data->op_namelen) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; - return tgt; + op_data->op_mds = tgt->ltd_idx; + + return tgt; + } + + return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, + op_data->op_namelen, fid, + &op_data->op_mds); } static int lmv_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, int datalen, int mode, __u32 uid, - __u32 gid, cfs_cap_t cap_effective, __u64 rdev, - struct ptlrpc_request **request) + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, + __u64 rdev, struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1632,13 +1691,30 @@ static int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(tgt)) return PTR_ERR(tgt); - rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), op_data->op_mds); + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc) return rc; - CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n", - op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), - op_data->op_mds); + if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { + /* + * Send the create request to the MDT where the object + * will be located + */ + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + op_data->op_mds = tgt->ltd_idx; + } else { + CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); + } + + CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", + PFID(&op_data->op_fid1), op_data->op_mds); op_data->op_flags |= MF_MDC_CANCEL_FID1; rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, @@ -1674,70 +1750,10 @@ static int lmv_done_writing(struct obd_export *exp, } static int -lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo, - struct lookup_intent *it, struct md_op_data *op_data, - struct lustre_handle *lockh, void *lmm, int lmmsize, - __u64 extra_lock_flags) -{ - struct ptlrpc_request *req = it->it_request; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lustre_handle plock; - struct lmv_tgt_desc *tgt; - struct md_op_data *rdata; - struct lu_fid fid1; - struct mdt_body *body; - int rc = 0; - int pmode; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - if (!(body->valid & OBD_MD_MDS)) - return 0; - - CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n", - LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1)); - - /* - * We got LOOKUP lock, but we really need attrs. - */ - pmode = it->it_lock_mode; - LASSERT(pmode != 0); - memcpy(&plock, lockh, sizeof(plock)); - it->it_lock_mode = 0; - it->it_request = NULL; - fid1 = body->fid1; - - ptlrpc_req_finished(req); - - tgt = lmv_find_target(lmv, &fid1); - if (IS_ERR(tgt)) { - rc = PTR_ERR(tgt); - goto out; - } - - rdata = kzalloc(sizeof(*rdata), GFP_NOFS); - if (!rdata) { - rc = -ENOMEM; - goto out; - } - - rdata->op_fid1 = fid1; - rdata->op_bias = MDS_CROSS_REF; - - rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh, - lmm, lmmsize, NULL, extra_lock_flags); - kfree(rdata); -out: - ldlm_lock_decref(&plock, pmode); - return rc; -} - -static int lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const ldlm_policy_data_t *policy, struct lookup_intent *it, struct md_op_data *op_data, - struct lustre_handle *lockh, void *lmm, int lmmsize, - struct ptlrpc_request **req, __u64 extra_lock_flags) + struct lustre_handle *lockh, __u64 extra_lock_flags) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -1755,22 +1771,18 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, if (IS_ERR(tgt)) return PTR_ERR(tgt); - CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n", + CDEBUG(D_INODE, "ENQUEUE '%s' on " DFID " -> mds #%u\n", LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx); - rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh, - lmm, lmmsize, req, extra_lock_flags); + rc = md_enqueue(tgt->ltd_exp, einfo, policy, it, op_data, lockh, + extra_lock_flags); - if (rc == 0 && it && it->it_op == IT_OPEN) { - rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh, - lmm, lmmsize, extra_lock_flags); - } return rc; } static int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) + struct ptlrpc_request **preq) { struct ptlrpc_request *req = NULL; struct obd_device *obd = exp->exp_obd; @@ -1787,26 +1799,25 @@ lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(tgt)) return PTR_ERR(tgt); - CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", - op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), - tgt->ltd_idx); + CDEBUG(D_INODE, "GETATTR_NAME for %*s on " DFID " -> mds #%u\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), tgt->ltd_idx); - rc = md_getattr_name(tgt->ltd_exp, op_data, request); + rc = md_getattr_name(tgt->ltd_exp, op_data, preq); if (rc != 0) return rc; - body = req_capsule_server_get(&(*request)->rq_pill, - &RMF_MDT_BODY); - - if (body->valid & OBD_MD_MDS) { - struct lu_fid rid = body->fid1; + body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); + if (body->mbo_valid & OBD_MD_MDS) { + struct lu_fid rid = body->mbo_fid1; CDEBUG(D_INODE, "Request attrs for "DFID"\n", PFID(&rid)); tgt = lmv_find_target(lmv, &rid); if (IS_ERR(tgt)) { - ptlrpc_req_finished(*request); + ptlrpc_req_finished(*preq); + *preq = NULL; return PTR_ERR(tgt); } @@ -1815,8 +1826,8 @@ lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, op_data->op_namelen = 0; op_data->op_name = NULL; rc = md_getattr_name(tgt->ltd_exp, op_data, &req); - ptlrpc_req_finished(*request); - *request = req; + ptlrpc_req_finished(*preq); + *preq = req; } return rc; @@ -1829,23 +1840,24 @@ lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ NULL) -static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data, - int op_tgt, enum ldlm_mode mode, int bits, - int flag) +static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, + struct md_op_data *op_data, int op_tgt, + enum ldlm_mode mode, int bits, int flag) { struct lu_fid *fid = md_op_data_fid(op_data, flag); struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; ldlm_policy_data_t policy = { {0} }; int rc = 0; if (!fid_is_sane(fid)) return 0; - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); + if (!tgt) { + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + } if (tgt->ltd_idx != op_tgt) { CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); @@ -1882,12 +1894,24 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, LASSERT(op_data->op_namelen != 0); CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", - PFID(&op_data->op_fid2), op_data->op_namelen, + PFID(&op_data->op_fid2), (int)op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1)); op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); + if (op_data->op_mea2) { + struct lmv_stripe_md *lsm = op_data->op_mea2; + const struct lmv_oinfo *oinfo; + + oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, + op_data->op_namelen); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + op_data->op_fid2 = oinfo->lmo_fid; + } + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); if (IS_ERR(tgt)) return PTR_ERR(tgt); @@ -1896,7 +1920,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, * Cancel UPDATE lock on child (fid1). */ op_data->op_flags |= MF_MDC_CANCEL_FID2; - rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc != 0) return rc; @@ -1907,20 +1931,22 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, } static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, int newlen, + const char *old, size_t oldlen, + const char *new, size_t newlen, struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *src_tgt; - struct lmv_tgt_desc *tgt_tgt; int rc; LASSERT(oldlen != 0); - CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n", - oldlen, old, PFID(&op_data->op_fid1), - newlen, new, PFID(&op_data->op_fid2)); + CDEBUG(D_INODE, "RENAME %.*s in "DFID":%d to %.*s in "DFID":%d\n", + (int)oldlen, old, PFID(&op_data->op_fid1), + op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, + (int)newlen, new, PFID(&op_data->op_fid2), + op_data->op_mea2 ? op_data->op_mea2->lsm_md_stripe_count : 0); rc = lmv_check_connect(obd); if (rc) @@ -1929,13 +1955,60 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + + if (op_data->op_cli_flags & CLI_MIGRATE) { + LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID "DFID"\n", + PFID(&op_data->op_fid3)); + + if (op_data->op_mea1) { + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct lmv_tgt_desc *tmp; + + /* Fix the parent fid for striped dir */ + tmp = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + NULL); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + } + + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + return rc; + src_tgt = lmv_find_target(lmv, &op_data->op_fid3); + } else { + if (op_data->op_mea1) { + struct lmv_stripe_md *lsm = op_data->op_mea1; + + src_tgt = lmv_locate_target_for_name(lmv, lsm, old, + oldlen, + &op_data->op_fid1, + &op_data->op_mds); + if (IS_ERR(src_tgt)) + return PTR_ERR(src_tgt); + } else { + src_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(src_tgt)) + return PTR_ERR(src_tgt); + + op_data->op_mds = src_tgt->ltd_idx; + } + + if (op_data->op_mea2) { + struct lmv_stripe_md *lsm = op_data->op_mea2; + const struct lmv_oinfo *oinfo; + + oinfo = lsm_name_to_stripe_info(lsm, new, newlen); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + op_data->op_fid2 = oinfo->lmo_fid; + } + } if (IS_ERR(src_tgt)) return PTR_ERR(src_tgt); - tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); - if (IS_ERR(tgt_tgt)) - return PTR_ERR(tgt_tgt); /* * LOOKUP lock on src child (fid3) should also be cancelled for * src_tgt in mdc_rename. @@ -1946,35 +2019,53 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its * own target. */ - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); - + if (rc) + return rc; /* - * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt. + * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. */ - if (rc == 0) { - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + if (fid_is_sane(&op_data->op_fid3)) { + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + /* Cancel LOOKUP lock on its parent */ + rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID4); + MF_MDC_CANCEL_FID3); + if (rc) + return rc; + + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID3); + if (rc) + return rc; } /* * Cancel all the locks on tgt child (fid4). */ - if (rc == 0) - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + if (fid_is_sane(&op_data->op_fid4)) + rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, LCK_EX, MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID4); - if (rc == 0) - rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen, - new, newlen, request); + CDEBUG(D_INODE, DFID":m%d to "DFID"\n", PFID(&op_data->op_fid1), + op_data->op_mds, PFID(&op_data->op_fid2)); + + rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen, + new, newlen, request); return rc; } static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len, + void *ea, size_t ealen, void *ea2, size_t ea2len, struct ptlrpc_request **request, struct md_open_data **mod) { @@ -2021,169 +2112,419 @@ static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid, return rc; } -/* - * Adjust a set of pages, each page containing an array of lu_dirpages, - * so that each page can be used as a single logical lu_dirpage. +/** + * Get current minimum entry from striped directory * - * A lu_dirpage is laid out as follows, where s = ldp_hash_start, - * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a - * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end - * value is used as a cookie to request the next lu_dirpage in a - * directory listing that spans multiple pages (two in this example): - * ________ - * | | - * .|--------v------- -----. - * |s|e|f|p|ent|ent| ... |ent| - * '--|-------------- -----' Each CFS_PAGE contains a single - * '------. lu_dirpage. - * .---------v------- -----. - * |s|e|f|p|ent| 0 | ... | 0 | - * '----------------- -----' + * This function will search the dir entry, whose hash value is the + * closest(>=) to @hash_offset, from all of sub-stripes, and it is + * only being called for striped directory. * - * However, on hosts where the native VM page size (PAGE_SIZE) is - * larger than LU_PAGE_SIZE, a single host page may contain multiple - * lu_dirpages. After reading the lu_dirpages from the MDS, the - * ldp_hash_end of the first lu_dirpage refers to the one immediately - * after it in the same CFS_PAGE (arrows simplified for brevity, but - * in general e0==s1, e1==s2, etc.): + * \param[in] exp export of LMV + * \param[in] op_data parameters transferred beween client MD stack + * stripe_information will be included in this + * parameter + * \param[in] cb_op ldlm callback being used in enqueue in + * mdc_read_page + * \param[in] hash_offset the hash value, which is used to locate + * minum(closet) dir entry + * \param[in|out] stripe_offset the caller use this to indicate the stripe + * index of last entry, so to avoid hash conflict + * between stripes. It will also be used to + * return the stripe index of current dir entry. + * \param[in|out] entp the minum entry and it also is being used + * to input the last dir entry to resolve the + * hash conflict * - * .-------------------- -----. - * |s0|e0|f0|p|ent|ent| ... |ent| - * |---v---------------- -----| - * |s1|e1|f1|p|ent|ent| ... |ent| - * |---v---------------- -----| Here, each CFS_PAGE contains - * ... multiple lu_dirpages. - * |---v---------------- -----| - * |s'|e'|f'|p|ent|ent| ... |ent| - * '---|---------------- -----' - * v - * .----------------------------. - * | next CFS_PAGE | + * \param[out] ppage the page which holds the minum entry * - * This structure is transformed into a single logical lu_dirpage as follows: + * \retval = 0 get the entry successfully + * negative errno (< 0) does not get the entry + */ +static int lmv_get_min_striped_entry(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 hash_offset, int *stripe_offset, + struct lu_dirent **entp, + struct page **ppage) +{ + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lu_dirent *min_ent = NULL; + struct page *min_page = NULL; + struct lmv_tgt_desc *tgt; + int stripe_count; + int min_idx = 0; + int rc = 0; + int i; + + stripe_count = lsm->lsm_md_stripe_count; + for (i = 0; i < stripe_count; i++) { + __u64 stripe_hash = hash_offset; + struct lu_dirent *ent = NULL; + struct page *page = NULL; + struct lu_dirpage *dp; + + tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL); + if (IS_ERR(tgt)) { + rc = PTR_ERR(tgt); + goto out; + } + + /* + * op_data will be shared by each stripe, so we need + * reset these value for each stripe + */ + op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid; + op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root; +next: + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash, + &page); + if (rc) + goto out; + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent; + ent = lu_dirent_next(ent)) { + /* Skip dummy entry */ + if (!le16_to_cpu(ent->lde_namelen)) + continue; + + if (le64_to_cpu(ent->lde_hash) < hash_offset) + continue; + + if (le64_to_cpu(ent->lde_hash) == hash_offset && + (*entp == ent || i < *stripe_offset)) + continue; + + /* skip . and .. for other stripes */ + if (i && (!strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) || + !strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)))) + continue; + break; + } + + if (!ent) { + stripe_hash = le64_to_cpu(dp->ldp_hash_end); + + kunmap(page); + put_page(page); + page = NULL; + + /* + * reach the end of current stripe, go to next stripe + */ + if (stripe_hash == MDS_DIR_END_OFF) + continue; + else + goto next; + } + + if (min_ent) { + if (le64_to_cpu(min_ent->lde_hash) > + le64_to_cpu(ent->lde_hash)) { + min_ent = ent; + kunmap(min_page); + put_page(min_page); + min_idx = i; + min_page = page; + } else { + kunmap(page); + put_page(page); + page = NULL; + } + } else { + min_ent = ent; + min_page = page; + min_idx = i; + } + } + +out: + if (*ppage) { + kunmap(*ppage); + put_page(*ppage); + } + *stripe_offset = min_idx; + *entp = min_ent; + *ppage = min_page; + return rc; +} + +/** + * Build dir entry page from a striped directory * - * - Replace e0 with e' so the request for the next lu_dirpage gets the page - * labeled 'next CFS_PAGE'. + * This function gets one entry by @offset from a striped directory. It will + * read entries from all of stripes, and choose one closest to the required + * offset(&offset). A few notes + * 1. skip . and .. for non-zero stripes, because there can only have one . + * and .. in a directory. + * 2. op_data will be shared by all of stripes, instead of allocating new + * one, so need to restore before reusing. + * 3. release the entry page if that is not being chosen. * - * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether - * a hash collision with the next page exists. + * \param[in] exp obd export refer to LMV + * \param[in] op_data hold those MD parameters of read_entry + * \param[in] cb_op ldlm callback being used in enqueue in mdc_read_entry + * \param[out] ldp the entry being read + * \param[out] ppage the page holding the entry. Note: because the entry + * will be accessed in upper layer, so we need hold the + * page until the usages of entry is finished, see + * ll_dir_entry_next. * - * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span - * to the first entry of the next lu_dirpage. + * retval =0 if get entry successfully + * <0 cannot get entry */ -#if PAGE_SIZE > LU_PAGE_SIZE -static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs) -{ - int i; +static int lmv_read_striped_page(struct obd_export *exp, + struct md_op_data *op_data, + struct md_callback *cb_op, + __u64 offset, struct page **ppage) +{ + struct inode *master_inode = op_data->op_data; + struct lu_fid master_fid = op_data->op_fid1; + struct obd_device *obd = exp->exp_obd; + __u64 hash_offset = offset; + struct page *min_ent_page = NULL; + struct page *ent_page = NULL; + struct lu_dirent *min_ent = NULL; + struct lu_dirent *last_ent; + struct lu_dirent *ent; + struct lu_dirpage *dp; + size_t left_bytes; + int ent_idx = 0; + void *area; + int rc; - for (i = 0; i < ncfspgs; i++) { - struct lu_dirpage *dp = kmap(pages[i]); - struct lu_dirpage *first = dp; - struct lu_dirent *end_dirent = NULL; - struct lu_dirent *ent; - __u64 hash_end = dp->ldp_hash_end; - __u32 flags = dp->ldp_flags; - - while (--nlupgs > 0) { - ent = lu_dirent_start(dp); - for (end_dirent = ent; ent; - end_dirent = ent, ent = lu_dirent_next(ent)) - ; - - /* Advance dp to next lu_dirpage. */ - dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); - - /* Check if we've reached the end of the CFS_PAGE. */ - if (!((unsigned long)dp & ~PAGE_MASK)) - break; + rc = lmv_check_connect(obd); + if (rc) + return rc; - /* Save the hash and flags of this lu_dirpage. */ - hash_end = dp->ldp_hash_end; - flags = dp->ldp_flags; + /* + * Allocate a page and read entries from all of stripes and fill + * the page by hash order + */ + ent_page = alloc_page(GFP_KERNEL); + if (!ent_page) + return -ENOMEM; - /* Check if lu_dirpage contains no entries. */ - if (!end_dirent) - break; + /* Initialize the entry page */ + dp = kmap(ent_page); + memset(dp, 0, sizeof(*dp)); + dp->ldp_hash_start = cpu_to_le64(offset); + dp->ldp_flags |= LDF_COLLIDE; + + area = dp + 1; + left_bytes = PAGE_SIZE - sizeof(*dp); + ent = area; + last_ent = ent; + do { + __u16 ent_size; + + /* Find the minum entry from all sub-stripes */ + rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset, + &ent_idx, &min_ent, + &min_ent_page); + if (rc) + goto out; - /* Enlarge the end entry lde_reclen from 0 to - * first entry of next lu_dirpage. - */ - LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); - end_dirent->lde_reclen = - cpu_to_le16((char *)(dp->ldp_entries) - - (char *)end_dirent); + /* + * If it can not get minum entry, it means it already reaches + * the end of this directory + */ + if (!min_ent) { + last_ent->lde_reclen = 0; + hash_offset = MDS_DIR_END_OFF; + goto out; + } + + ent_size = le16_to_cpu(min_ent->lde_reclen); + + /* + * the last entry lde_reclen is 0, but it might not + * the end of this entry of this temporay entry + */ + if (!ent_size) + ent_size = lu_dirent_calc_size( + le16_to_cpu(min_ent->lde_namelen), + le32_to_cpu(min_ent->lde_attrs)); + if (ent_size > left_bytes) { + last_ent->lde_reclen = cpu_to_le16(0); + hash_offset = le64_to_cpu(min_ent->lde_hash); + goto out; } - first->ldp_hash_end = hash_end; - first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); - first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + memcpy(ent, min_ent, ent_size); + + /* + * Replace . with master FID and Replace .. with the parent FID + * of master object + */ + if (!strncmp(ent->lde_name, ".", + le16_to_cpu(ent->lde_namelen)) && + le16_to_cpu(ent->lde_namelen) == 1) + fid_cpu_to_le(&ent->lde_fid, &master_fid); + else if (!strncmp(ent->lde_name, "..", + le16_to_cpu(ent->lde_namelen)) && + le16_to_cpu(ent->lde_namelen) == 2) + fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); + + left_bytes -= ent_size; + ent->lde_reclen = cpu_to_le16(ent_size); + last_ent = ent; + ent = (void *)ent + ent_size; + hash_offset = le64_to_cpu(min_ent->lde_hash); + if (hash_offset == MDS_DIR_END_OFF) { + last_ent->lde_reclen = 0; + break; + } + } while (1); +out: + if (min_ent_page) { + kunmap(min_ent_page); + put_page(min_ent_page); + } - kunmap(pages[i]); + if (unlikely(rc)) { + __free_page(ent_page); + ent_page = NULL; + } else { + if (ent == area) + dp->ldp_flags |= LDF_EMPTY; + dp->ldp_flags = cpu_to_le32(dp->ldp_flags); + dp->ldp_hash_end = cpu_to_le64(hash_offset); } - LASSERTF(nlupgs == 0, "left = %d", nlupgs); + + /* + * We do not want to allocate md_op_data during each + * dir entry reading, so op_data will be shared by every stripe, + * then we need to restore it back to original value before + * return to the upper layer + */ + op_data->op_fid1 = master_fid; + op_data->op_fid2 = master_fid; + op_data->op_data = master_inode; + + *ppage = ent_page; + + return rc; } -#else -#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0) -#endif /* PAGE_SIZE > LU_PAGE_SIZE */ -static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data, - struct page **pages, struct ptlrpc_request **request) +static int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 offset, + struct page **ppage) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - __u64 offset = op_data->op_offset; - int rc; - int ncfspgs; /* pages read in PAGE_SIZE */ - int nlupgs; /* pages read in LU_PAGE_SIZE */ - struct lmv_tgt_desc *tgt; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; rc = lmv_check_connect(obd); if (rc) return rc; - CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n", - offset, PFID(&op_data->op_fid1)); + if (unlikely(lsm)) { + rc = lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); + return rc; + } tgt = lmv_find_target(lmv, &op_data->op_fid1); if (IS_ERR(tgt)) return PTR_ERR(tgt); - rc = md_readpage(tgt->ltd_exp, op_data, pages, request); - if (rc != 0) - return rc; - - ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1) - >> PAGE_SHIFT; - nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; - LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); - LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages); - - CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs, - op_data->op_npages); - - lmv_adjust_dirpages(pages, ncfspgs, nlupgs); + rc = md_read_page(tgt->ltd_exp, op_data, cb_op, offset, ppage); return rc; } +/** + * Unlink a file/directory + * + * Unlink a file or directory under the parent dir. The unlink request + * usually will be sent to the MDT where the child is located, but if + * the client does not have the child FID then request will be sent to the + * MDT where the parent is located. + * + * If the parent is a striped directory then it also needs to locate which + * stripe the name of the child is located, and replace the parent FID + * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown, + * it will walk through all of sub-stripes until the child is being + * unlinked finally. + * + * \param[in] exp export refer to LMV + * \param[in] op_data different parameters transferred beween client + * MD stacks, name, namelen, FIDs etc. + * op_fid1 is the parent FID, op_fid2 is the child + * FID. + * \param[out] request point to the request of unlink. + * + * retval 0 if succeed + * negative errno if failed. + */ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; + struct lmv_stripe_md *lsm = op_data->op_mea1; + struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *parent_tgt = NULL; struct lmv_tgt_desc *tgt = NULL; struct mdt_body *body; + int stripe_index = 0; int rc; rc = lmv_check_connect(obd); if (rc) return rc; -retry: +retry_unlink: + /* For striped dir, we need to locate the parent as well */ + if (lsm) { + struct lmv_tgt_desc *tmp; + + LASSERT(op_data->op_name && op_data->op_namelen); + + tmp = lmv_locate_target_for_name(lmv, lsm, + op_data->op_name, + op_data->op_namelen, + &op_data->op_fid1, + &op_data->op_mds); + + /* + * return -EBADFD means unknown hash type, might + * need try all sub-stripe here + */ + if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) + return PTR_ERR(tmp); + + /* + * Note: both migrating dir and unknown hash dir need to + * try all of sub-stripes, so we need start search the + * name from stripe 0, but migrating dir is already handled + * inside lmv_locate_target_for_name(), so we only check + * unknown hash type directory here + */ + if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { + struct lmv_oinfo *oinfo; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + } + } + +try_next_stripe: /* Send unlink requests to the MDT where the child is located */ if (likely(!fid_is_zero(&op_data->op_fid2))) - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + tgt = lmv_find_target(lmv, &op_data->op_fid2); + else if (lsm) + tgt = lmv_get_target(lmv, op_data->op_mds, NULL); else tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) return PTR_ERR(tgt); @@ -2203,29 +2544,57 @@ retry: /* * Cancel FULL locks on child (fid3). */ - rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(parent_tgt)) + return PTR_ERR(parent_tgt); + if (parent_tgt != tgt) { + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID3); + } + + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); if (rc != 0) return rc; - CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n", + CDEBUG(D_INODE, "unlink with fid=" DFID "/" DFID " -> mds #%u\n", PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); rc = md_unlink(tgt->ltd_exp, op_data, request); - if (rc != 0 && rc != -EREMOTE) + if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) return rc; + /* Try next stripe if it is needed. */ + if (rc == -ENOENT && lsm && lmv_need_try_all_stripes(lsm)) { + struct lmv_oinfo *oinfo; + + stripe_index++; + if (stripe_index >= lsm->lsm_md_stripe_count) + return rc; + + oinfo = &lsm->lsm_md_oinfo[stripe_index]; + + op_data->op_fid1 = oinfo->lmo_fid; + op_data->op_mds = oinfo->lmo_mds; + + ptlrpc_req_finished(*request); + *request = NULL; + + goto try_next_stripe; + } + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); if (!body) return -EPROTO; /* Not cross-ref case, just get out of here. */ - if (likely(!(body->valid & OBD_MD_MDS))) - return 0; + if (likely(!(body->mbo_valid & OBD_MD_MDS))) + return rc; CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", - exp->exp_obd->obd_name, PFID(&body->fid1)); + exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); /* This is a remote object, try remote MDT, Note: it may * try more than 1 time here, Considering following case @@ -2247,11 +2616,11 @@ retry: * In theory, it might try unlimited time here, but it should * be very rare case. */ - op_data->op_fid2 = body->fid1; + op_data->op_fid2 = body->mbo_fid1; ptlrpc_req_finished(*request); *request = NULL; - goto retry; + goto retry_unlink; } static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) @@ -2274,6 +2643,22 @@ static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) return 0; } +/** + * Get by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to get value for + * \param[in] vallen size of \a val + * \param[out] val pointer to storage location for value + * \param[in] lsm optional striping metadata of object + * + * \retval 0 on success + * \retval negative negated errno on failure + */ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, __u32 keylen, void *key, __u32 *vallen, void *val, struct lov_stripe_md *lsm) @@ -2337,6 +2722,22 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, return -EINVAL; } +/** + * Asynchronously set by key a value associated with a LMV device. + * + * Dispatch request to lower-layer devices as needed. + * + * \param[in] env execution environment for this thread + * \param[in] exp export for the LMV device + * \param[in] keylen length of key identifier + * \param[in] key identifier of key to store value for + * \param[in] vallen size of value to store + * \param[in] val pointer to data to be stored + * \param[in] set optional list of related ptlrpc requests + * + * \retval 0 on success + * \retval negative negated errno on failure + */ static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, u32 keylen, void *key, u32 vallen, void *val, struct ptlrpc_request_set *set) @@ -2354,7 +2755,8 @@ static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, } lmv = &obd->u.lmv; - if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) { + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || + KEY_IS(KEY_DEFAULT_EASIZE)) { int i, err = 0; for (i = 0; i < lmv->desc.ld_tgt_count; i++) { @@ -2375,105 +2777,247 @@ static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, return -EINVAL; } -static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, - struct lov_stripe_md *lsm) +static int lmv_pack_md_v1(const struct lmv_stripe_md *lsm, + struct lmv_mds_md_v1 *lmm1) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *meap; - struct lmv_stripe_md *lsmp; - int mea_size; - int i; + int cplen; + int i; + + lmm1->lmv_magic = cpu_to_le32(lsm->lsm_md_magic); + lmm1->lmv_stripe_count = cpu_to_le32(lsm->lsm_md_stripe_count); + lmm1->lmv_master_mdt_index = cpu_to_le32(lsm->lsm_md_master_mdt_index); + lmm1->lmv_hash_type = cpu_to_le32(lsm->lsm_md_hash_type); + cplen = strlcpy(lmm1->lmv_pool_name, lsm->lsm_md_pool_name, + sizeof(lmm1->lmv_pool_name)); + if (cplen >= sizeof(lmm1->lmv_pool_name)) + return -E2BIG; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) + fid_cpu_to_le(&lmm1->lmv_stripe_fids[i], + &lsm->lsm_md_oinfo[i].lmo_fid); + return 0; +} - mea_size = lmv_get_easize(lmv); - if (!lmmp) - return mea_size; +static int +lmv_pack_md(union lmv_mds_md **lmmp, const struct lmv_stripe_md *lsm, + int stripe_count) +{ + int lmm_size = 0, rc = 0; + bool allocated = false; + LASSERT(lmmp); + + /* Free lmm */ if (*lmmp && !lsm) { + int stripe_cnt; + + stripe_cnt = lmv_mds_md_stripe_count_get(*lmmp); + lmm_size = lmv_mds_md_size(stripe_cnt, + le32_to_cpu((*lmmp)->lmv_magic)); + if (!lmm_size) + return -EINVAL; kvfree(*lmmp); *lmmp = NULL; return 0; } + /* Alloc lmm */ + if (!*lmmp && !lsm) { + lmm_size = lmv_mds_md_size(stripe_count, LMV_MAGIC); + LASSERT(lmm_size > 0); + *lmmp = libcfs_kvzalloc(lmm_size, GFP_NOFS); + if (!*lmmp) + return -ENOMEM; + lmv_mds_md_stripe_count_set(*lmmp, stripe_count); + (*lmmp)->lmv_magic = cpu_to_le32(LMV_MAGIC); + return lmm_size; + } + + /* pack lmm */ + LASSERT(lsm); + lmm_size = lmv_mds_md_size(lsm->lsm_md_stripe_count, + lsm->lsm_md_magic); if (!*lmmp) { - *lmmp = libcfs_kvzalloc(mea_size, GFP_NOFS); + *lmmp = libcfs_kvzalloc(lmm_size, GFP_NOFS); if (!*lmmp) return -ENOMEM; + allocated = true; } - if (!lsm) - return mea_size; + switch (lsm->lsm_md_magic) { + case LMV_MAGIC_V1: + rc = lmv_pack_md_v1(lsm, &(*lmmp)->lmv_md_v1); + break; + default: + rc = -EINVAL; + break; + } - lsmp = (struct lmv_stripe_md *)lsm; - meap = (struct lmv_stripe_md *)*lmmp; + if (rc && allocated) { + kvfree(*lmmp); + *lmmp = NULL; + } - if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR && - lsmp->mea_magic != MEA_MAGIC_ALL_CHARS) - return -EINVAL; + return lmm_size; +} - meap->mea_magic = cpu_to_le32(lsmp->mea_magic); - meap->mea_count = cpu_to_le32(lsmp->mea_count); - meap->mea_master = cpu_to_le32(lsmp->mea_master); +static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, + const struct lmv_mds_md_v1 *lmm1) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int stripe_count; + int rc = 0; + int cplen; + int i; - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - meap->mea_ids[i] = lsmp->mea_ids[i]; - fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]); + lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); + if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) + lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; + else + lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); + lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); + cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, + sizeof(lsm->lsm_md_pool_name)); + + if (cplen >= sizeof(lsm->lsm_md_pool_name)) + return -E2BIG; + + CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d layout_version %d\n", + lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index, + lsm->lsm_md_hash_type, lsm->lsm_md_layout_version); + + stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); + for (i = 0; i < stripe_count; i++) { + fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, + &lmm1->lmv_stripe_fids[i]); + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, + &lsm->lsm_md_oinfo[i].lmo_mds); + if (rc) + return rc; + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, + PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); } - return mea_size; + return rc; } -static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, - struct lov_mds_md *lmm, int lmm_size) +int lmv_unpack_md(struct obd_export *exp, struct lmv_stripe_md **lsmp, + const union lmv_mds_md *lmm, int stripe_count) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp; - struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm; - struct lmv_obd *lmv = &obd->u.lmv; - int mea_size; - int i; - __u32 magic; + struct lmv_stripe_md *lsm; + bool allocated = false; + int lsm_size, rc; - mea_size = lmv_get_easize(lmv); - if (!lsmp) - return mea_size; + LASSERT(lsmp); - if (*lsmp && !lmm) { - kvfree(*tmea); + lsm = *lsmp; + /* Free memmd */ + if (lsm && !lmm) { + int i; + + for (i = 1; i < lsm->lsm_md_stripe_count; i++) { + /* + * For migrating inode, the master stripe and master + * object will be the same, so do not need iput, see + * ll_update_lsm_md + */ + if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && + !i) && lsm->lsm_md_oinfo[i].lmo_root) + iput(lsm->lsm_md_oinfo[i].lmo_root); + } + + kvfree(lsm); *lsmp = NULL; return 0; } - LASSERT(mea_size == lmm_size); + /* Alloc memmd */ + if (!lsm && !lmm) { + lsm_size = lmv_stripe_md_size(stripe_count); + lsm = libcfs_kvzalloc(lsm_size, GFP_NOFS); + if (!lsm) + return -ENOMEM; + lsm->lsm_md_stripe_count = stripe_count; + *lsmp = lsm; + return 0; + } - *tmea = libcfs_kvzalloc(mea_size, GFP_NOFS); - if (!*tmea) - return -ENOMEM; + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) + return -EPERM; - if (!lmm) - return mea_size; + /* Unpack memmd */ + if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && + le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { + CERROR("%s: invalid lmv magic %x: rc = %d\n", + exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), + -EIO); + return -EIO; + } - if (mea->mea_magic == MEA_MAGIC_LAST_CHAR || - mea->mea_magic == MEA_MAGIC_ALL_CHARS || - mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) { - magic = le32_to_cpu(mea->mea_magic); - } else { - /* - * Old mea is not handled here. + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) + lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); + else + /** + * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, + * stripecount should be 0 then. */ - CERROR("Old not supportable EA is found\n"); - LBUG(); + lsm_size = lmv_stripe_md_size(0); + + if (!lsm) { + lsm = libcfs_kvzalloc(lsm_size, GFP_NOFS); + if (!lsm) + return -ENOMEM; + allocated = true; + *lsmp = lsm; + } + + switch (le32_to_cpu(lmm->lmv_magic)) { + case LMV_MAGIC_V1: + rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); + break; + default: + CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, + le32_to_cpu(lmm->lmv_magic)); + rc = -EINVAL; + break; } - (*tmea)->mea_magic = magic; - (*tmea)->mea_count = le32_to_cpu(mea->mea_count); - (*tmea)->mea_master = le32_to_cpu(mea->mea_master); + if (rc && allocated) { + kvfree(lsm); + *lsmp = NULL; + lsm_size = rc; + } + return lsm_size; +} +EXPORT_SYMBOL(lmv_unpack_md); + +static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int disk_len) +{ + return lmv_unpack_md(exp, (struct lmv_stripe_md **)lsmp, + (union lmv_mds_md *)lmm, disk_len); +} + +static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + const struct lmv_stripe_md *lmv = (struct lmv_stripe_md *)lsm; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv_obd = &obd->u.lmv; + int stripe_count; - for (i = 0; i < (*tmea)->mea_count; i++) { - (*tmea)->mea_ids[i] = mea->mea_ids[i]; - fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]); + if (!lmmp) { + if (lsm) + stripe_count = lmv->lsm_md_stripe_count; + else + stripe_count = lmv_obd->desc.ld_tgt_count; + + return lmv_mds_md_size(stripe_count, LMV_MAGIC_V1); } - return mea_size; + + return lmv_pack_md((union lmv_mds_md **)lmmp, lmv, 0); } static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, @@ -2484,7 +3028,7 @@ static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, struct lmv_obd *lmv = &obd->u.lmv; int rc = 0; int err; - int i; + u32 i; LASSERT(fid); @@ -2502,8 +3046,9 @@ static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, return rc; } -static int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, - __u64 *bits) +static int lmv_set_lock_data(struct obd_export *exp, + const struct lustre_handle *lockh, + void *data, __u64 *bits) { struct lmv_obd *lmv = &exp->exp_obd->u.lmv; struct lmv_tgt_desc *tgt = lmv->tgts[0]; @@ -2526,24 +3071,32 @@ static enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; enum ldlm_mode rc; - int i; + int tgt; + u32 i; CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); /* - * With CMD every object can have two locks in different namespaces: - * lookup lock in space of mds storing direntry and update/open lock in - * space of mds storing inode. Thus we check all targets, not only that - * one fid was created in. + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. Try the MDT that the FID maps to first, + * since this can be easily found, and only try others if that fails. */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; + for (i = 0, tgt = lmv_find_target_index(lmv, fid); + i < lmv->desc.ld_tgt_count; + i++, tgt = (tgt + 1) % lmv->desc.ld_tgt_count) { + if (tgt < 0) { + CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", + obd->obd_name, PFID(fid), tgt); + tgt = 0; + } - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) + if (!lmv->tgts[tgt] || !lmv->tgts[tgt]->ltd_exp || + !lmv->tgts[tgt]->ltd_active) continue; - rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode, - lockh); + rc = md_lock_match(lmv->tgts[tgt]->ltd_exp, flags, fid, + type, policy, mode, lockh); if (rc) return rc; } @@ -2571,8 +3124,10 @@ static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt = lmv->tgts[0]; - if (md->mea) - obd_free_memmd(exp, (void *)&md->mea); + if (md->lmv) { + lmv_free_memmd(md->lmv); + md->lmv = NULL; + } if (!tgt || !tgt->ltd_exp) return -EINVAL; return md_free_lustre_md(tgt->ltd_exp, md); @@ -2621,7 +3176,7 @@ static int lmv_intent_getattr_async(struct obd_export *exp, if (rc) return rc; - tgt = lmv_find_target(lmv, &op_data->op_fid1); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); if (IS_ERR(tgt)) return PTR_ERR(tgt); @@ -2649,6 +3204,23 @@ static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, return rc; } +static int +lmv_get_fid_from_lsm(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid) +{ + const struct lmv_oinfo *oinfo; + + LASSERT(lsm); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen); + if (IS_ERR(oinfo)) + return PTR_ERR(oinfo); + + *fid = oinfo->lmo_fid; + + return 0; +} + /** * For lmv, only need to send request to master MDT, and the master MDT will * process with other slave MDTs. The only exception is Q_GETOQUOTA for which @@ -2660,8 +3232,9 @@ static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt = lmv->tgts[0]; - int rc = 0, i; + int rc = 0; __u64 curspace = 0, curinodes = 0; + u32 i; if (!tgt || !tgt->ltd_exp || !tgt->ltd_active || !lmv->desc.ld_tgt_count) { @@ -2704,7 +3277,8 @@ static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp, struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt; - int i, rc = 0; + int rc = 0; + u32 i; for (i = 0; i < lmv->desc.ld_tgt_count; i++) { int err; @@ -2723,6 +3297,47 @@ static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp, return rc; } +static int lmv_merge_attr(struct obd_export *exp, + const struct lmv_stripe_md *lsm, + struct cl_attr *attr, + ldlm_blocking_callback cb_blocking) +{ + int rc, i; + + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); + if (rc < 0) + return rc; + + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + + CDEBUG(D_INFO, ""DFID" size %llu, blocks %llu nlink %u, atime %lu ctime %lu, mtime %lu.\n", + PFID(&lsm->lsm_md_oinfo[i].lmo_fid), + i_size_read(inode), (unsigned long long)inode->i_blocks, + inode->i_nlink, LTIME_S(inode->i_atime), + LTIME_S(inode->i_ctime), LTIME_S(inode->i_mtime)); + + /* for slave stripe, it needs to subtract nlink for . and .. */ + if (i) + attr->cat_nlink += inode->i_nlink - 2; + else + attr->cat_nlink = inode->i_nlink; + + attr->cat_size += i_size_read(inode); + attr->cat_blocks += inode->i_blocks; + + if (attr->cat_atime < LTIME_S(inode->i_atime)) + attr->cat_atime = LTIME_S(inode->i_atime); + + if (attr->cat_ctime < LTIME_S(inode->i_ctime)) + attr->cat_ctime = LTIME_S(inode->i_ctime); + + if (attr->cat_mtime < LTIME_S(inode->i_mtime)) + attr->cat_mtime = LTIME_S(inode->i_mtime); + } + return 0; +} + static struct obd_ops lmv_obd_ops = { .owner = THIS_MODULE, .setup = lmv_setup, @@ -2746,7 +3361,6 @@ static struct obd_ops lmv_obd_ops = { static struct md_ops lmv_md_ops = { .getstatus = lmv_getstatus, .null_inode = lmv_null_inode, - .find_cbdata = lmv_find_cbdata, .close = lmv_close, .create = lmv_create, .done_writing = lmv_done_writing, @@ -2760,7 +3374,7 @@ static struct md_ops lmv_md_ops = { .setattr = lmv_setattr, .setxattr = lmv_setxattr, .sync = lmv_sync, - .readpage = lmv_readpage, + .read_page = lmv_read_page, .unlink = lmv_unlink, .init_ea_size = lmv_init_ea_size, .cancel_unused = lmv_cancel_unused, @@ -2768,10 +3382,12 @@ static struct md_ops lmv_md_ops = { .lock_match = lmv_lock_match, .get_lustre_md = lmv_get_lustre_md, .free_lustre_md = lmv_free_lustre_md, + .merge_attr = lmv_merge_attr, .set_open_replay_data = lmv_set_open_replay_data, .clear_open_replay_data = lmv_clear_open_replay_data, .intent_getattr_async = lmv_intent_getattr_async, - .revalidate_lock = lmv_revalidate_lock + .revalidate_lock = lmv_revalidate_lock, + .get_fid_from_lsm = lmv_get_fid_from_lsm, }; static int __init lmv_init(void) diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c index c29c361eb0cc..20bbdfc21d15 100644 --- a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c +++ b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c @@ -169,7 +169,7 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) if (!tgt) return 0; - seq_printf(p, "%d: %s %sACTIVE\n", + seq_printf(p, "%u: %s %sACTIVE\n", tgt->ltd_idx, tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); return 0; @@ -202,7 +202,7 @@ static struct lprocfs_vars lprocfs_lmv_obd_vars[] = { { NULL } }; -struct file_operations lmv_proc_target_fops = { +const struct file_operations lmv_proc_target_fops = { .owner = THIS_MODULE, .open = lmv_target_seq_open, .read = seq_read, diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 9740568d9521..4d2b7d303fea 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -289,8 +289,8 @@ struct lov_lock { }; struct lov_page { - struct cl_page_slice lps_cl; - int lps_invalid; + struct cl_page_slice lps_cl; + unsigned int lps_stripe; /* stripe index */ }; /* @@ -556,6 +556,8 @@ struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, struct lovsub_lock *sub); struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, const struct cl_page_slice *slice); + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); int lov_page_stripe(const struct cl_page *page); #define lov_foreach_target(lov, var) \ @@ -742,11 +744,15 @@ static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) { LASSERT(lov->lo_type == LLT_RAID0); - LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC || - lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3); + LASSERT(lov->lo_lsm->lsm_magic == LOV_MAGIC || + lov->lo_lsm->lsm_magic == LOV_MAGIC_V3); return &lov->u.raid0; } +/* lov_pack.c */ +int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, + struct lov_user_md __user *lump); + /** @} lov */ #endif diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c index b1f260d43bc7..056ae2ed88e8 100644 --- a/drivers/staging/lustre/lustre/lov/lov_dev.c +++ b/drivers/staging/lustre/lustre/lov/lov_dev.c @@ -516,6 +516,5 @@ struct lu_device_type lov_device_type = { .ldt_ops = &lov_device_type_ops, .ldt_ctx_tags = LCT_CL_THREAD }; -EXPORT_SYMBOL(lov_device_type); /** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c index 5053dead17bb..214c561767e0 100644 --- a/drivers/staging/lustre/lustre/lov/lov_ea.c +++ b/drivers/staging/lustre/lustre/lov/lov_ea.c @@ -66,7 +66,8 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, } if (lmm->lmm_stripe_size == 0 || - (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + (le32_to_cpu(lmm->lmm_stripe_size) & + (LOV_MIN_STRIPE_SIZE - 1)) != 0) { CERROR("bad stripe size %u\n", le32_to_cpu(lmm->lmm_stripe_size)); lov_dump_lmm_common(D_WARNING, lmm); @@ -146,21 +147,15 @@ lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno, *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; } -static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa, - struct obd_export *md_exp) -{ - return 0; -} - /* Find minimum stripe maxbytes value. For inactive or - * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. + * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. */ static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes) { struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import; if (!imp || !tgt->ltd_active) { - *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + *stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; return; } @@ -171,7 +166,7 @@ static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes) if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes) *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes; } else { - *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + *stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; } spin_unlock(&imp->imp_lock); } @@ -245,7 +240,6 @@ static int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, const struct lsm_operations lsm_v1_ops = { .lsm_free = lsm_free_plain, - .lsm_destroy = lsm_destroy_plain, .lsm_stripe_by_index = lsm_stripe_by_index_plain, .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, .lsm_lmm_verify = lsm_lmm_verify_v1, @@ -335,7 +329,6 @@ static int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, const struct lsm_operations lsm_v3_ops = { .lsm_free = lsm_free_plain, - .lsm_destroy = lsm_destroy_plain, .lsm_stripe_by_index = lsm_stripe_by_index_plain, .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, .lsm_lmm_verify = lsm_lmm_verify_v3, diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h index 12bd511e8988..07e5ede3e952 100644 --- a/drivers/staging/lustre/lustre/lov/lov_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -134,8 +134,6 @@ static inline void lov_put_reqset(struct lov_request_set *set) /* lov_merge.c */ void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, struct lov_stripe_md *lsm, int stripeno, int *set); -int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, - u64 size, int shrink); int lov_merge_lvb_kms(struct lov_stripe_md *lsm, struct ost_lvb *lvb, __u64 *kms_place); @@ -157,11 +155,6 @@ int lov_update_common_set(struct lov_request_set *set, int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, struct lov_request_set **reqset); int lov_fini_getattr_set(struct lov_request_set *set); -int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, - struct obdo *src_oa, struct lov_stripe_md *lsm, - struct obd_trans_info *oti, - struct lov_request_set **reqset); -int lov_fini_destroy_set(struct lov_request_set *set); int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti, struct lov_request_set **reqset); @@ -197,8 +190,6 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm, struct lov_stripe_md *lsm); int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, struct lov_mds_md *lmm, int lmm_bytes); -int lov_getstripe(struct obd_export *exp, - struct lov_stripe_md *lsm, struct lov_user_md __user *lump); int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, int pattern, int magic); int lov_free_memmd(struct lov_stripe_md **lsmp); diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c index 84032a510254..d10157985ed9 100644 --- a/drivers/staging/lustre/lustre/lov/lov_io.c +++ b/drivers/staging/lustre/lustre/lov/lov_io.c @@ -87,6 +87,9 @@ static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, case CIT_SETATTR: { io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_stripe_index = stripe; + io->u.ci_setattr.sa_parent_fid = + parent->u.ci_setattr.sa_parent_fid; if (cl_io_is_trunc(io)) { loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; @@ -244,14 +247,12 @@ void lov_sub_put(struct lov_io_sub *sub) int lov_page_stripe(const struct cl_page *page) { - struct lovsub_object *subobj; const struct cl_page_slice *slice; - slice = cl_page_at(page, &lovsub_device_type); + slice = cl_page_at(page, &lov_device_type); LASSERT(slice->cpl_obj); - subobj = cl2lovsub(slice->cpl_obj); - return subobj->lso_index; + return cl2lov_page(slice)->lps_stripe; } struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, @@ -298,8 +299,8 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, return result; } -static void lov_io_slice_init(struct lov_io *lio, - struct lov_object *obj, struct cl_io *io) +static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, + struct cl_io *io) { io->ci_result = 0; lio->lis_object = obj; @@ -314,6 +315,15 @@ static void lov_io_slice_init(struct lov_io *lio, lio->lis_io_endpos = lio->lis_endpos; if (cl_io_is_append(io)) { LASSERT(io->ci_type == CIT_WRITE); + + /* + * If there is LOV EA hole, then we may cannot locate + * the current file-tail exactly. + */ + if (unlikely(obj->lo_lsm->lsm_pattern & + LOV_PATTERN_F_HOLE)) + return -EIO; + lio->lis_pos = 0; lio->lis_endpos = OBD_OBJECT_EOF; } @@ -349,6 +359,7 @@ static void lov_io_slice_init(struct lov_io *lio, default: LBUG(); } + return 0; } static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) @@ -870,7 +881,7 @@ int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, struct lov_object *lov = cl2lov(obj); INIT_LIST_HEAD(&lio->lis_active); - lov_io_slice_init(lio, lov, io); + io->ci_result = lov_io_slice_init(lio, lov, io); if (io->ci_result == 0) { io->ci_result = lov_io_subio_init(env, lio, io); if (io->ci_result == 0) { diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c index b9c90865fdfc..674af106b50b 100644 --- a/drivers/staging/lustre/lustre/lov/lov_merge.c +++ b/drivers/staging/lustre/lustre/lov/lov_merge.c @@ -105,45 +105,6 @@ int lov_merge_lvb_kms(struct lov_stripe_md *lsm, return rc; } -/* Must be called under the lov_stripe_lock() */ -int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, - u64 size, int shrink) -{ - struct lov_oinfo *loi; - int stripe = 0; - __u64 kms; - - assert_spin_locked(&lsm->lsm_lock); - LASSERT(lsm->lsm_lock_owner == current_pid()); - - if (shrink) { - for (; stripe < lsm->lsm_stripe_count; stripe++) { - struct lov_oinfo *loi = lsm->lsm_oinfo[stripe]; - - kms = lov_size_to_stripe(lsm, size, stripe); - CDEBUG(D_INODE, - "stripe %d KMS %sing %llu->%llu\n", - stripe, kms > loi->loi_kms ? "increase":"shrink", - loi->loi_kms, kms); - loi->loi_lvb.lvb_size = kms; - loi_kms_set(loi, loi->loi_lvb.lvb_size); - } - return 0; - } - - if (size > 0) - stripe = lov_stripe_number(lsm, size - 1); - kms = lov_size_to_stripe(lsm, size, stripe); - loi = lsm->lsm_oinfo[stripe]; - - CDEBUG(D_INODE, "stripe %d KMS %sincreasing %llu->%llu\n", - stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms); - if (kms > loi->loi_kms) - loi_kms_set(loi, kms); - - return 0; -} - void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid, struct lov_stripe_md *lsm, int stripeno, int *set) { diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c index 9b92d5522edb..b23016f7ec26 100644 --- a/drivers/staging/lustre/lustre/lov/lov_obd.c +++ b/drivers/staging/lustre/lustre/lov/lov_obd.c @@ -41,6 +41,7 @@ #include "../../include/linux/libcfs/libcfs.h" #include "../include/obd_support.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_lib.h" #include "../include/lustre_net.h" #include "../include/lustre/lustre_idl.h" @@ -940,7 +941,7 @@ int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, } case LCFG_PARAM: { struct lprocfs_static_vars lvars = { NULL }; - struct lov_desc *desc = &(obd->u.lov.desc); + struct lov_desc *desc = &obd->u.lov.desc; if (!desc) { rc = -EINVAL; @@ -971,92 +972,6 @@ out: return rc; } -static int lov_recreate(struct obd_export *exp, struct obdo *src_oa, - struct lov_stripe_md **ea, struct obd_trans_info *oti) -{ - struct lov_stripe_md *obj_mdp, *lsm; - struct lov_obd *lov = &exp->exp_obd->u.lov; - unsigned ost_idx; - int rc, i; - - LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS && - src_oa->o_flags & OBD_FL_RECREATE_OBJS); - - obj_mdp = kzalloc(sizeof(*obj_mdp), GFP_NOFS); - if (!obj_mdp) - return -ENOMEM; - - ost_idx = src_oa->o_nlink; - lsm = *ea; - if (!lsm) { - rc = -EINVAL; - goto out; - } - if (ost_idx >= lov->desc.ld_tgt_count || - !lov->lov_tgts[ost_idx]) { - rc = -EINVAL; - goto out; - } - - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_oinfo *loi = lsm->lsm_oinfo[i]; - - if (lov_oinfo_is_dummy(loi)) - continue; - - if (loi->loi_ost_idx == ost_idx) { - if (ostid_id(&loi->loi_oi) != ostid_id(&src_oa->o_oi)) { - rc = -EINVAL; - goto out; - } - break; - } - } - if (i == lsm->lsm_stripe_count) { - rc = -EINVAL; - goto out; - } - - rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp, - src_oa, &obj_mdp, oti); -out: - kfree(obj_mdp); - return rc; -} - -/* the LOV expects oa->o_id to be set to the LOV object id */ -static int lov_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *src_oa, struct lov_stripe_md **ea, - struct obd_trans_info *oti) -{ - struct lov_obd *lov; - int rc = 0; - - LASSERT(ea); - if (!exp) - return -EINVAL; - - if ((src_oa->o_valid & OBD_MD_FLFLAGS) && - src_oa->o_flags == OBD_FL_DELORPHAN) { - /* should be used with LOV anymore */ - LBUG(); - } - - lov = &exp->exp_obd->u.lov; - if (!lov->desc.ld_active_tgt_count) - return -EIO; - - obd_getref(exp->exp_obd); - /* Recreate a specific object id at the given OST index */ - if ((src_oa->o_valid & OBD_MD_FLFLAGS) && - (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) { - rc = lov_recreate(exp, src_oa, ea, oti); - } - - obd_putref(exp->exp_obd); - return rc; -} - #define ASSERT_LSM_MAGIC(lsmp) \ do { \ LASSERT((lsmp)); \ @@ -1065,59 +980,6 @@ do { \ "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic); \ } while (0) -static int lov_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *lsm, - struct obd_trans_info *oti, struct obd_export *md_exp) -{ - struct lov_request_set *set; - struct obd_info oinfo; - struct lov_request *req; - struct lov_obd *lov; - int rc = 0, err = 0; - - ASSERT_LSM_MAGIC(lsm); - - if (!exp || !exp->exp_obd) - return -ENODEV; - - if (oa->o_valid & OBD_MD_FLCOOKIE) { - LASSERT(oti); - LASSERT(oti->oti_logcookies); - } - - lov = &exp->exp_obd->u.lov; - obd_getref(exp->exp_obd); - rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set); - if (rc) - goto out; - - list_for_each_entry(req, &set->set_list, rq_link) { - if (oa->o_valid & OBD_MD_FLCOOKIE) - oti->oti_logcookies = set->set_cookies + req->rq_stripe; - - err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp, - req->rq_oi.oi_oa, NULL, oti, NULL); - err = lov_update_common_set(set, req, err); - if (err) { - CERROR("%s: destroying objid "DOSTID" subobj " - DOSTID" on OST idx %d: rc = %d\n", - exp->exp_obd->obd_name, POSTID(&oa->o_oi), - POSTID(&req->rq_oi.oi_oa->o_oi), - req->rq_idx, err); - if (!rc) - rc = err; - } - } - - if (rc == 0) - rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp); - - err = lov_fini_destroy_set(set); -out: - obd_putref(exp->exp_obd); - return rc ? rc : err; -} - static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) { @@ -1267,46 +1129,6 @@ static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, return 0; } -/* find any ldlm lock of the inode in lov - * return 0 not find - * 1 find one - * < 0 error - */ -static int lov_find_cbdata(struct obd_export *exp, - struct lov_stripe_md *lsm, ldlm_iterator_t it, - void *data) -{ - struct lov_obd *lov; - int rc = 0, i; - - ASSERT_LSM_MAGIC(lsm); - - if (!exp || !exp->exp_obd) - return -ENODEV; - - lov = &exp->exp_obd->u.lov; - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_stripe_md submd; - struct lov_oinfo *loi = lsm->lsm_oinfo[i]; - - if (lov_oinfo_is_dummy(loi)) - continue; - - if (!lov->lov_tgts[loi->loi_ost_idx]) { - CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx); - continue; - } - - submd.lsm_oi = loi->loi_oi; - submd.lsm_stripe_count = 0; - rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, - &submd, it, data); - if (rc != 0) - return rc; - } - return rc; -} - int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) { struct lov_request_set *lovset = (struct lov_request_set *)data; @@ -1460,7 +1282,7 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, } desc = (struct lov_desc *)data->ioc_inlbuf1; - memcpy(desc, &(lov->desc), sizeof(*desc)); + memcpy(desc, &lov->desc, sizeof(*desc)); uuidp = (struct obd_uuid *)data->ioc_inlbuf2; genp = (__u32 *)data->ioc_inlbuf3; @@ -1477,9 +1299,6 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, obd_ioctl_freedata(buf, len); break; } - case LL_IOC_LOV_GETSTRIPE: - rc = lov_getstripe(exp, karg, uarg); - break; case OBD_IOC_QUOTACTL: { struct if_quotactl *qctl = karg; struct lov_tgt_desc *tgt = NULL; @@ -1726,6 +1545,8 @@ static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key, u64 fm_start, fm_end, fm_length, fm_end_offset; u64 curr_loc; int current_extent = 0, rc = 0, i; + /* Whether have we collected enough extents */ + bool enough = false; int ost_eof = 0; /* EOF for object */ int ost_done = 0; /* done with required mapping for this OST? */ int last_stripe; @@ -1860,7 +1681,7 @@ static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key, lun_start += len_mapped_single_call; fm_local->fm_length = req_fm_len - len_mapped_single_call; req_fm_len = fm_local->fm_length; - fm_local->fm_extent_count = count_local; + fm_local->fm_extent_count = enough ? 1 : count_local; fm_local->fm_mapped_extents = 0; fm_local->fm_flags = fiemap->fm_flags; @@ -1908,6 +1729,12 @@ inactive_tgt: goto finish; } break; + } else if (enough) { + /* + * We've collected enough extents and there are + * more extents after it. + */ + goto finish; } /* If we just need num of extents then go to next device */ @@ -1916,8 +1743,9 @@ inactive_tgt: break; } - len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical - - lun_start + lcl_fm_ext[ext_count - 1].fe_length; + len_mapped_single_call = + lcl_fm_ext[ext_count - 1].fe_logical - + lun_start + lcl_fm_ext[ext_count - 1].fe_length; /* Have we finished mapping on this device? */ if (req_fm_len <= len_mapped_single_call) @@ -1926,14 +1754,15 @@ inactive_tgt: /* Clear the EXTENT_LAST flag which can be present on * last extent */ - if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST) + if (lcl_fm_ext[ext_count - 1].fe_flags & + FIEMAP_EXTENT_LAST) lcl_fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST; curr_loc = lov_stripe_size(lsm, - lcl_fm_ext[ext_count - 1].fe_logical+ - lcl_fm_ext[ext_count - 1].fe_length, - cur_stripe); + lcl_fm_ext[ext_count - 1].fe_logical + + lcl_fm_ext[ext_count - 1].fe_length, + cur_stripe); if (curr_loc >= fm_key->oa.o_size) ost_eof = 1; @@ -1945,7 +1774,7 @@ inactive_tgt: /* Ran out of available extents? */ if (current_extent >= fiemap->fm_extent_count) - goto finish; + enough = true; } while (ost_done == 0 && ost_eof == 0); if (cur_stripe_wrap == last_stripe) @@ -1985,73 +1814,14 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; - int i, rc; + int rc; if (!vallen || !val) return -EFAULT; obd_getref(obddev); - if (KEY_IS(KEY_LOCK_TO_STRIPE)) { - struct { - char name[16]; - struct ldlm_lock *lock; - } *data = key; - struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name; - struct lov_oinfo *loi; - __u32 *stripe = val; - - if (*vallen < sizeof(*stripe)) { - rc = -EFAULT; - goto out; - } - *vallen = sizeof(*stripe); - - /* XXX This is another one of those bits that will need to - * change if we ever actually support nested LOVs. It uses - * the lock's export to find out which stripe it is. - */ - /* XXX - it's assumed all the locks for deleted OSTs have - * been cancelled. Also, the export for deleted OSTs will - * be NULL and won't match the lock's export. - */ - for (i = 0; i < lsm->lsm_stripe_count; i++) { - loi = lsm->lsm_oinfo[i]; - if (lov_oinfo_is_dummy(loi)) - continue; - - if (!lov->lov_tgts[loi->loi_ost_idx]) - continue; - if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp == - data->lock->l_conn_export && - ostid_res_name_eq(&loi->loi_oi, res_id)) { - *stripe = i; - rc = 0; - goto out; - } - } - LDLM_ERROR(data->lock, "lock on inode without such object"); - dump_lsm(D_ERROR, lsm); - rc = -ENXIO; - goto out; - } else if (KEY_IS(KEY_LAST_ID)) { - struct obd_id_info *info = val; - __u32 size = sizeof(u64); - struct lov_tgt_desc *tgt; - - LASSERT(*vallen == sizeof(struct obd_id_info)); - tgt = lov->lov_tgts[info->idx]; - - if (!tgt || !tgt->ltd_active) { - rc = -ESRCH; - goto out; - } - - rc = obd_get_info(env, tgt->ltd_exp, keylen, key, - &size, info->data, NULL); - rc = 0; - goto out; - } else if (KEY_IS(KEY_LOVDESC)) { + if (KEY_IS(KEY_LOVDESC)) { struct lov_desc *desc_ret = val; *desc_ret = lov->desc; @@ -2060,22 +1830,6 @@ static int lov_get_info(const struct lu_env *env, struct obd_export *exp, } else if (KEY_IS(KEY_FIEMAP)) { rc = lov_fiemap(lov, keylen, key, vallen, val, lsm); goto out; - } else if (KEY_IS(KEY_CONNECT_FLAG)) { - struct lov_tgt_desc *tgt; - __u64 ost_idx = *((__u64 *)val); - - LASSERT(*vallen == sizeof(__u64)); - LASSERT(ost_idx < lov->desc.ld_tgt_count); - tgt = lov->lov_tgts[ost_idx]; - - if (!tgt || !tgt->ltd_exp) { - rc = -ESRCH; - goto out; - } - - *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp); - rc = 0; - goto out; } else if (KEY_IS(KEY_TGT_COUNT)) { *((int *)val) = lov->desc.ld_tgt_count; rc = 0; @@ -2098,8 +1852,7 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, u32 count; int i, rc = 0, err; struct lov_tgt_desc *tgt; - unsigned int incr = 0, check_uuid = 0, do_inactive = 0, no_set = 0; - unsigned int next_id = 0, mds_con = 0; + int do_inactive = 0, no_set = 0; if (!set) { no_set = 1; @@ -2111,18 +1864,8 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, obd_getref(obddev); count = lov->desc.ld_tgt_count; - if (KEY_IS(KEY_NEXT_ID)) { - count = vallen / sizeof(struct obd_id_info); - vallen = sizeof(u64); - incr = sizeof(struct obd_id_info); - do_inactive = 1; - next_id = 1; - } else if (KEY_IS(KEY_CHECKSUM)) { + if (KEY_IS(KEY_CHECKSUM)) { do_inactive = 1; - } else if (KEY_IS(KEY_EVICT_BY_NID)) { - /* use defaults: do_inactive = incr = 0; */ - } else if (KEY_IS(KEY_MDS_CONN)) { - mds_con = 1; } else if (KEY_IS(KEY_CACHE_SET)) { LASSERT(!lov->lov_cache); lov->lov_cache = val; @@ -2130,11 +1873,9 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, cl_cache_incref(lov->lov_cache); } - for (i = 0; i < count; i++, val = (char *)val + incr) { - if (next_id) - tgt = lov->lov_tgts[((struct obd_id_info *)val)->idx]; - else - tgt = lov->lov_tgts[i]; + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + /* OST was disconnected */ if (!tgt || !tgt->ltd_exp) continue; @@ -2143,34 +1884,8 @@ static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, if (!tgt->ltd_active && !do_inactive) continue; - if (mds_con) { - struct mds_group_info *mgi; - - LASSERT(vallen == sizeof(*mgi)); - mgi = (struct mds_group_info *)val; - - /* Only want a specific OSC */ - if (mgi->uuid && !obd_uuid_equals(mgi->uuid, - &tgt->ltd_uuid)) - continue; - - err = obd_set_info_async(env, tgt->ltd_exp, - keylen, key, sizeof(int), - &mgi->group, set); - } else if (next_id) { - err = obd_set_info_async(env, tgt->ltd_exp, - keylen, key, vallen, - ((struct obd_id_info *)val)->data, set); - } else { - /* Only want a specific OSC */ - if (check_uuid && - !obd_uuid_equals(val, &tgt->ltd_uuid)) - continue; - - err = obd_set_info_async(env, tgt->ltd_exp, - keylen, key, vallen, val, set); - } - + err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); if (!rc) rc = err; } @@ -2318,12 +2033,8 @@ static struct obd_ops lov_obd_ops = { .statfs_async = lov_statfs_async, .packmd = lov_packmd, .unpackmd = lov_unpackmd, - .create = lov_create, - .destroy = lov_destroy, .getattr_async = lov_getattr_async, .setattr_async = lov_setattr_async, - .adjust_kms = lov_adjust_kms, - .find_cbdata = lov_find_cbdata, .iocontrol = lov_iocontrol, .get_info = lov_get_info, .set_info_async = lov_set_info_async, diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c index f9621b0fd469..52f736338887 100644 --- a/drivers/staging/lustre/lustre/lov/lov_object.c +++ b/drivers/staging/lustre/lustre/lov/lov_object.c @@ -75,6 +75,13 @@ struct lov_layout_operations { static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); +void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm) +{ + if (lsm) + lov_free_memmd(&lsm); +} +EXPORT_SYMBOL(lov_lsm_put); + /***************************************************************************** * * Lov object layout operations. @@ -195,6 +202,10 @@ static int lov_page_slice_fixup(struct lov_object *lov, struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); struct cl_object *o; + if (!stripe) + return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - + cfs_size_round(sizeof(struct lov_page)); + cl_object_for_each(o, stripe) o->co_slice_off += hdr->coh_page_bufsize; @@ -224,6 +235,7 @@ static int lov_init_raid0(const struct lu_env *env, LASSERT(!lov->lo_lsm); lov->lo_lsm = lsm_addref(lsm); + lov->lo_layout_invalid = true; r0->lo_nr = lsm->lsm_stripe_count; LASSERT(r0->lo_nr <= lov_targets_nr(dev)); @@ -719,6 +731,10 @@ static int lov_layout_change(const struct lu_env *unused, LASSERT(atomic_read(&lov->lo_active_ios) == 0); lov->lo_type = LLT_EMPTY; + /* page bufsize fixup */ + cl_object_header(&lov->lo_cl)->coh_page_bufsize -= + lov_page_slice_fixup(lov, NULL); + result = new_ops->llo_init(env, lu2lov_dev(lov->lo_cl.co_lu.lo_dev), lov, conf, state); @@ -878,8 +894,8 @@ static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); } -static int lov_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +static int lov_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid) { /* * No dispatch is required here, as no layout implements this. @@ -895,13 +911,30 @@ int lov_lock_init(const struct lu_env *env, struct cl_object *obj, io); } +static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *lum) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_stripe_md *lsm; + int rc = 0; + + lsm = lov_lsm_addref(lov); + if (!lsm) + return -ENODATA; + + rc = lov_getstripe(cl2lov(obj), lsm, lum); + lov_lsm_put(obj, lsm); + return rc; +} + static const struct cl_object_operations lov_ops = { .coo_page_init = lov_page_init, .coo_lock_init = lov_lock_init, .coo_io_init = lov_io_init, .coo_attr_get = lov_attr_get, - .coo_attr_set = lov_attr_set, - .coo_conf_set = lov_conf_set + .coo_attr_update = lov_attr_update, + .coo_conf_set = lov_conf_set, + .coo_getstripe = lov_object_getstripe }; static const struct lu_object_operations lov_lu_obj_ops = { @@ -938,7 +971,7 @@ struct lu_object *lov_object_alloc(const struct lu_env *env, return obj; } -static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) { struct lov_stripe_md *lsm = NULL; @@ -969,13 +1002,6 @@ struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj) } EXPORT_SYMBOL(lov_lsm_get); -void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm) -{ - if (lsm) - lov_free_memmd(&lsm); -} -EXPORT_SYMBOL(lov_lsm_put); - int lov_read_and_clear_async_rc(struct cl_object *clob) { struct lu_object *luobj; diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c index 869ef41b13ca..be6e9857ce2a 100644 --- a/drivers/staging/lustre/lustre/lov/lov_pack.c +++ b/drivers/staging/lustre/lustre/lov/lov_pack.c @@ -45,6 +45,7 @@ #include "../include/lustre/lustre_user.h" #include "lov_internal.h" +#include "lov_cl_internal.h" void lov_dump_lmm_common(int level, void *lmmp) { @@ -104,11 +105,9 @@ void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) * LOVs properly. For now lov_mds_md_size() just assumes one u64 * per stripe. */ -int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, - struct lov_stripe_md *lsm) +int lov_obd_packmd(struct lov_obd *lov, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) { - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; struct lov_mds_md_v1 *lmmv1; struct lov_mds_md_v3 *lmmv3; __u16 stripe_count; @@ -148,16 +147,11 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, stripe_count = 0; } } else { - /* No need to allocate more than maximum supported stripes. - * Anyway, this is pretty inaccurate since ld_tgt_count now - * represents max index and we should rely on the actual number - * of OSTs instead + /* + * To calculate maximum easize by active targets at present, + * which is exactly the maximum easize to be seen by LOV */ - stripe_count = lov_mds_md_max_stripe_count( - lov->lov_ocd.ocd_max_easize, lmm_magic); - - if (stripe_count > lov->desc.ld_tgt_count) - stripe_count = lov->desc.ld_tgt_count; + stripe_count = lov->desc.ld_active_tgt_count; } /* XXX LOV STACKING call into osc for sizes */ @@ -225,6 +219,15 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, return lmm_size; } +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + + return lov_obd_packmd(lov, lmmp, lsm); +} + /* Find the max stripecount we should use */ __u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count) { @@ -284,7 +287,7 @@ int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, spin_lock_init(&(*lsmp)->lsm_lock); (*lsmp)->lsm_magic = magic; (*lsmp)->lsm_stripe_count = stripe_count; - (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; + (*lsmp)->lsm_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES * stripe_count; (*lsmp)->lsm_pattern = pattern; (*lsmp)->lsm_pool_name[0] = '\0'; (*lsmp)->lsm_layout_gen = 0; @@ -372,16 +375,17 @@ int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, * the maximum number of OST indices which will fit in the user buffer. * lmm_magic must be LOV_USER_MAGIC. */ -int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, +int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, struct lov_user_md __user *lump) { /* * XXX huge struct allocated on stack. */ /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_obd *lov; struct lov_user_md_v3 lum; struct lov_mds_md *lmmk = NULL; - int rc, lmm_size; + int rc, lmmk_size, lmm_size; int lum_size; mm_segment_t seg; @@ -401,12 +405,13 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, lum_size = sizeof(struct lov_user_md_v1); if (copy_from_user(&lum, lump, lum_size)) { rc = -EFAULT; - goto out_set; + goto out; } - if ((lum.lmm_magic != LOV_USER_MAGIC) && - (lum.lmm_magic != LOV_USER_MAGIC_V3)) { + if (lum.lmm_magic != LOV_USER_MAGIC_V1 && + lum.lmm_magic != LOV_USER_MAGIC_V3 && + lum.lmm_magic != LOV_USER_MAGIC_SPECIFIC) { rc = -EINVAL; - goto out_set; + goto out; } if (lum.lmm_stripe_count && @@ -415,11 +420,13 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, lum.lmm_stripe_count = lsm->lsm_stripe_count; rc = copy_to_user(lump, &lum, lum_size); rc = -EOVERFLOW; - goto out_set; + goto out; } - rc = lov_packmd(exp, &lmmk, lsm); + lov = lu2lov_dev(obj->lo_cl.co_lu.lo_dev)->ld_lov; + rc = lov_obd_packmd(lov, &lmmk, lsm); if (rc < 0) - goto out_set; + goto out; + lmmk_size = rc; lmm_size = rc; rc = 0; @@ -455,7 +462,7 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, lmm_size = lum_size; } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { rc = -EOVERFLOW; - goto out_set; + goto out_free; } /* * Have a difference between lov_mds_md & lov_user_md. @@ -468,8 +475,9 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, if (copy_to_user(lump, lmmk, lmm_size)) rc = -EFAULT; - obd_free_diskmd(exp, &lmmk); -out_set: +out_free: + kfree(lmmk); +out: set_fs(seg); return rc; } diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c index c17026f14896..00bfabad78eb 100644 --- a/drivers/staging/lustre/lustre/lov/lov_page.c +++ b/drivers/staging/lustre/lustre/lov/lov_page.c @@ -65,7 +65,9 @@ static int lov_raid0_page_is_under_lock(const struct lu_env *env, pgoff_t index = *max_index; unsigned int pps; /* pages per stripe */ - CDEBUG(D_READA, "*max_index = %lu, nr = %d\n", index, r0->lo_nr); + CDEBUG(D_READA, DFID "*max_index = %lu, nr = %d\n", + PFID(lu_object_fid(lov2lu(loo))), index, r0->lo_nr); + if (index == 0) /* the page is not covered by any lock */ return 0; @@ -80,7 +82,12 @@ static int lov_raid0_page_is_under_lock(const struct lu_env *env, /* calculate the end of current stripe */ pps = loo->lo_lsm->lsm_stripe_size >> PAGE_SHIFT; - index = ((slice->cpl_index + pps) & ~(pps - 1)) - 1; + index = slice->cpl_index + pps - slice->cpl_index % pps - 1; + + CDEBUG(D_READA, DFID "*max_index = %lu, index = %lu, pps = %u, stripe_size = %u, stripe no = %u, page index = %lu\n", + PFID(lu_object_fid(lov2lu(loo))), *max_index, index, pps, + loo->lo_lsm->lsm_stripe_size, lov_page_stripe(slice->cpl_page), + slice->cpl_index); /* never exceed the end of the stripe */ *max_index = min_t(pgoff_t, *max_index, index); @@ -122,6 +129,7 @@ int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, &suboff); LASSERT(rc == 0); + lpg->lps_stripe = stripe; cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_raid0_page_ops); sub = lov_sub_get(env, lio, stripe); diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c index 4c2d21729589..f8c8a361ef79 100644 --- a/drivers/staging/lustre/lustre/lov/lov_pool.c +++ b/drivers/staging/lustre/lustre/lov/lov_pool.c @@ -61,7 +61,7 @@ void lov_pool_putref(struct pool_desc *pool) LASSERT(hlist_unhashed(&pool->pool_hash)); LASSERT(list_empty(&pool->pool_list)); LASSERT(!pool->pool_debugfs_entry); - lov_ost_pool_free(&(pool->pool_obds)); + lov_ost_pool_free(&pool->pool_obds); kfree(pool); } } @@ -92,7 +92,7 @@ static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, unsigned m for (i = 0; i < LOV_MAXPOOLNAME; i++) { if (poolname[i] == '\0') break; - result = (result << 4)^(result >> 28) ^ poolname[i]; + result = (result << 4) ^ (result >> 28) ^ poolname[i]; } return (result % mask); } @@ -260,7 +260,7 @@ static int pool_proc_show(struct seq_file *s, void *v) tgt = pool_tgt(iter->pool, iter->idx); up_read(&pool_tgt_rw_sem(iter->pool)); if (tgt) - seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + seq_printf(s, "%s\n", obd_uuid2str(&tgt->ltd_uuid)); return 0; } @@ -400,7 +400,7 @@ int lov_pool_new(struct obd_device *obd, char *poolname) struct pool_desc *new_pool; int rc; - lov = &(obd->u.lov); + lov = &obd->u.lov; if (strlen(poolname) > LOV_MAXPOOLNAME) return -ENAMETOOLONG; @@ -471,7 +471,7 @@ int lov_pool_del(struct obd_device *obd, char *poolname) struct lov_obd *lov; struct pool_desc *pool; - lov = &(obd->u.lov); + lov = &obd->u.lov; /* lookup and kill hash reference */ pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); @@ -503,7 +503,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) unsigned int lov_idx; int rc; - lov = &(obd->u.lov); + lov = &obd->u.lov; pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); if (!pool) @@ -517,7 +517,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) if (!lov->lov_tgts[lov_idx]) continue; if (obd_uuid_equals(&ost_uuid, - &(lov->lov_tgts[lov_idx]->ltd_uuid))) + &lov->lov_tgts[lov_idx]->ltd_uuid)) break; } /* test if ost found in lov */ @@ -547,7 +547,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) unsigned int lov_idx; int rc = 0; - lov = &(obd->u.lov); + lov = &obd->u.lov; pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); if (!pool) @@ -562,7 +562,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) continue; if (obd_uuid_equals(&ost_uuid, - &(lov->lov_tgts[lov_idx]->ltd_uuid))) + &lov->lov_tgts[lov_idx]->ltd_uuid)) break; } diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c index 4099b51f826e..09dcaf484c89 100644 --- a/drivers/staging/lustre/lustre/lov/lov_request.c +++ b/drivers/staging/lustre/lustre/lov/lov_request.c @@ -325,84 +325,6 @@ out_set: return rc; } -int lov_fini_destroy_set(struct lov_request_set *set) -{ - if (!set) - return 0; - LASSERT(set->set_exp); - if (atomic_read(&set->set_completes)) { - /* FIXME update qos data here */ - } - - lov_put_reqset(set); - - return 0; -} - -int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, - struct obdo *src_oa, struct lov_stripe_md *lsm, - struct obd_trans_info *oti, - struct lov_request_set **reqset) -{ - struct lov_request_set *set; - struct lov_obd *lov = &exp->exp_obd->u.lov; - int rc = 0, i; - - set = kzalloc(sizeof(*set), GFP_NOFS); - if (!set) - return -ENOMEM; - lov_init_set(set); - - set->set_exp = exp; - set->set_oi = oinfo; - set->set_oi->oi_md = lsm; - set->set_oi->oi_oa = src_oa; - if (oti && src_oa->o_valid & OBD_MD_FLCOOKIE) - set->set_cookies = oti->oti_logcookies; - - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_oinfo *loi; - struct lov_request *req; - - loi = lsm->lsm_oinfo[i]; - if (lov_oinfo_is_dummy(loi)) - continue; - - if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { - CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); - continue; - } - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) { - rc = -ENOMEM; - goto out_set; - } - - req->rq_stripe = i; - req->rq_idx = loi->loi_ost_idx; - - req->rq_oi.oi_oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!req->rq_oi.oi_oa) { - kfree(req); - rc = -ENOMEM; - goto out_set; - } - memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); - req->rq_oi.oi_oa->o_oi = loi->loi_oi; - lov_set_add_req(req, set); - } - if (!set->set_count) { - rc = -EIO; - goto out_set; - } - *reqset = set; - return rc; -out_set: - lov_fini_destroy_set(set); - return rc; -} - int lov_fini_setattr_set(struct lov_request_set *set) { int rc = 0; diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c index fb2f2660b3e9..a2bac7a3b71b 100644 --- a/drivers/staging/lustre/lustre/lov/lovsub_object.c +++ b/drivers/staging/lustre/lustre/lov/lovsub_object.c @@ -98,8 +98,8 @@ static int lovsub_object_print(const struct lu_env *env, void *cookie, return (*p)(env, cookie, "[%d]", los->lso_index); } -static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid) { struct lov_object *lov = cl2lovsub(obj)->lso_super; @@ -119,7 +119,7 @@ static int lovsub_object_glimpse(const struct lu_env *env, static const struct cl_object_operations lovsub_ops = { .coo_page_init = lovsub_page_init, .coo_lock_init = lovsub_lock_init, - .coo_attr_set = lovsub_attr_set, + .coo_attr_update = lovsub_attr_update, .coo_glimpse = lovsub_object_glimpse }; diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c index 98d15fb247bc..fca9450de57c 100644 --- a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c +++ b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c @@ -43,11 +43,10 @@ static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, int len; struct obd_device *dev = container_of(kobj, struct obd_device, obd_kobj); - struct client_obd *cli = &dev->u.cli; + __u32 max; - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); - spin_unlock(&cli->cl_loi_list_lock); + max = obd_get_max_rpcs_in_flight(&dev->u.cli); + len = sprintf(buf, "%u\n", max); return len; } @@ -59,7 +58,6 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kobj); - struct client_obd *cli = &dev->u.cli; int rc; unsigned long val; @@ -67,12 +65,9 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, if (rc) return rc; - if (val < 1 || val > MDC_MAX_RIF_MAX) - return -ERANGE; - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_max_rpcs_in_flight = val; - spin_unlock(&cli->cl_loi_list_lock); + rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); + if (rc) + count = rc; return count; } diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h index 58f2841cabe4..f446c1c2584b 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_internal.h +++ b/drivers/staging/lustre/lustre/mdc/mdc_internal.h @@ -34,63 +34,57 @@ #define _MDC_INTERNAL_H #include "../include/lustre_mdc.h" -#include "../include/lustre_mds.h" void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars); void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, - __u64 valid, int ea_size, __u32 suppgid, int flags); -void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, - const struct lu_fid *cfid, int flags); + __u64 valid, size_t ea_size, __u32 suppgid, u32 flags); void mdc_swap_layouts_pack(struct ptlrpc_request *req, struct md_op_data *op_data); -void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size, +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, const struct lu_fid *fid); -void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, - struct md_op_data *data, int ea_size); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, u32 flags, + struct md_op_data *data, size_t ea_size); void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len); + void *ea, size_t ealen, void *ea2, size_t ea2len); void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const void *data, int datalen, __u32 mode, __u32 uid, - __u32 gid, cfs_cap_t capability, __u64 rdev); + const void *data, size_t datalen, umode_t mode, uid_t uid, + gid_t gid, cfs_cap_t capability, __u64 rdev); void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - __u32 mode, __u64 rdev, __u64 flags, const void *data, - int datalen); + umode_t mode, __u64 rdev, __u64 flags, const void *data, + size_t datalen); void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, int newlen); + const char *old, size_t oldlen, + const char *new, size_t newlen); void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); -int mdc_enter_request(struct client_obd *cli); -void mdc_exit_request(struct client_obd *cli); /* mdc/mdc_locks.c */ int mdc_set_lock_data(struct obd_export *exp, - __u64 *lockh, void *data, __u64 *bits); + const struct lustre_handle *lockh, + void *data, __u64 *bits); int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); -int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, - ldlm_iterator_t it, void *data); - int mdc_intent_lock(struct obd_export *exp, - struct md_op_data *, - void *lmm, int lmmsize, - struct lookup_intent *, int, + struct md_op_data *op_data, + struct lookup_intent *it, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags); + int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const ldlm_policy_data_t *policy, struct lookup_intent *it, struct md_op_data *op_data, - struct lustre_handle *lockh, void *lmm, int lmmsize, - struct ptlrpc_request **req, __u64 extra_lock_flags); + struct lustre_handle *lockh, __u64 extra_lock_flags); int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, struct list_head *cancels, enum ldlm_mode mode, __u64 bits); /* mdc/mdc_request.c */ -int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, - struct md_op_data *op_data); +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data); struct obd_client_handle; int mdc_set_open_replay_data(struct obd_export *exp, @@ -101,16 +95,17 @@ void mdc_commit_open(struct ptlrpc_request *req); void mdc_replay_open(struct ptlrpc_request *req); int mdc_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, int datalen, int mode, __u32 uid, __u32 gid, - cfs_cap_t capability, __u64 rdev, + const void *data, size_t datalen, umode_t mode, uid_t uid, + gid_t gid, cfs_cap_t capability, __u64 rdev, struct ptlrpc_request **request); int mdc_link(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request); int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, int newlen, + const char *old, size_t oldlen, + const char *new, size_t newlen, struct ptlrpc_request **request); int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len, + void *ea, size_t ealen, void *ea2, size_t ea2len, struct ptlrpc_request **request, struct md_open_data **mod); int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, struct ptlrpc_request **request); @@ -138,4 +133,12 @@ static inline int mdc_prep_elc_req(struct obd_export *exp, count); } +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + /* save hash 0 with hash 1 */ + return ~0UL - (hash + !hash); +} + #endif diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c index 143bd7628572..aac7e04873e2 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_lib.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_lib.c @@ -37,27 +37,12 @@ static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) { - b->suppgid = suppgid; - b->uid = from_kuid(&init_user_ns, current_uid()); - b->gid = from_kgid(&init_user_ns, current_gid()); - b->fsuid = from_kuid(&init_user_ns, current_fsuid()); - b->fsgid = from_kgid(&init_user_ns, current_fsgid()); - b->capability = cfs_curproc_cap_pack(); -} - -void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, - const struct lu_fid *cfid, int flags) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - - if (pfid) { - b->fid1 = *pfid; - b->valid = OBD_MD_FLID; - } - if (cfid) - b->fid2 = *cfid; - b->flags = flags; + b->mbo_suppgid = suppgid; + b->mbo_uid = from_kuid(&init_user_ns, current_uid()); + b->mbo_gid = from_kgid(&init_user_ns, current_gid()); + b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); + b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); + b->mbo_capability = cfs_curproc_cap_pack(); } void mdc_swap_layouts_pack(struct ptlrpc_request *req, @@ -67,43 +52,74 @@ void mdc_swap_layouts_pack(struct ptlrpc_request *req, &RMF_MDT_BODY); __mdc_pack_body(b, op_data->op_suppgids[0]); - b->fid1 = op_data->op_fid1; - b->fid2 = op_data->op_fid2; - b->valid |= OBD_MD_FLID; + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; } void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, - __u64 valid, int ea_size, __u32 suppgid, int flags) + __u64 valid, size_t ea_size, __u32 suppgid, u32 flags) { struct mdt_body *b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - b->valid = valid; - b->eadatasize = ea_size; - b->flags = flags; + b->mbo_valid = valid; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; __mdc_pack_body(b, suppgid); if (fid) { - b->fid1 = *fid; - b->valid |= OBD_MD_FLID; + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; } } -void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, - __u32 size, const struct lu_fid *fid) +/** + * Pack a name (path component) into a request + * + * \param[in] req request + * \param[in] field request field (usually RMF_NAME) + * \param[in] name path component + * \param[in] name_len length of path component + * + * \a field must be present in \a req and of size \a name_len + 1. + * + * \a name must be '\0' terminated of length \a name_len and represent + * a single path component (not contain '/'). + */ +static void mdc_pack_name(struct ptlrpc_request *req, + const struct req_msg_field *field, + const char *name, size_t name_len) +{ + size_t buf_size; + size_t cpy_len; + char *buf; + + buf = req_capsule_client_get(&req->rq_pill, field); + buf_size = req_capsule_get_size(&req->rq_pill, field, RCL_CLIENT); + + LASSERT(name && name_len && buf && buf_size == name_len + 1); + + cpy_len = strlcpy(buf, name, buf_size); + + LASSERT(cpy_len == name_len && lu_name_is_valid_2(buf, cpy_len)); +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, + const struct lu_fid *fid) { struct mdt_body *b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - b->fid1 = *fid; - b->valid |= OBD_MD_FLID; - b->size = pgoff; /* !! */ - b->nlink = size; /* !! */ + b->mbo_fid1 = *fid; + b->mbo_valid |= OBD_MD_FLID; + b->mbo_size = pgoff; /* !! */ + b->mbo_nlink = size; /* !! */ __mdc_pack_body(b, -1); - b->mode = LUDA_FID | LUDA_TYPE; + b->mbo_mode = LUDA_FID | LUDA_TYPE; } /* packing of MDS records */ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const void *data, int datalen, __u32 mode, - __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev) + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev) { struct mdt_rec_create *rec; char *tmp; @@ -130,22 +146,17 @@ void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_bias = op_data->op_bias; rec->cr_umask = current_umask(); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - LOGL0(op_data->op_name, op_data->op_namelen, tmp); - + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); if (data) { tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); memcpy(tmp, data, datalen); } } -static __u64 mds_pack_open_flags(__u64 flags, __u32 mode) +static inline __u64 mds_pack_open_flags(__u64 flags) { __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | - MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | - MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | - MDS_OPEN_BY_FID | MDS_OPEN_LEASE | - MDS_OPEN_RELEASE)); + MDS_OPEN_FL_INTERNAL)); if (flags & O_CREAT) cr_flags |= MDS_OPEN_CREAT; if (flags & O_EXCL) @@ -171,8 +182,8 @@ static __u64 mds_pack_open_flags(__u64 flags, __u32 mode) /* packing of MDS records */ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - __u32 mode, __u64 rdev, __u64 flags, const void *lmm, - int lmmlen) + umode_t mode, __u64 rdev, __u64 flags, const void *lmm, + size_t lmmlen) { struct mdt_rec_create *rec; char *tmp; @@ -190,7 +201,7 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_fid2 = op_data->op_fid2; rec->cr_mode = mode; - cr_flags = mds_pack_open_flags(flags, mode); + cr_flags = mds_pack_open_flags(flags); rec->cr_rdev = rdev; rec->cr_time = op_data->op_mod_time; rec->cr_suppgid1 = op_data->op_suppgids[0]; @@ -200,8 +211,9 @@ void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->cr_old_handle = op_data->op_handle; if (op_data->op_name) { - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - LOGL0(op_data->op_name, op_data->op_namelen, tmp); + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); + if (op_data->op_bias & MDS_CREATE_VOLATILE) cr_flags |= MDS_OPEN_VOLATILE; } @@ -295,7 +307,7 @@ static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, } void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len) + void *ea, size_t ealen, void *ea2, size_t ea2len) { struct mdt_rec_setattr *rec; struct mdt_ioepoch *epoch; @@ -316,7 +328,7 @@ void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); if (!ea) { /* Remove LOV EA */ - lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); lum->lmm_stripe_size = 0; lum->lmm_stripe_count = 0; lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); @@ -334,7 +346,6 @@ void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) { struct mdt_rec_unlink *rec; - char *tmp; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); @@ -352,15 +363,12 @@ void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->ul_time = op_data->op_mod_time; rec->ul_bias = op_data->op_bias; - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - LASSERT(tmp); - LOGL0(op_data->op_name, op_data->op_namelen, tmp); + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); } void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) { struct mdt_rec_link *rec; - char *tmp; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); @@ -376,20 +384,21 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) rec->lk_time = op_data->op_mod_time; rec->lk_bias = op_data->op_bias; - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - LOGL0(op_data->op_name, op_data->op_namelen, tmp); + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); } void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, int newlen) + const char *old, size_t oldlen, + const char *new, size_t newlen) { struct mdt_rec_rename *rec; - char *tmp; CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); /* XXX do something about time, uid, gid */ + rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? + REINT_MIGRATE : REINT_RENAME; rec->rn_opcode = REINT_RENAME; rec->rn_fsuid = op_data->op_fsuid; rec->rn_fsgid = op_data->op_fsgid; @@ -402,39 +411,34 @@ void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, rec->rn_mode = op_data->op_mode; rec->rn_bias = op_data->op_bias; - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - LOGL0(old, oldlen, tmp); + mdc_pack_name(req, &RMF_NAME, old, oldlen); - if (new) { - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT); - LOGL0(new, newlen, tmp); - } + if (new) + mdc_pack_name(req, &RMF_SYMTGT, new, newlen); } -void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, - struct md_op_data *op_data, int ea_size) +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, u32 flags, + struct md_op_data *op_data, size_t ea_size) { struct mdt_body *b = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); - b->valid = valid; + b->mbo_valid = valid; if (op_data->op_bias & MDS_CHECK_SPLIT) - b->valid |= OBD_MD_FLCKSPLIT; + b->mbo_valid |= OBD_MD_FLCKSPLIT; if (op_data->op_bias & MDS_CROSS_REF) - b->valid |= OBD_MD_FLCROSSREF; - b->eadatasize = ea_size; - b->flags = flags; + b->mbo_valid |= OBD_MD_FLCROSSREF; + b->mbo_eadatasize = ea_size; + b->mbo_flags = flags; __mdc_pack_body(b, op_data->op_suppgids[0]); - b->fid1 = op_data->op_fid1; - b->fid2 = op_data->op_fid2; - b->valid |= OBD_MD_FLID; - - if (op_data->op_name) { - char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + b->mbo_fid1 = op_data->op_fid1; + b->mbo_fid2 = op_data->op_fid2; + b->mbo_valid |= OBD_MD_FLID; - LOGL0(op_data->op_name, op_data->op_namelen, tmp); - } + if (op_data->op_name) + mdc_pack_name(req, &RMF_NAME, op_data->op_name, + op_data->op_namelen); } static void mdc_hsm_release_pack(struct ptlrpc_request *req, @@ -482,67 +486,3 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) mdc_ioepoch_pack(epoch, op_data); mdc_hsm_release_pack(req, op_data); } - -static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) -{ - int rc; - - spin_lock(&cli->cl_loi_list_lock); - rc = list_empty(&mcw->mcw_entry); - spin_unlock(&cli->cl_loi_list_lock); - return rc; -}; - -/* We record requests in flight in cli->cl_r_in_flight here. - * There is only one write rpc possible in mdc anyway. If this to change - * in the future - the code may need to be revisited. - */ -int mdc_enter_request(struct client_obd *cli) -{ - int rc = 0; - struct mdc_cache_waiter mcw; - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); - - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { - list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); - init_waitqueue_head(&mcw.mcw_waitq); - spin_unlock(&cli->cl_loi_list_lock); - rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), - &lwi); - if (rc) { - spin_lock(&cli->cl_loi_list_lock); - if (list_empty(&mcw.mcw_entry)) - cli->cl_r_in_flight--; - list_del_init(&mcw.mcw_entry); - spin_unlock(&cli->cl_loi_list_lock); - } - } else { - cli->cl_r_in_flight++; - spin_unlock(&cli->cl_loi_list_lock); - } - return rc; -} - -void mdc_exit_request(struct client_obd *cli) -{ - struct list_head *l, *tmp; - struct mdc_cache_waiter *mcw; - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_r_in_flight--; - list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { - /* No free request slots anymore */ - break; - } - - mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); - list_del_init(&mcw->mcw_entry); - cli->cl_r_in_flight++; - wake_up(&mcw->mcw_waitq); - } - /* Empty waiting list? Decrease reqs in-flight number */ - - spin_unlock(&cli->cl_loi_list_lock); -} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c index f48b58423307..f1f6c082fa42 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_locks.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_locks.c @@ -93,8 +93,8 @@ int it_open_error(int phase, struct lookup_intent *it) EXPORT_SYMBOL(it_open_error); /* this must be called on a lockh that is known to have a referenced lock */ -int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, - __u64 *bits) +int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, + void *data, __u64 *bits) { struct ldlm_lock *lock; struct inode *new_inode = data; @@ -102,10 +102,10 @@ int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, if (bits) *bits = 0; - if (!*lockh) + if (!lustre_handle_is_used(lockh)) return 0; - lock = ldlm_handle2lock((struct lustre_handle *)lockh); + lock = ldlm_handle2lock(lockh); LASSERT(lock); lock_res_and_lock(lock); @@ -174,7 +174,7 @@ int mdc_null_inode(struct obd_export *exp, fid_build_reg_res_name(fid, &res_id); res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (!res) + if (IS_ERR(res)) return 0; lock_res(res); @@ -185,28 +185,6 @@ int mdc_null_inode(struct obd_export *exp, return 0; } -/* find any ldlm lock of the inode in mdc - * return 0 not find - * 1 find one - * < 0 error - */ -int mdc_find_cbdata(struct obd_export *exp, - const struct lu_fid *fid, - ldlm_iterator_t it, void *data) -{ - struct ldlm_res_id res_id; - int rc = 0; - - fid_build_reg_res_name((struct lu_fid *)fid, &res_id); - rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id, - it, data); - if (rc == LDLM_ITER_STOP) - return 1; - else if (rc == LDLM_ITER_CONTINUE) - return 0; - return rc; -} - static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) { /* Don't hold error requests for replay. */ @@ -240,24 +218,24 @@ static void mdc_realloc_openmsg(struct ptlrpc_request *req, /* FIXME: remove this explicit offset. */ rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4, - body->eadatasize); + body->mbo_eadatasize); if (rc) { CERROR("Can't enlarge segment %d size to %d\n", - DLM_INTENT_REC_OFF + 4, body->eadatasize); - body->valid &= ~OBD_MD_FLEASIZE; - body->eadatasize = 0; + DLM_INTENT_REC_OFF + 4, body->mbo_eadatasize); + body->mbo_valid &= ~OBD_MD_FLEASIZE; + body->mbo_eadatasize = 0; } } -static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data, - void *lmm, int lmmsize, - void *cb_data) +static struct ptlrpc_request * +mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, + struct md_op_data *op_data) { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); struct ldlm_intent *lit; + const void *lmm = op_data->op_data; + u32 lmmsize = op_data->op_data_size; LIST_HEAD(cancels); int count = 0; int mode; @@ -274,7 +252,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, else mode = LCK_PR; } else { - if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + if (it->it_flags & (FMODE_WRITE | MDS_OPEN_TRUNC)) mode = LCK_CW; else if (it->it_flags & __FMODE_EXEC) mode = LCK_PR; @@ -325,6 +303,9 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, lmmsize); + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + ptlrpc_request_set_replen(req); return req; } @@ -336,7 +317,8 @@ mdc_intent_getxattr_pack(struct obd_export *exp, { struct ptlrpc_request *req; struct ldlm_intent *lit; - int rc, count = 0, maxdata; + int rc, count = 0; + u32 maxdata; LIST_HEAD(cancels); req = ptlrpc_request_alloc(class_exp2cliimp(exp), @@ -421,7 +403,7 @@ static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, OBD_MD_MEA | OBD_MD_FLACL; struct ldlm_intent *lit; int rc; - int easize; + u32 easize; req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_INTENT_GETATTR); @@ -526,7 +508,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, struct ldlm_reply *lockrep; struct ldlm_lock *lock; void *lvb_data = NULL; - int lvb_len = 0; + u32 lvb_len = 0; LASSERT(rc >= 0); /* Similarly, if we're going to replay this request, we don't want to @@ -605,7 +587,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, mdc_set_open_replay_data(NULL, NULL, it); } - if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { + if ((body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { void *eadata; mdc_update_max_ea_from_body(exp, body); @@ -615,7 +597,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, * Eventually, obd_unpackmd() will check the contents. */ eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, - body->eadatasize); + body->mbo_eadatasize); if (!eadata) return -EPROTO; @@ -623,7 +605,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, * lock */ lvb_data = eadata; - lvb_len = body->eadatasize; + lvb_len = body->mbo_eadatasize; /* * We save the reply LOV EA in case we have to replay a @@ -639,20 +621,20 @@ static int mdc_finish_enqueue(struct obd_export *exp, if (req_capsule_get_size(pill, &RMF_EADATA, RCL_CLIENT) < - body->eadatasize) + body->mbo_eadatasize) mdc_realloc_openmsg(req, body); else req_capsule_shrink(pill, &RMF_EADATA, - body->eadatasize, + body->mbo_eadatasize, RCL_CLIENT); req_capsule_set_size(pill, &RMF_EADATA, RCL_CLIENT, - body->eadatasize); + body->mbo_eadatasize); lmm = req_capsule_client_get(pill, &RMF_EADATA); if (lmm) - memcpy(lmm, eadata, body->eadatasize); + memcpy(lmm, eadata, body->mbo_eadatasize); } } } else if (it->it_op & IT_LAYOUT) { @@ -662,7 +644,8 @@ static int mdc_finish_enqueue(struct obd_export *exp, lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); if (lvb_len > 0) { lvb_data = req_capsule_server_sized_get(pill, - &RMF_DLM_LVB, lvb_len); + &RMF_DLM_LVB, + lvb_len); if (!lvb_data) return -EPROTO; } @@ -705,9 +688,9 @@ static int mdc_finish_enqueue(struct obd_export *exp, * we don't know in advance the file type. */ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + const ldlm_policy_data_t *policy, struct lookup_intent *it, struct md_op_data *op_data, - struct lustre_handle *lockh, void *lmm, int lmmsize, - struct ptlrpc_request **reqp, u64 extra_lock_flags) + struct lustre_handle *lockh, u64 extra_lock_flags) { static const ldlm_policy_data_t lookup_policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } @@ -721,9 +704,8 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, static const ldlm_policy_data_t getxattr_policy = { .l_inodebits = { MDS_INODELOCK_XATTR } }; - ldlm_policy_data_t const *policy = &lookup_policy; struct obd_device *obddev = class_exp2obd(exp); - struct ptlrpc_request *req; + struct ptlrpc_request *req = NULL; u64 flags, saved_flags = extra_lock_flags; struct ldlm_res_id res_id; int generation, resends = 0; @@ -733,40 +715,32 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", einfo->ei_type); - fid_build_reg_res_name(&op_data->op_fid1, &res_id); if (it) { + LASSERT(!policy); + saved_flags |= LDLM_FL_HAS_INTENT; - if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + if (it->it_op & (IT_OPEN | IT_UNLINK | IT_GETATTR | IT_READDIR)) policy = &update_policy; else if (it->it_op & IT_LAYOUT) policy = &layout_policy; else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) policy = &getxattr_policy; + else + policy = &lookup_policy; } - LASSERT(!reqp); - generation = obddev->u.cli.cl_import->imp_generation; resend: flags = saved_flags; if (!it) { - /* The only way right now is FLOCK, in this case we hide flock - * policy as lmm, but lmmsize is 0 - */ - LASSERT(lmm && lmmsize == 0); + /* The only way right now is FLOCK. */ LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", einfo->ei_type); - policy = lmm; res_id.name[3] = LDLM_FLOCK; - req = NULL; } else if (it->it_op & IT_OPEN) { - req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize, - einfo->ei_cbdata); - policy = &update_policy; - einfo->ei_cbdata = NULL; - lmm = NULL; + req = mdc_intent_open_pack(exp, it, op_data); } else if (it->it_op & IT_UNLINK) { req = mdc_intent_unlink_pack(exp, it, op_data); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { @@ -806,7 +780,7 @@ resend: */ if (it) { mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); - rc = mdc_enter_request(&obddev->u.cli); + rc = obd_get_request_slot(&obddev->u.cli); if (rc != 0) { mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); mdc_clear_replay_flag(req, 0); @@ -834,13 +808,12 @@ resend: return rc; } - mdc_exit_request(&obddev->u.cli); + obd_put_request_slot(&obddev->u.cli); mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); if (rc < 0) { - CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, - "%s: ldlm_cli_enqueue failed: rc = %d\n", - obddev->obd_name, rc); + CDEBUG(D_INFO, "%s: ldlm_cli_enqueue failed: rc = %d\n", + obddev->obd_name, rc); mdc_clear_replay_flag(req, rc); ptlrpc_req_finished(req); @@ -903,6 +876,9 @@ static int mdc_finish_intent_lock(struct obd_export *exp, LASSERT(request != LP_POISON); LASSERT(request->rq_repmsg != LP_POISON); + if (it->it_op & IT_READDIR) + return 0; + if (!it_disposition(it, DISP_IT_EXECD)) { /* The server failed before it even started executing the * intent, i.e. because it couldn't unpack the request. @@ -917,27 +893,6 @@ static int mdc_finish_intent_lock(struct obd_export *exp, mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); LASSERT(mdt_body); /* mdc_enqueue checked */ - /* If we were revalidating a fid/name pair, mark the intent in - * case we fail and get called again from lookup - */ - if (fid_is_sane(&op_data->op_fid2) && - it->it_create_mode & M_CHECK_STALE && - it->it_op != IT_GETATTR) { - /* Also: did we find the same inode? */ - /* sever can return one of two fids: - * op_fid2 - new allocated fid - if file is created. - * op_fid3 - existent fid - if file only open. - * op_fid3 is saved in lmv_intent_open - */ - if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) && - (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) { - CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID - "\n", PFID(&op_data->op_fid2), - PFID(&op_data->op_fid2), PFID(&mdt_body->fid1)); - return -ESTALE; - } - } - rc = it_open_error(DISP_LOOKUP_EXECD, it); if (rc) return rc; @@ -980,10 +935,10 @@ static int mdc_finish_intent_lock(struct obd_export *exp, LDLM_DEBUG(lock, "matching against this"); - LASSERTF(fid_res_name_eq(&mdt_body->fid1, + LASSERTF(fid_res_name_eq(&mdt_body->mbo_fid1, &lock->l_resource->lr_name), "Lock res_id: "DLDLMRES", fid: "DFID"\n", - PLDLMRES(lock->l_resource), PFID(&mdt_body->fid1)); + PLDLMRES(lock->l_resource), PFID(&mdt_body->mbo_fid1)); LDLM_LOCK_PUT(lock); memcpy(&old_lock, lockh, sizeof(*lockh)); @@ -998,8 +953,8 @@ static int mdc_finish_intent_lock(struct obd_export *exp, } CDEBUG(D_DENTRY, "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", - op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op), - it->it_status, it->it_disposition, rc); + (int)op_data->op_namelen, op_data->op_name, + ldlm_it2str(it->it_op), it->it_status, it->it_disposition, rc); return rc; } @@ -1042,6 +997,9 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM; break; + case IT_READDIR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; case IT_LAYOUT: policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; break; @@ -1095,10 +1053,8 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, * child lookup. */ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - void *lmm, int lmmsize, struct lookup_intent *it, - int lookup_flags, struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) + struct lookup_intent *it, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) { struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, @@ -1112,14 +1068,14 @@ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, LASSERT(it); CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID - ", intent: %s flags %#Lo\n", op_data->op_namelen, + ", intent: %s flags %#Lo\n", (int)op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid2), PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); lockh.cookie = 0; if (fid_is_sane(&op_data->op_fid2) && - (it->it_op & (IT_LOOKUP | IT_GETATTR))) { + (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR))) { /* We could just return 1 immediately, but since we should only * be called in revalidate_it if we already have a lock, let's * verify that. @@ -1135,13 +1091,13 @@ int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, /* For case if upper layer did not alloc fid, do it now. */ if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { - rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc < 0) { CERROR("Can't alloc new fid, rc %d\n", rc); return rc; } } - rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh, lmm, lmmsize, NULL, + rc = mdc_enqueue(exp, &einfo, NULL, it, op_data, &lockh, extra_lock_flags); if (rc < 0) return rc; @@ -1170,7 +1126,7 @@ static int mdc_intent_getattr_async_interpret(const struct lu_env *env, obddev = class_exp2obd(exp); - mdc_exit_request(&obddev->u.cli); + obd_put_request_slot(&obddev->u.cli); if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) rc = -ETIMEDOUT; @@ -1222,15 +1178,15 @@ int mdc_intent_getattr_async(struct obd_export *exp, CDEBUG(D_DLMTRACE, "name: %.*s in inode " DFID ", intent: %s flags %#Lo\n", - op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), - ldlm_it2str(it->it_op), it->it_flags); + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); fid_build_reg_res_name(&op_data->op_fid1, &res_id); req = mdc_intent_getattr_pack(exp, it, op_data); if (IS_ERR(req)) return PTR_ERR(req); - rc = mdc_enter_request(&obddev->u.cli); + rc = obd_get_request_slot(&obddev->u.cli); if (rc != 0) { ptlrpc_req_finished(req); return rc; @@ -1239,7 +1195,7 @@ int mdc_intent_getattr_async(struct obd_export *exp, rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); if (rc < 0) { - mdc_exit_request(&obddev->u.cli); + obd_put_request_slot(&obddev->u.cli); ptlrpc_req_finished(req); return rc; } diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c index 5dba2c813857..c921e471fa27 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_reint.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_reint.c @@ -86,7 +86,7 @@ int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, fid_build_reg_res_name(fid, &res_id); res = ldlm_resource_get(exp->exp_obd->obd_namespace, NULL, &res_id, 0, 0); - if (!res) + if (IS_ERR(res)) return 0; LDLM_RESOURCE_ADDREF(res); /* Initialize ibits lock policy. */ @@ -99,7 +99,7 @@ int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, } int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, int ealen, void *ea2, int ea2len, + void *ea, size_t ealen, void *ea2, size_t ea2len, struct ptlrpc_request **request, struct md_open_data **mod) { LIST_HEAD(cancels); @@ -110,11 +110,10 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, __u64 bits; bits = MDS_INODELOCK_UPDATE; - if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + if (op_data->op_attr.ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) bits |= MDS_INODELOCK_LOOKUP; if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1)) && - !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + (fid_is_sane(&op_data->op_fid1))) count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, LCK_EX, bits); req = ptlrpc_request_alloc(class_exp2cliimp(exp), @@ -177,8 +176,8 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - epoch->handle = body->handle; - epoch->ioepoch = body->ioepoch; + epoch->handle = body->mbo_handle; + epoch->ioepoch = body->mbo_ioepoch; req->rq_replay_cb = mdc_replay_open; /** bug 3633, open may be committed and estale answer is not error */ } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) { @@ -197,9 +196,9 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, } int mdc_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, int datalen, int mode, __u32 uid, __u32 gid, - cfs_cap_t cap_effective, __u64 rdev, - struct ptlrpc_request **request) + const void *data, size_t datalen, umode_t mode, + uid_t uid, gid_t gid, cfs_cap_t cap_effective, + __u64 rdev, struct ptlrpc_request **request) { struct ptlrpc_request *req; int level, rc; @@ -214,11 +213,9 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, * mdc_fid_alloc() may return errno 1 in case of switch to new * sequence, handle this. */ - rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); - if (rc < 0) { - CERROR("Can't alloc new fid, rc %d\n", rc); + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc < 0) return rc; - } } rebuild: @@ -307,14 +304,12 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, LASSERT(!req); if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1)) && - !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + (fid_is_sane(&op_data->op_fid1))) count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3)) && - !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + (fid_is_sane(&op_data->op_fid3))) count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, MDS_INODELOCK_FULL); @@ -394,7 +389,7 @@ int mdc_link(struct obd_export *exp, struct md_op_data *op_data, } int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, int oldlen, const char *new, int newlen, + const char *old, size_t oldlen, const char *new, size_t newlen, struct ptlrpc_request **request) { LIST_HEAD(cancels); @@ -431,7 +426,8 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, } req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, + newlen + 1); rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); if (rc) { diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 542801f04b0d..f56ea643f9bf 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -39,7 +39,9 @@ # include <linux/utsname.h> #include "../include/lustre_acl.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/obd_class.h" +#include "../include/lustre_lmv.h" #include "../include/lustre_fid.h" #include "../include/lprocfs_status.h" #include "../include/lustre_param.h" @@ -57,16 +59,16 @@ static inline int mdc_queue_wait(struct ptlrpc_request *req) struct client_obd *cli = &req->rq_import->imp_obd->u.cli; int rc; - /* mdc_enter_request() ensures that this client has no more + /* obd_get_request_slot() ensures that this client has no more * than cl_max_rpcs_in_flight RPCs simultaneously inf light * against an MDT. */ - rc = mdc_enter_request(cli); + rc = obd_get_request_slot(cli); if (rc != 0) return rc; rc = ptlrpc_queue_wait(req); - mdc_exit_request(cli); + obd_put_request_slot(cli); return rc; } @@ -98,7 +100,7 @@ static int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid) goto out; } - *rootfid = body->fid1; + *rootfid = body->mbo_fid1; CDEBUG(D_NET, "root fid="DFID", last_committed=%llu\n", PFID(rootfid), @@ -136,12 +138,12 @@ static int mdc_getattr_common(struct obd_export *exp, if (!body) return -EPROTO; - CDEBUG(D_NET, "mode: %o\n", body->mode); + CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); mdc_update_max_ea_from_body(exp, body); - if (body->eadatasize != 0) { + if (body->mbo_eadatasize != 0) { eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, - body->eadatasize); + body->mbo_eadatasize); if (!eadata) return -EPROTO; } @@ -230,32 +232,6 @@ static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, return rc; } -static int mdc_is_subdir(struct obd_export *exp, - const struct lu_fid *pfid, - const struct lu_fid *cfid, - struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int rc; - - *request = NULL; - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION, - MDS_IS_SUBDIR); - if (!req) - return -ENOMEM; - - mdc_is_subdir_pack(req, pfid, cfid, 0); - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc && rc != -EREMOTE) - ptlrpc_req_finished(req); - else - *request = req; - return rc; -} - static int mdc_xattr_common(struct obd_export *exp, const struct req_format *fmt, const struct lu_fid *fid, @@ -397,15 +373,15 @@ static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) void *buf; int rc; - if (!body->aclsize) + if (!body->mbo_aclsize) return 0; - buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize); + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize); if (!buf) return -EPROTO; - acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize); + acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize); if (!acl) return 0; @@ -443,24 +419,24 @@ static int mdc_get_lustre_md(struct obd_export *exp, md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); - if (md->body->valid & OBD_MD_FLEASIZE) { + if (md->body->mbo_valid & OBD_MD_FLEASIZE) { int lmmsize; struct lov_mds_md *lmm; - if (!S_ISREG(md->body->mode)) { + if (!S_ISREG(md->body->mbo_mode)) { CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a regular file, but is not\n"); rc = -EPROTO; goto out; } - if (md->body->eadatasize == 0) { + if (md->body->mbo_eadatasize == 0) { CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, but eadatasize 0\n"); rc = -EPROTO; goto out; } - lmmsize = md->body->eadatasize; + lmmsize = md->body->mbo_eadatasize; lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize); if (!lmm) { rc = -EPROTO; @@ -471,7 +447,7 @@ static int mdc_get_lustre_md(struct obd_export *exp, if (rc < 0) goto out; - if (rc < sizeof(*md->lsm)) { + if (rc < (typeof(rc))sizeof(*md->lsm)) { CDEBUG(D_INFO, "lsm size too small: rc < sizeof (*md->lsm) (%d < %d)\n", rc, (int)sizeof(*md->lsm)); @@ -479,24 +455,24 @@ static int mdc_get_lustre_md(struct obd_export *exp, goto out; } - } else if (md->body->valid & OBD_MD_FLDIREA) { + } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { int lmvsize; struct lov_mds_md *lmv; - if (!S_ISDIR(md->body->mode)) { + if (!S_ISDIR(md->body->mbo_mode)) { CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a directory, but is not\n"); rc = -EPROTO; goto out; } - if (md->body->eadatasize == 0) { + if (md->body->mbo_eadatasize == 0) { CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, but eadatasize 0\n"); return -EPROTO; } - if (md->body->valid & OBD_MD_MEA) { - lmvsize = md->body->eadatasize; + if (md->body->mbo_valid & OBD_MD_MEA) { + lmvsize = md->body->mbo_eadatasize; lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmvsize); if (!lmv) { @@ -504,15 +480,15 @@ static int mdc_get_lustre_md(struct obd_export *exp, goto out; } - rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, + rc = obd_unpackmd(md_exp, (void *)&md->lmv, lmv, lmvsize); if (rc < 0) goto out; - if (rc < sizeof(*md->mea)) { + if (rc < (typeof(rc))sizeof(*md->lmv)) { CDEBUG(D_INFO, - "size too small: rc < sizeof(*md->mea) (%d < %d)\n", - rc, (int)sizeof(*md->mea)); + "size too small: rc < sizeof(*md->lmv) (%d < %d)\n", + rc, (int)sizeof(*md->lmv)); rc = -EPROTO; goto out; } @@ -520,12 +496,12 @@ static int mdc_get_lustre_md(struct obd_export *exp, } rc = 0; - if (md->body->valid & OBD_MD_FLACL) { + if (md->body->mbo_valid & OBD_MD_FLACL) { /* for ACL, it's possible that FLACL is set but aclsize is zero. * only when aclsize != 0 there's an actual segment for ACL * in reply buffer. */ - if (md->body->aclsize) { + if (md->body->mbo_aclsize) { rc = mdc_unpack_acl(req, md); if (rc) goto out; @@ -580,9 +556,9 @@ void mdc_replay_open(struct ptlrpc_request *req) file_fh = &och->och_fh; CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", - file_fh->cookie, body->handle.cookie); + file_fh->cookie, body->mbo_handle.cookie); old = *file_fh; - *file_fh = body->handle; + *file_fh = body->mbo_handle; } close_req = mod->mod_close_req; if (close_req) { @@ -597,7 +573,7 @@ void mdc_replay_open(struct ptlrpc_request *req) if (och) LASSERT(!memcmp(&old, &epoch->handle, sizeof(old))); DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); - epoch->handle = body->handle; + epoch->handle = body->mbo_handle; } } @@ -679,11 +655,11 @@ int mdc_set_open_replay_data(struct obd_export *exp, spin_unlock(&open_req->rq_lock); } - rec->cr_fid2 = body->fid1; - rec->cr_ioepoch = body->ioepoch; - rec->cr_old_handle.cookie = body->handle.cookie; + rec->cr_fid2 = body->mbo_fid1; + rec->cr_ioepoch = body->mbo_ioepoch; + rec->cr_old_handle.cookie = body->mbo_handle.cookie; open_req->rq_replay_cb = mdc_replay_open; - if (!fid_is_sane(&body->fid1)) { + if (!fid_is_sane(&body->mbo_fid1)) { DEBUG_REQ(D_ERROR, open_req, "Saving replay request with insane fid"); LBUG(); @@ -701,9 +677,15 @@ static void mdc_free_open(struct md_open_data *mod) imp_connect_disp_stripe(mod->mod_open_req->rq_import)) committed = 1; - LASSERT(mod->mod_open_req->rq_replay == 0); - - DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n"); + /* + * No reason to asssert here if the open request has + * rq_replay == 1. It means that mdc_close failed, and + * close request wasn`t sent. It is not fatal to client. + * The worst thing is eviction if the client gets open lock + */ + DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, + "free open request rq_replay = %d\n", + mod->mod_open_req->rq_replay); ptlrpc_request_committed(mod->mod_open_req, committed); if (mod->mod_close_req) @@ -744,7 +726,7 @@ static void mdc_close_handle_reply(struct ptlrpc_request *req, epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); epoch->flags |= MF_SOM_AU; - if (repbody->valid & OBD_MD_FLGETATTRLOCK) + if (repbody->mbo_valid & OBD_MD_FLGETATTRLOCK) op_data->op_flags |= MF_GETATTR_LOCK; } } @@ -763,7 +745,7 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, req_fmt = &RQF_MDS_RELEASE_CLOSE; /* allocate a FID for volatile file */ - rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc < 0) { CERROR("%s: "DFID" failed to allocate FID: %d\n", obd->obd_name, PFID(&op_data->op_fid1), rc); @@ -773,22 +755,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, } *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a - * portal whose threads are not taking any DLM locks and are therefore - * always progressing - */ - req->rq_request_portal = MDS_READPAGE_PORTAL; - ptlrpc_at_set_req_timeout(req); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) + req = NULL; + else + req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); /* Ensure that this close's handle is fixed up during replay. */ if (likely(mod)) { @@ -809,6 +779,29 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); } + if (!req) { + /* + * TODO: repeat close after errors + */ + CWARN("%s: close of FID "DFID" failed, file reference will be dropped when this client unmounts or is evicted\n", + obd->obd_name, PFID(&op_data->op_fid1)); + rc = -ENOMEM; + goto out; + } + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + goto out; + } + + /* + * To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing + */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); mdc_close_pack(req, op_data); @@ -854,6 +847,7 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, } } +out: if (mod) { if (rc != 0) mod->mod_close_req = NULL; @@ -936,16 +930,17 @@ static int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data, return rc; } -static int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data, - struct page **pages, struct ptlrpc_request **request) +static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, + u64 offset, struct page **pages, int npages, + struct ptlrpc_request **request) { - struct ptlrpc_request *req; struct ptlrpc_bulk_desc *desc; - int i; - wait_queue_head_t waitq; - int resends = 0; - struct l_wait_info lwi; - int rc; + struct ptlrpc_request *req; + wait_queue_head_t waitq; + struct l_wait_info lwi; + int resends = 0; + int rc; + int i; *request = NULL; init_waitqueue_head(&waitq); @@ -964,7 +959,7 @@ restart_bulk: req->rq_request_portal = MDS_READPAGE_PORTAL; ptlrpc_at_set_req_timeout(req); - desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK, + desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_PUT_SINK, MDS_BULK_PORTAL); if (!desc) { ptlrpc_request_free(req); @@ -972,12 +967,10 @@ restart_bulk: } /* NB req now owns desc and will free it when it gets freed */ - for (i = 0; i < op_data->op_npages; i++) + for (i = 0; i < npages; i++) ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_SIZE); - mdc_readdir_pack(req, op_data->op_offset, - PAGE_SIZE * op_data->op_npages, - &op_data->op_fid1); + mdc_readdir_pack(req, offset, PAGE_SIZE * npages, fid); ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); @@ -988,11 +981,12 @@ restart_bulk: resends++; if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { - CERROR("too many resend retries, returning error\n"); + CERROR("%s: too many resend retries: rc = %d\n", + exp->exp_obd->obd_name, -EIO); return -EIO; } - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), - NULL, NULL, NULL); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, + NULL); l_wait_event(waitq, 0, &lwi); goto restart_bulk; @@ -1006,9 +1000,9 @@ restart_bulk: } if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { - CERROR("Unexpected # bytes transferred: %d (%ld expected)\n", - req->rq_bulk->bd_nob_transferred, - PAGE_SIZE * op_data->op_npages); + CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", + exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, + PAGE_SIZE * npages); ptlrpc_req_finished(req); return -EPROTO; } @@ -1017,6 +1011,453 @@ restart_bulk: return 0; } +static void mdc_release_page(struct page *page, int remove) +{ + if (remove) { + lock_page(page); + if (likely(page->mapping)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + put_page(page); +} + +static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, + __u64 *start, __u64 *end, int hash64) +{ + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + spin_lock_irq(&mapping->tree_lock); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0 && !radix_tree_exceptional_entry(page)) { + struct lu_dirpage *dp; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx], hash %#llx\n", + offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + put_page(page); + page = ERR_PTR(-EIO); + } + } else { + spin_unlock_irq(&mapping->tree_lock); + page = NULL; + } + return page; +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. + */ +#if PAGE_SIZE > LU_PAGE_SIZE +static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) +{ + int i; + + for (i = 0; i < cfs_pgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); + __u32 flags = le32_to_cpu(dp->ldp_flags); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + + while (--lu_pgs > 0) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the CFS_PAGE. */ + if (!((unsigned long)dp & ~PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = le64_to_cpu(dp->ldp_hash_end); + flags = le32_to_cpu(dp->ldp_flags); + + /* Check if lu_dirpage contains no entries. */ + if (!end_dirent) + break; + + /* + * Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. + */ + LASSERT(!le16_to_cpu(end_dirent->lde_reclen)); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } + LASSERTF(lu_pgs == 0, "left = %d", lu_pgs); +} +#else +#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0) +#endif /* PAGE_SIZE > LU_PAGE_SIZE */ + +/* parameters for readdir page */ +struct readpage_param { + struct md_op_data *rp_mod; + __u64 rp_off; + int rp_hash64; + struct obd_export *rp_exp; + struct md_callback *rp_cb; +}; + +/** + * Read pages from server. + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. + **/ +static int mdc_read_page_remote(void *data, struct page *page0) +{ + struct readpage_param *rp = data; + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + int rd_pgs = 0; /* number of pages read actually */ + int npages; + struct md_op_data *op_data = rp->rp_mod; + struct ptlrpc_request *req; + int max_pages = op_data->op_max_pages; + struct inode *inode; + struct lu_fid *fid; + int i; + int rc; + + LASSERT(max_pages > 0 && max_pages <= PTLRPC_MAX_BRW_PAGES); + inode = op_data->op_data; + fid = &op_data->op_fid1; + LASSERT(inode); + + page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS); + if (page_pool) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + + for (npages = 1; npages < max_pages; npages++) { + page = page_cache_alloc_cold(inode->i_mapping); + if (!page) + break; + page_pool[npages] = page; + } + + rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req); + if (!rc) { + int lu_pgs = req->rq_bulk->bd_nob_transferred; + + rd_pgs = (req->rq_bulk->bd_nob_transferred + + PAGE_SIZE - 1) >> PAGE_SHIFT; + lu_pgs >>= LU_PAGE_SHIFT; + LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + + CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs); + + mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs); + + SetPageUptodate(page0); + } + + unlock_page(page0); + ptlrpc_req_finished(req); + CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages); + for (i = 1; i < npages; i++) { + unsigned long offset; + __u64 hash; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= rd_pgs) { + put_page(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, rp->rp_hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (!ret) + unlock_page(page); + else + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: rc = %d\n", + offset, ret); + put_page(page); + } + + if (page_pool != &page0) + kfree(page_pool); + + return rc; +} + +/** + * Read dir page from cache first, if it can not find it, read it from + * server and add into the cache. + * + * \param[in] exp MDC export + * \param[in] op_data client MD stack parameters, transferring parameters + * between different layers on client MD stack. + * \param[in] cb_op callback required for ldlm lock enqueue during + * read page + * \param[in] hash_offset the hash offset of the page to be read + * \param[in] ppage the page to be read + * + * retval = 0 get the page successfully + * errno(<0) get the page failed + */ +static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, + struct md_callback *cb_op, __u64 hash_offset, + struct page **ppage) +{ + struct lookup_intent it = { .it_op = IT_READDIR }; + struct page *page; + struct inode *dir = op_data->op_data; + struct address_space *mapping; + struct lu_dirpage *dp; + __u64 start = 0; + __u64 end = 0; + struct lustre_handle lockh; + struct ptlrpc_request *enq_req = NULL; + struct readpage_param rp_param; + int rc; + + *ppage = NULL; + + LASSERT(dir); + mapping = dir->i_mapping; + + rc = mdc_intent_lock(exp, op_data, &it, &enq_req, + cb_op->md_blocking_ast, 0); + if (enq_req) + ptlrpc_req_finished(enq_req); + + if (rc < 0) { + CERROR("%s: "DFID" lock enqueue fails: rc = %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc); + return rc; + } + + rc = 0; + lockh.cookie = it.it_lock_handle; + mdc_set_lock_data(exp, &lockh, dir, NULL); + + rp_param.rp_off = hash_offset; + rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64; + page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end, + rp_param.rp_hash64); + if (IS_ERR(page)) { + CDEBUG(D_INFO, "%s: dir page locate: " DFID " at %llu: rc %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + rc = PTR_ERR(page); + goto out_unlock; + } else if (page) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. + */ + goto hash_collision; + } + + rp_param.rp_exp = exp; + rp_param.rp_mod = op_data; + page = read_cache_page(mapping, + hash_x_index(rp_param.rp_off, + rp_param.rp_hash64), + mdc_read_page_remote, &rp_param); + if (IS_ERR(page)) { + CERROR("%s: read cache page: "DFID" at %llu: rc %ld\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, PTR_ERR(page)); + rc = PTR_ERR(page); + goto out_unlock; + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("%s: page not updated: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + if (!PageChecked(page)) + SetPageChecked(page); + if (PageError(page)) { + CERROR("%s: page error: "DFID" at %llu: rc %d\n", + exp->exp_obd->obd_name, PFID(&op_data->op_fid1), + rp_param.rp_off, -5); + goto fail; + } + +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && rp_param.rp_hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + rp_param.rp_off = hash_offset >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + rp_param.rp_off = hash_offset; + } + if (end == start) { + LASSERT(start == rp_param.rp_off); + CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); +#if BITS_PER_LONG == 32 + CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n", + le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash_offset); +#endif + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } + *ppage = page; +out_unlock: + ldlm_lock_decref(&lockh, it.it_lock_mode); + return rc; +fail: + kunmap(page); + mdc_release_page(page, 1); + rc = -EIO; + goto out_unlock; +} + static int mdc_statfs(const struct lu_env *env, struct obd_export *exp, struct obd_statfs *osfs, __u64 max_age, __u32 flags) @@ -1401,7 +1842,7 @@ out: return rc; } -static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags) +static struct kuc_hdr *changelog_kuc_hdr(char *buf, size_t len, u32 flags) { struct kuc_hdr *lh = (struct kuc_hdr *)buf; @@ -1415,40 +1856,44 @@ static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags) return lh; } -#define D_CHANGELOG 0 - struct changelog_show { __u64 cs_startrec; - __u32 cs_flags; + enum changelog_send_flag cs_flags; struct file *cs_fp; char *cs_buf; struct obd_device *cs_obd; }; +static inline char *cs_obd_name(struct changelog_show *cs) +{ + return cs->cs_obd->obd_name; +} + static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh, struct llog_rec_hdr *hdr, void *data) { struct changelog_show *cs = data; struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr; struct kuc_hdr *lh; - int len, rc; + size_t len; + int rc; if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { rc = -EINVAL; CERROR("%s: not a changelog rec %x/%d: rc = %d\n", - cs->cs_obd->obd_name, rec->cr_hdr.lrh_type, + cs_obd_name(cs), rec->cr_hdr.lrh_type, rec->cr.cr_type, rc); return rc; } if (rec->cr.cr_index < cs->cs_startrec) { /* Skip entries earlier than what we are interested in */ - CDEBUG(D_CHANGELOG, "rec=%llu start=%llu\n", + CDEBUG(D_HSM, "rec=%llu start=%llu\n", rec->cr.cr_index, cs->cs_startrec); return 0; } - CDEBUG(D_CHANGELOG, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID + CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t=" DFID " p=" DFID " %.*s\n", rec->cr.cr_index, rec->cr.cr_type, changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, rec->cr.cr_flags & CLF_FLAGMASK, @@ -1462,20 +1907,21 @@ static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh, memcpy(lh + 1, &rec->cr, len - sizeof(*lh)); rc = libcfs_kkuc_msg_put(cs->cs_fp, lh); - CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc); + CDEBUG(D_HSM, "kucmsg fp %p len %zu rc %d\n", cs->cs_fp, len, rc); return rc; } static int mdc_changelog_send_thread(void *csdata) { + enum llog_flag flags = LLOG_F_IS_CAT; struct changelog_show *cs = csdata; struct llog_ctxt *ctxt = NULL; struct llog_handle *llh = NULL; struct kuc_hdr *kuch; int rc; - CDEBUG(D_CHANGELOG, "changelog to fp=%p start %llu\n", + CDEBUG(D_HSM, "changelog to fp=%p start %llu\n", cs->cs_fp, cs->cs_startrec); cs->cs_buf = kzalloc(KUC_CHANGELOG_MSG_MAXSIZE, GFP_NOFS); @@ -1494,10 +1940,14 @@ static int mdc_changelog_send_thread(void *csdata) LLOG_OPEN_EXISTS); if (rc) { CERROR("%s: fail to open changelog catalog: rc = %d\n", - cs->cs_obd->obd_name, rc); + cs_obd_name(cs), rc); goto out; } - rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL); + + if (cs->cs_flags & CHANGELOG_FLAG_JOBID) + flags |= LLOG_F_EXT_JOBID; + + rc = llog_init_handle(NULL, llh, flags, NULL); if (rc) { CERROR("llog_init_handle failed %d\n", rc); goto out; @@ -1550,12 +2000,12 @@ static int mdc_ioc_changelog_send(struct obd_device *obd, if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("%s: can't start changelog thread: rc = %d\n", - obd->obd_name, rc); + cs_obd_name(cs), rc); kfree(cs); } else { rc = 0; - CDEBUG(D_CHANGELOG, "%s: started changelog thread\n", - obd->obd_name); + CDEBUG(D_HSM, "%s: started changelog thread\n", + cs_obd_name(cs)); } CERROR("Failed to start changelog thread: %d\n", rc); @@ -1669,9 +2119,11 @@ static int mdc_ioc_swap_layouts(struct obd_export *exp, * with the request RPC to avoid extra RPC round trips */ count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, - LCK_CR, MDS_INODELOCK_LAYOUT); + LCK_CR, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, - LCK_CR, MDS_INODELOCK_LAYOUT); + LCK_CR, MDS_INODELOCK_LAYOUT | + MDS_INODELOCK_XATTR); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SWAP_LAYOUTS); @@ -1917,7 +2369,7 @@ static void lustre_swab_hai(struct hsm_action_item *h) static void lustre_swab_hal(struct hsm_action_list *h) { struct hsm_action_item *hai; - int i; + u32 i; __swab32s(&h->hal_version); __swab32s(&h->hal_count); @@ -1966,14 +2418,14 @@ static int mdc_ioc_hsm_ct_start(struct obd_export *exp, * @param val KUC message (kuc_hdr + hsm_action_list) * @param len total length of message */ -static int mdc_hsm_copytool_send(int len, void *val) +static int mdc_hsm_copytool_send(size_t len, void *val) { struct kuc_hdr *lh = (struct kuc_hdr *)val; struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); if (len < sizeof(*lh) + sizeof(*hal)) { - CERROR("Short HSM message %d < %d\n", len, - (int)(sizeof(*lh) + sizeof(*hal))); + CERROR("Short HSM message %zu < %zu\n", len, + sizeof(*lh) + sizeof(*hal)); return -EPROTO; } if (lh->kuc_magic == __swab16(KUC_MAGIC)) { @@ -2044,9 +2496,8 @@ static int mdc_set_info_async(const struct lu_env *env, } spin_unlock(&imp->imp_lock); - rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, - keylen, key, vallen, val, set); - return rc; + return do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); } if (KEY_IS(KEY_SPTLRPC_CONF)) { sptlrpc_conf_client_adapt(exp->exp_obd); @@ -2065,6 +2516,12 @@ static int mdc_set_info_async(const struct lu_env *env, rc = mdc_hsm_copytool_send(vallen, val); return rc; } + if (KEY_IS(KEY_DEFAULT_EASIZE)) { + u32 *default_easize = val; + + exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; + return 0; + } CERROR("Unknown key %s\n", (char *)key); return -EINVAL; @@ -2077,18 +2534,18 @@ static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, int rc = -EINVAL; if (KEY_IS(KEY_MAX_EASIZE)) { - int mdsize, *max_easize; + u32 mdsize, *max_easize; if (*vallen != sizeof(int)) return -EINVAL; - mdsize = *(int *)val; + mdsize = *(u32 *)val; if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; max_easize = val; *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; return 0; } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { - int *default_easize; + u32 *default_easize; if (*vallen != sizeof(int)) return -EINVAL; @@ -2105,7 +2562,7 @@ static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, *data = imp->imp_connect_data; return 0; } else if (KEY_IS(KEY_TGT_COUNT)) { - *((int *)val) = 1; + *((u32 *)val) = 1; return 0; } @@ -2199,13 +2656,13 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, return rc; } -int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, - struct md_op_data *op_data) +int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) { struct client_obd *cli = &exp->exp_obd->u.cli; struct lu_client_seq *seq = cli->cl_seq; - return seq_client_alloc_fid(NULL, seq, fid); + return seq_client_alloc_fid(env, seq, fid); } static struct obd_uuid *mdc_get_uuid(struct obd_export *exp) @@ -2333,8 +2790,8 @@ err_rpc_lock: * a large number of stripes is possible. If a larger reply buffer is * required it will be reallocated in the ptlrpc layer due to overflow. */ -static int mdc_init_ea_size(struct obd_export *exp, int easize, - int def_easize, int cookiesize, int def_cookiesize) +static int mdc_init_ea_size(struct obd_export *exp, u32 easize, u32 def_easize, + u32 cookiesize, u32 def_cookiesize) { struct obd_device *obd = exp->exp_obd; struct client_obd *cli = &obd->u.cli; @@ -2430,7 +2887,6 @@ static struct obd_ops mdc_obd_ops = { static struct md_ops mdc_md_ops = { .getstatus = mdc_getstatus, .null_inode = mdc_null_inode, - .find_cbdata = mdc_find_cbdata, .close = mdc_close, .create = mdc_create, .done_writing = mdc_done_writing, @@ -2439,13 +2895,12 @@ static struct md_ops mdc_md_ops = { .getattr_name = mdc_getattr_name, .intent_lock = mdc_intent_lock, .link = mdc_link, - .is_subdir = mdc_is_subdir, .rename = mdc_rename, .setattr = mdc_setattr, .setxattr = mdc_setxattr, .getxattr = mdc_getxattr, .sync = mdc_sync, - .readpage = mdc_readpage, + .read_page = mdc_read_page, .unlink = mdc_unlink, .cancel_unused = mdc_cancel_unused, .init_ea_size = mdc_init_ea_size, diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c index 9d0bd4745865..23374cae5133 100644 --- a/drivers/staging/lustre/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -549,8 +549,9 @@ static int mgc_requeue_thread(void *data) * caused the lock revocation to finish its setup, plus some * random so everyone doesn't try to reconnect at once. */ - to = MGC_TIMEOUT_MIN_SECONDS * HZ; - to += rand * HZ / 100; /* rand is centi-seconds */ + to = msecs_to_jiffies(MGC_TIMEOUT_MIN_SECONDS * MSEC_PER_SEC); + /* rand is centi-seconds */ + to += msecs_to_jiffies(rand * MSEC_PER_SEC / 100); lwi = LWI_TIMEOUT(to, NULL, NULL); l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP), &lwi); @@ -1158,7 +1159,7 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, while (datalen > 0) { int entry_len = sizeof(*entry); - int is_ost; + int is_ost, i; struct obd_device *obd; char *obdname; char *cname; @@ -1264,11 +1265,17 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, continue; } - /* TODO: iterate all nids to find one */ + /* iterate all nids to find one */ /* find uuid by nid */ - rc = client_import_find_conn(obd->u.cli.cl_import, - entry->u.nids[0], - (struct obd_uuid *)uuid); + rc = -ENOENT; + for (i = 0; i < entry->mne_nid_count; i++) { + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[0], + (struct obd_uuid *)uuid); + if (!rc) + break; + } + up_read(&obd->u.cli.cl_sem); if (rc < 0) { CERROR("mgc: cannot find uuid by nid %s\n", @@ -1428,14 +1435,12 @@ again: } mne_swab = !!ptlrpc_rep_need_swab(req); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) +#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE /* This import flag means the server did an extra swab of IR MNE * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ if (unlikely(req->rq_import->imp_need_mne_swab)) mne_swab = !mne_swab; -#else -#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" #endif for (i = 0; i < nrpages && ealen > 0; i++) { @@ -1740,8 +1745,6 @@ static struct obd_ops mgc_obd_ops = { .del_conn = client_import_del_conn, .connect = client_connect_import, .disconnect = client_disconnect_export, - /* .enqueue = mgc_enqueue, */ - /* .iocontrol = mgc_iocontrol, */ .set_info_async = mgc_set_info_async, .get_info = mgc_get_info, .import_event = mgc_import_event, diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile index df7e47f35a66..b42e109b30e0 100644 --- a/drivers/staging/lustre/lustre/obdclass/Makefile +++ b/drivers/staging/lustre/lustre/obdclass/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_LUSTRE_FS) += obdclass.o obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ genops.o uuid.o lprocfs_status.o lprocfs_counters.o \ - lustre_handles.o lustre_peer.o statfs_pack.o \ + lustre_handles.o lustre_peer.o statfs_pack.o linkea.o \ obdo.o obd_config.o obd_mount.o lu_object.o lu_ref.o \ cl_object.o cl_page.o cl_lock.o cl_io.o kernelcomm.o diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c index e72f1fc00a13..bc4b7b6b9a20 100644 --- a/drivers/staging/lustre/lustre/obdclass/cl_io.c +++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -73,7 +73,6 @@ int cl_io_is_going(const struct lu_env *env) { return cl_env_info(env)->clt_current_io != NULL; } -EXPORT_SYMBOL(cl_io_is_going); /** * cl_io invariant that holds at all times when exported cl_io_*() functions @@ -859,9 +858,6 @@ void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) LASSERT(page->cp_owner); LINVRNT(plist->pl_owner == current); - lockdep_off(); - mutex_lock(&page->cp_mutex); - lockdep_on(); LASSERT(list_empty(&page->cp_batch)); list_add_tail(&page->cp_batch, &plist->pl_pages); ++plist->pl_nr; @@ -877,12 +873,10 @@ void cl_page_list_del(const struct lu_env *env, struct cl_page_list *plist, struct cl_page *page) { LASSERT(plist->pl_nr > 0); + LASSERT(cl_page_is_vmlocked(env, page)); LINVRNT(plist->pl_owner == current); list_del_init(&page->cp_batch); - lockdep_off(); - mutex_unlock(&page->cp_mutex); - lockdep_on(); --plist->pl_nr; lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); cl_page_put(env, page); @@ -941,8 +935,6 @@ void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) } EXPORT_SYMBOL(cl_page_list_splice); -void cl_page_disown0(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg); /** * Disowns pages in a queue. @@ -959,9 +951,6 @@ void cl_page_list_disown(const struct lu_env *env, LASSERT(plist->pl_nr > 0); list_del_init(&page->cp_batch); - lockdep_off(); - mutex_unlock(&page->cp_mutex); - lockdep_on(); --plist->pl_nr; /* * cl_page_disown0 rather than usual cl_page_disown() is used, @@ -1221,7 +1210,7 @@ void cl_req_page_add(const struct lu_env *env, { struct cl_object *obj; struct cl_req_obj *rqo; - int i; + unsigned int i; LASSERT(list_empty(&page->cp_flight)); LASSERT(!page->cp_req); @@ -1268,7 +1257,7 @@ EXPORT_SYMBOL(cl_req_page_done); */ int cl_req_prep(const struct lu_env *env, struct cl_req *req) { - int i; + unsigned int i; int result; const struct cl_req_slice *slice; @@ -1301,7 +1290,7 @@ void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, { const struct cl_req_slice *slice; struct cl_page *page; - int i; + unsigned int i; LASSERT(!list_empty(&req->crq_pages)); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c index 91a5806d0239..3199dd4a3b72 100644 --- a/drivers/staging/lustre/lustre/obdclass/cl_object.c +++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -163,7 +163,7 @@ static spinlock_t *cl_object_attr_guard(struct cl_object *o) * * Prevents data-attributes from changing, until lock is released by * cl_object_attr_unlock(). This has to be called before calls to - * cl_object_attr_get(), cl_object_attr_set(). + * cl_object_attr_get(), cl_object_attr_update(). */ void cl_object_attr_lock(struct cl_object *o) __acquires(cl_object_attr_guard(o)) @@ -217,11 +217,11 @@ EXPORT_SYMBOL(cl_object_attr_get); * Updates data-attributes of an object \a obj. * * Only attributes, mentioned in a validness bit-mask \a v are - * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom - * to top. + * updated. Calls cl_object_operations::coo_attr_update() on every layer, + * bottom to top. */ -int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned v) +int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int v) { struct lu_object_header *top; int result; @@ -231,8 +231,9 @@ int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, top = obj->co_lu.lo_header; result = 0; list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_attr_set) { - result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (obj->co_ops->coo_attr_update) { + result = obj->co_ops->coo_attr_update(env, obj, attr, + v); if (result != 0) { if (result > 0) result = 0; @@ -242,7 +243,7 @@ int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, } return result; } -EXPORT_SYMBOL(cl_object_attr_set); +EXPORT_SYMBOL(cl_object_attr_update); /** * Notifies layers (bottom-to-top) that glimpse AST was received. @@ -321,6 +322,27 @@ int cl_object_prune(const struct lu_env *env, struct cl_object *obj) EXPORT_SYMBOL(cl_object_prune); /** + * Get stripe information of this object. + */ +int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, + struct lov_user_md __user *uarg) +{ + struct lu_object_header *top; + int result = 0; + + top = obj->co_lu.lo_header; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_getstripe) { + result = obj->co_ops->coo_getstripe(env, obj, uarg); + if (result) + break; + } + } + return result; +} +EXPORT_SYMBOL(cl_object_getstripe); + +/** * Helper function removing all object locks, and marking object for * deletion. All object pages must have been deleted at this point. * @@ -377,7 +399,7 @@ static void cl_env_percpu_refill(void); */ int cl_site_init(struct cl_site *s, struct cl_device *d) { - int i; + size_t i; int result; result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); @@ -411,7 +433,7 @@ static struct cache_stats cl_env_stats = { */ int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) { - int i; + size_t i; static const char *pstate[] = { [CPS_CACHED] = "c", [CPS_OWNED] = "o", @@ -1000,7 +1022,7 @@ static int cl_env_percpu_init(void) * thus we must uninitialize up to i, the rest are undefined. */ for (j = 0; j < i; j++) { - cle = &cl_env_percpu[i]; + cle = &cl_env_percpu[j]; lu_context_exit(&cle->ce_ses); lu_context_fini(&cle->ce_ses); lu_env_fini(&cle->ce_lu); @@ -1126,7 +1148,7 @@ static void *cl_key_init(const struct lu_context *ctx, info = cl0_key_init(ctx, key); if (!IS_ERR(info)) { - int i; + size_t i; for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) lu_ref_init(&info->clt_counters[i].ctc_locks_locked); @@ -1138,7 +1160,7 @@ static void cl_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void *data) { struct cl_thread_info *info; - int i; + size_t i; info = data; for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) @@ -1150,7 +1172,7 @@ static void cl_key_exit(const struct lu_context *ctx, struct lu_context_key *key, void *data) { struct cl_thread_info *info = data; - int i; + size_t i; for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { LASSERT(info->clt_counters[i].ctc_nr_held == 0); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c index db2dc6b39073..63973ba096da 100644 --- a/drivers/staging/lustre/lustre/obdclass/cl_page.c +++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -151,7 +151,6 @@ struct cl_page *cl_page_alloc(const struct lu_env *env, INIT_LIST_HEAD(&page->cp_layers); INIT_LIST_HEAD(&page->cp_batch); INIT_LIST_HEAD(&page->cp_flight); - mutex_init(&page->cp_mutex); lu_ref_init(&page->cp_reference); head = o->co_lu.lo_header; list_for_each_entry(o, &head->loh_layers, co_lu.lo_linkage) { @@ -171,7 +170,6 @@ struct cl_page *cl_page_alloc(const struct lu_env *env, } return page; } -EXPORT_SYMBOL(cl_page_alloc); /** * Returns a cl_page with index \a idx at the object \a o, and associated with @@ -229,11 +227,6 @@ EXPORT_SYMBOL(cl_page_find); static inline int cl_page_invariant(const struct cl_page *pg) { - /* - * Page invariant is protected by a VM lock. - */ - LINVRNT(cl_page_is_vmlocked(NULL, pg)); - return cl_page_in_use_noref(pg); } @@ -478,7 +471,6 @@ static void cl_page_owner_clear(struct cl_page *page) LASSERT(page->cp_owner->ci_owned_nr > 0); page->cp_owner->ci_owned_nr--; page->cp_owner = NULL; - page->cp_task = NULL; } } @@ -562,7 +554,6 @@ static int cl_page_own0(const struct lu_env *env, struct cl_io *io, PASSERT(env, pg, !pg->cp_owner); PASSERT(env, pg, !pg->cp_req); pg->cp_owner = cl_io_top(io); - pg->cp_task = current; cl_page_owner_set(pg); if (pg->cp_state != CPS_FREEING) { cl_page_state_set(env, pg, CPS_OWNED); @@ -619,7 +610,6 @@ void cl_page_assume(const struct lu_env *env, cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); PASSERT(env, pg, !pg->cp_owner); pg->cp_owner = cl_io_top(io); - pg->cp_task = current; cl_page_owner_set(pg); cl_page_state_set(env, pg, CPS_OWNED); } @@ -860,10 +850,6 @@ void cl_page_completion(const struct lu_env *env, PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); - if (crt == CRT_READ && ioret == 0) { - PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); - pg->cp_flags |= CPF_READ_COMPLETED; - } cl_page_state_set(env, pg, CPS_CACHED); if (crt >= CRT_NR) @@ -872,7 +858,6 @@ void cl_page_completion(const struct lu_env *env, (const struct lu_env *, const struct cl_page_slice *, int), ioret); if (anchor) { - LASSERT(cl_page_is_vmlocked(env, pg)); LASSERT(pg->cp_sync_io == anchor); pg->cp_sync_io = NULL; } @@ -989,10 +974,10 @@ void cl_page_header_print(const struct lu_env *env, void *cookie, lu_printer_t printer, const struct cl_page *pg) { (*printer)(env, cookie, - "page@%p[%d %p %d %d %d %p %p %#x]\n", + "page@%p[%d %p %d %d %p %p]\n", pg, atomic_read(&pg->cp_ref), pg->cp_obj, - pg->cp_state, pg->cp_error, pg->cp_type, - pg->cp_owner, pg->cp_req, pg->cp_flags); + pg->cp_state, pg->cp_type, + pg->cp_owner, pg->cp_req); } EXPORT_SYMBOL(cl_page_header_print); @@ -1020,7 +1005,6 @@ int cl_page_cancel(const struct lu_env *env, struct cl_page *page) (const struct lu_env *, const struct cl_page_slice *)); } -EXPORT_SYMBOL(cl_page_cancel); /** * Converts a byte offset within object \a obj into a page index. @@ -1046,9 +1030,9 @@ pgoff_t cl_index(const struct cl_object *obj, loff_t offset) } EXPORT_SYMBOL(cl_index); -int cl_page_size(const struct cl_object *obj) +size_t cl_page_size(const struct cl_object *obj) { - return 1 << PAGE_SHIFT; + return 1UL << PAGE_SHIFT; } EXPORT_SYMBOL(cl_page_size); @@ -1087,11 +1071,11 @@ struct cl_client_cache *cl_cache_init(unsigned long lru_page_max) /* Initialize cache data */ atomic_set(&cache->ccc_users, 1); cache->ccc_lru_max = lru_page_max; - atomic_set(&cache->ccc_lru_left, lru_page_max); + atomic_long_set(&cache->ccc_lru_left, lru_page_max); spin_lock_init(&cache->ccc_lru_lock); INIT_LIST_HEAD(&cache->ccc_lru); - atomic_set(&cache->ccc_unstable_nr, 0); + atomic_long_set(&cache->ccc_unstable_nr, 0); init_waitqueue_head(&cache->ccc_unstable_waitq); return cache; diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c index d9d2a1952b8b..76e1ee83a723 100644 --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -40,10 +40,10 @@ #include "../include/lprocfs_status.h" #include <linux/list.h> #include "../include/cl_object.h" +#include "../include/lustre/lustre_ioctl.h" #include "llog_internal.h" struct obd_device *obd_devs[MAX_OBD_DEVICES]; -EXPORT_SYMBOL(obd_devs); struct list_head obd_types; DEFINE_RWLOCK(obd_dev_lock); @@ -54,11 +54,9 @@ unsigned int obd_dump_on_timeout; EXPORT_SYMBOL(obd_dump_on_timeout); unsigned int obd_dump_on_eviction; EXPORT_SYMBOL(obd_dump_on_eviction); -unsigned int obd_max_dirty_pages = 256; +unsigned long obd_max_dirty_pages; EXPORT_SYMBOL(obd_max_dirty_pages); -atomic_t obd_unstable_pages; -EXPORT_SYMBOL(obd_unstable_pages); -atomic_t obd_dirty_pages; +atomic_long_t obd_dirty_pages; EXPORT_SYMBOL(obd_dirty_pages); unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ EXPORT_SYMBOL(obd_timeout); @@ -76,13 +74,11 @@ EXPORT_SYMBOL(at_early_margin); int at_extra = 30; EXPORT_SYMBOL(at_extra); -atomic_t obd_dirty_transit_pages; +atomic_long_t obd_dirty_transit_pages; EXPORT_SYMBOL(obd_dirty_transit_pages); char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; -EXPORT_SYMBOL(obd_jobid_var); - -char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1]; +char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; /* Get jobid of current process from stored variable or calculate * it from pid and user_id. @@ -93,14 +89,14 @@ char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1]; */ int lustre_get_jobid(char *jobid) { - memset(jobid, 0, JOBSTATS_JOBID_SIZE); + memset(jobid, 0, LUSTRE_JOBID_SIZE); /* Jobstats isn't enabled */ if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) return 0; /* Use process name + fsuid as jobid */ if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + snprintf(jobid, LUSTRE_JOBID_SIZE, "%s.%u", current_comm(), from_kuid(&init_user_ns, current_fsuid())); return 0; @@ -116,19 +112,6 @@ int lustre_get_jobid(char *jobid) } EXPORT_SYMBOL(lustre_get_jobid); -static inline void obd_data2conn(struct lustre_handle *conn, - struct obd_ioctl_data *data) -{ - memset(conn, 0, sizeof(*conn)); - conn->cookie = data->ioc_cookie; -} - -static inline void obd_conn2data(struct obd_ioctl_data *data, - struct lustre_handle *conn) -{ - data->ioc_cookie = conn->cookie; -} - static int class_resolve_dev_name(__u32 len, const char *name) { int rc; @@ -287,13 +270,6 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) goto out; } - case OBD_IOC_CLOSE_UUID: { - CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", - data->ioc_inlbuf1); - err = 0; - goto out; - } - case OBD_IOC_GETDEVICE: { int index = data->ioc_count; char *status, *str; @@ -467,15 +443,10 @@ static int obd_init_checks(void) return ret; } -extern int class_procfs_init(void); -extern int class_procfs_clean(void); - static int __init obdclass_init(void) { int i, err; - int lustre_register_fs(void); - LCONSOLE_INFO("Lustre: Build Version: " LUSTRE_VERSION_STRING "\n"); spin_lock_init(&obd_types_lock); @@ -542,23 +513,9 @@ static int __init obdclass_init(void) static void obdclass_exit(void) { - int i; - - int lustre_unregister_fs(void); - lustre_unregister_fs(); misc_deregister(&obd_psdev); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && obd->obd_set_up && - OBT(obd) && OBP(obd, detach)) { - /* XXX should this call generic detach otherwise? */ - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - OBP(obd, detach)(obd); - } - } llog_info_fini(); cl_global_fini(); lu_global_fini(); diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c index 8acf67239fa8..0bd4ad20aba7 100644 --- a/drivers/staging/lustre/lustre/obdclass/debug.c +++ b/drivers/staging/lustre/lustre/obdclass/debug.c @@ -48,10 +48,10 @@ int block_debug_setup(void *addr, int len, __u64 off, __u64 id) LASSERT(addr); put_unaligned_le64(off, addr); - put_unaligned_le64(id, addr+LPDS); + put_unaligned_le64(id, addr + LPDS); addr += len - LPDS - LPDS; put_unaligned_le64(off, addr); - put_unaligned_le64(id, addr+LPDS); + put_unaligned_le64(id, addr + LPDS); return 0; } diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c index 99c2da632b51..cf8bb2a2f40b 100644 --- a/drivers/staging/lustre/lustre/obdclass/genops.c +++ b/drivers/staging/lustre/lustre/obdclass/genops.c @@ -133,7 +133,6 @@ void class_put_type(struct obd_type *type) module_put(type->typ_dt_ops->owner); spin_unlock(&type->obd_type_lock); } -EXPORT_SYMBOL(class_put_type); #define CLASS_MAX_NAME 1024 @@ -166,10 +165,10 @@ int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, !type->typ_name) goto failed; - *(type->typ_dt_ops) = *dt_ops; + *type->typ_dt_ops = *dt_ops; /* md_ops is optional */ if (md_ops) - *(type->typ_md_ops) = *md_ops; + *type->typ_md_ops = *md_ops; strcpy(type->typ_name, name); spin_lock_init(&type->obd_type_lock); @@ -391,7 +390,6 @@ int class_name2dev(const char *name) return -1; } -EXPORT_SYMBOL(class_name2dev); struct obd_device *class_name2obd(const char *name) { @@ -421,7 +419,6 @@ int class_uuid2dev(struct obd_uuid *uuid) return -1; } -EXPORT_SYMBOL(class_uuid2dev); /** * Get obd device from ::obd_devs[] @@ -450,7 +447,6 @@ struct obd_device *class_num2obd(int num) return obd; } -EXPORT_SYMBOL(class_num2obd); /* Search for a client OBD connected to tgt_uuid. If grp_uuid is * specified, then only the client with that uuid is returned, @@ -509,7 +505,7 @@ struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) continue; if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { if (next) - *next = i+1; + *next = i + 1; read_unlock(&obd_dev_lock); return obd; } @@ -618,7 +614,7 @@ struct obd_export *class_conn2export(struct lustre_handle *conn) } CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); - export = class_handle2object(conn->cookie); + export = class_handle2object(conn->cookie, NULL); return export; } EXPORT_SYMBOL(class_conn2export); @@ -817,7 +813,6 @@ void class_unlink_export(struct obd_export *exp) spin_unlock(&exp->exp_obd->obd_dev_lock); class_export_put(exp); } -EXPORT_SYMBOL(class_unlink_export); /* Import management functions */ static void class_import_destroy(struct obd_import *imp) @@ -973,7 +968,6 @@ void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) lock, exp, lock->l_exp_refs_nr); spin_unlock(&exp->exp_locks_list_guard); } -EXPORT_SYMBOL(__class_export_add_lock_ref); void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) { @@ -991,7 +985,6 @@ void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) lock, exp, lock->l_exp_refs_nr); spin_unlock(&exp->exp_locks_list_guard); } -EXPORT_SYMBOL(__class_export_del_lock_ref); #endif /* A connection defines an export context in which preallocation can @@ -1100,7 +1093,6 @@ EXPORT_SYMBOL(class_fail_export); #if LUSTRE_TRACKS_LOCK_EXP_REFS void (*class_export_dump_hook)(struct obd_export *) = NULL; -EXPORT_SYMBOL(class_export_dump_hook); #endif /* Total amount of zombies to be destroyed */ @@ -1312,3 +1304,135 @@ void obd_zombie_impexp_stop(void) obd_zombie_impexp_notify(); wait_for_completion(&obd_zombie_stop); } + +struct obd_request_slot_waiter { + struct list_head orsw_entry; + wait_queue_head_t orsw_waitq; + bool orsw_signaled; +}; + +static bool obd_request_slot_avail(struct client_obd *cli, + struct obd_request_slot_waiter *orsw) +{ + bool avail; + + spin_lock(&cli->cl_loi_list_lock); + avail = !!list_empty(&orsw->orsw_entry); + spin_unlock(&cli->cl_loi_list_lock); + + return avail; +}; + +/* + * For network flow control, the RPC sponsor needs to acquire a credit + * before sending the RPC. The credits count for a connection is defined + * by the "cl_max_rpcs_in_flight". If all the credits are occpuied, then + * the subsequent RPC sponsors need to wait until others released their + * credits, or the administrator increased the "cl_max_rpcs_in_flight". + */ +int obd_get_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter orsw; + struct l_wait_info lwi; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) { + cli->cl_r_in_flight++; + spin_unlock(&cli->cl_loi_list_lock); + return 0; + } + + init_waitqueue_head(&orsw.orsw_waitq); + list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list); + orsw.orsw_signaled = false; + spin_unlock(&cli->cl_loi_list_lock); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(orsw.orsw_waitq, + obd_request_slot_avail(cli, &orsw) || + orsw.orsw_signaled, + &lwi); + + /* + * Here, we must take the lock to avoid the on-stack 'orsw' to be + * freed but other (such as obd_put_request_slot) is using it. + */ + spin_lock(&cli->cl_loi_list_lock); + if (rc) { + if (!orsw.orsw_signaled) { + if (list_empty(&orsw.orsw_entry)) + cli->cl_r_in_flight--; + else + list_del(&orsw.orsw_entry); + } + } + + if (orsw.orsw_signaled) { + LASSERT(list_empty(&orsw.orsw_entry)); + + rc = -EINTR; + } + spin_unlock(&cli->cl_loi_list_lock); + + return rc; +} +EXPORT_SYMBOL(obd_get_request_slot); + +void obd_put_request_slot(struct client_obd *cli) +{ + struct obd_request_slot_waiter *orsw; + + spin_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + + /* If there is free slot, wakeup the first waiter. */ + if (!list_empty(&cli->cl_loi_read_list) && + likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { + orsw = list_entry(cli->cl_loi_read_list.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_r_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); +} +EXPORT_SYMBOL(obd_put_request_slot); + +__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_max_rpcs_in_flight; +} +EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); + +int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) +{ + struct obd_request_slot_waiter *orsw; + __u32 old; + int diff; + int i; + + if (max > OBD_MAX_RIF_MAX || max < 1) + return -ERANGE; + + spin_lock(&cli->cl_loi_list_lock); + old = cli->cl_max_rpcs_in_flight; + cli->cl_max_rpcs_in_flight = max; + diff = max - old; + + /* We increase the max_rpcs_in_flight, then wakeup some waiters. */ + for (i = 0; i < diff; i++) { + if (list_empty(&cli->cl_loi_read_list)) + break; + + orsw = list_entry(cli->cl_loi_read_list.next, + struct obd_request_slot_waiter, orsw_entry); + list_del_init(&orsw->orsw_entry); + cli->cl_r_in_flight++; + wake_up(&orsw->orsw_waitq); + } + spin_unlock(&cli->cl_loi_list_lock); + + return 0; +} +EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c new file mode 100644 index 000000000000..0b1d2f0a422c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linkea.c @@ -0,0 +1,201 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, 2014, Intel Corporation. + * Use is subject to license terms. + * + * Author: Di Wang <di.wang@intel.com> + */ + +#include "../include/lustre/lustre_idl.h" +#include "../include/obd.h" +#include "../include/lustre_linkea.h" + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_SIZE); + if (!ldata->ld_buf->lb_buf) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_reccount = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + /* entries are swabbed by linkea_entry_unpack */ + } + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + if (leh->leh_reccount == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. + * Numbers are always big-endian + * \retval record length + */ +int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + tmpfid = *pfid; + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + fid_cpu_to_be(&tmpfid, &tmpfid); + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} +EXPORT_SYMBOL(linkea_entry_pack); + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + if (lname) { + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); + } +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + LASSERT(ldata->ld_leh); + + if (!lname || !pfid) + return -EINVAL; + + ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (ldata->ld_leh->leh_len + ldata->ld_reclen > + ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + ldata->ld_leh->leh_len + + ldata->ld_reclen) < 0) + return -ENOMEM; + } + + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + ldata->ld_leh->leh_len += ldata->ld_reclen; + ldata->ld_leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '" DFID ":%.*s' is added\n", + PFID(pfid), lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh && ldata->ld_lee); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); + + if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + + ldata->ld_leh->leh_len)) + ldata->ld_lee = NULL; +} +EXPORT_SYMBOL(linkea_del_buf); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh); + + /* link #0 */ + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + ldata->ld_reclen = 0; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c index 33342bfcc90e..be09e04b042f 100644 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -65,6 +65,7 @@ #include "../../include/obd_support.h" #include "../../include/obd_class.h" #include "../../include/lprocfs_status.h" +#include "../../include/lustre/lustre_ioctl.h" #include "../../include/lustre_ver.h" /* buffer MUST be at least the size of obd_ioctl_hdr */ @@ -157,7 +158,6 @@ int obd_ioctl_popdata(void __user *arg, void *data, int len) err = copy_to_user(arg, data, len) ? -EFAULT : 0; return err; } -EXPORT_SYMBOL(obd_ioctl_popdata); /* opening /dev/obd */ static int obd_class_open(struct inode *inode, struct file *file) @@ -191,7 +191,7 @@ static long obd_class_ioctl(struct file *filp, unsigned int cmd, } /* declare character device */ -static struct file_operations obd_psdev_fops = { +static const struct file_operations obd_psdev_fops = { .owner = THIS_MODULE, .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ .open = obd_class_open, /* open */ @@ -291,7 +291,7 @@ static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t count) { - if (!count || count > JOBSTATS_JOBID_SIZE) + if (!count || count > LUSTRE_JOBID_SIZE) return -EINVAL; memcpy(obd_jobid_node, buffer, count); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c index c6cc6a7666e3..41b77a30feb3 100644 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -44,7 +44,7 @@ #include <linux/fs.h> -void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid) +void obdo_refresh_inode(struct inode *dst, const struct obdo *src, u32 valid) { valid &= src->o_valid; diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c index 8f70dd2686f9..e6c785afceba 100644 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -45,6 +45,7 @@ #include "../../include/obd_support.h" #include "../../include/lprocfs_status.h" +#include "../../include/obd_class.h" struct static_lustre_uintvalue_attr { struct { @@ -95,8 +96,8 @@ LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, char *buf) { - return sprintf(buf, "%ul\n", - obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); + return sprintf(buf, "%lu\n", + obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); } static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c index 1784ca063428..43797f106745 100644 --- a/drivers/staging/lustre/lustre/obdclass/llog.c +++ b/drivers/staging/lustre/lustre/obdclass/llog.c @@ -80,7 +80,7 @@ static void llog_free_handle(struct llog_handle *loghandle) LASSERT(list_empty(&loghandle->u.phd.phd_entry)); else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) LASSERT(list_empty(&loghandle->u.chd.chd_head)); - LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + LASSERT(sizeof(*loghandle->lgh_hdr) == LLOG_CHUNK_SIZE); kfree(loghandle->lgh_hdr); out: kfree(loghandle); @@ -137,6 +137,7 @@ static int llog_read_header(const struct lu_env *env, int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, int flags, struct obd_uuid *uuid) { + enum llog_flag fmt = flags & LLOG_F_EXT_MASK; struct llog_log_hdr *llh; int rc; @@ -194,6 +195,7 @@ int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); rc = -EINVAL; } + llh->llh_flags |= fmt; out: if (rc) { kfree(llh); @@ -233,6 +235,10 @@ static int llog_process_thread(void *arg) else last_index = LLOG_BITMAP_BYTES * 8 - 1; + /* Record is not in this buffer. */ + if (index > last_index) + goto out; + while (rc == 0) { struct llog_rec_hdr *rec; @@ -262,7 +268,7 @@ repeat: */ for (rec = (struct llog_rec_hdr *)buf; (char *)rec < buf + LLOG_CHUNK_SIZE; - rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)) { + rec = llog_rec_hdr_next(rec)) { CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", rec, rec->lrh_type); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c index a82a2950295a..ce8e2f6f002a 100644 --- a/drivers/staging/lustre/lustre/obdclass/llog_cat.c +++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -63,11 +63,13 @@ static int llog_cat_id2handle(const struct lu_env *env, struct llog_logid *logid) { struct llog_handle *loghandle; + enum llog_flag fmt; int rc = 0; if (!cathandle) return -EBADF; + fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; down_write(&cathandle->lgh_lock); list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, u.phd.phd_entry) { @@ -99,7 +101,7 @@ static int llog_cat_id2handle(const struct lu_env *env, return rc; } - rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + rc = llog_init_handle(env, loghandle, fmt | LLOG_F_IS_PLAIN, NULL); if (rc < 0) { llog_close(env, loghandle); loghandle = NULL; @@ -107,7 +109,7 @@ static int llog_cat_id2handle(const struct lu_env *env, } down_write(&cathandle->lgh_lock); - list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); up_write(&cathandle->lgh_lock); loghandle->u.phd.phd_cat_handle = cathandle; @@ -123,7 +125,6 @@ out: int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) { struct llog_handle *loghandle, *n; - int rc; list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, u.phd.phd_entry) { @@ -134,8 +135,7 @@ int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) /* if handle was stored in ctxt, remove it too */ if (cathandle->lgh_ctxt->loc_handle == cathandle) cathandle->lgh_ctxt->loc_handle = NULL; - rc = llog_close(env, cathandle); - return rc; + return llog_close(env, cathandle); } EXPORT_SYMBOL(llog_cat_close); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h index f7949525d952..21a93c73756a 100644 --- a/drivers/staging/lustre/lustre/obdclass/llog_internal.h +++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -70,4 +70,9 @@ int llog_process_or_fork(const struct lu_env *env, llog_cb_t cb, void *data, void *catdata, bool fork); int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, struct llog_handle *loghandle, int index); + +static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) +{ + return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); +} #endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c index 6ace7e097859..a4277d684614 100644 --- a/drivers/staging/lustre/lustre/obdclass/llog_obd.c +++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -210,7 +210,6 @@ LU_KEY_INIT_FINI(llog, struct llog_thread_info); /* context key: llog_thread_key */ LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); LU_KEY_INIT_GENERIC(llog); -EXPORT_SYMBOL(llog_thread_key); int llog_info_init(void) { diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c index f7b9b190350c..8c4c1b3f1b45 100644 --- a/drivers/staging/lustre/lustre/obdclass/llog_swab.c +++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -172,20 +172,23 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) __swab64s(&cr->cr.cr_time); lustre_swab_lu_fid(&cr->cr.cr_tfid); lustre_swab_lu_fid(&cr->cr.cr_pfid); - if (CHANGELOG_REC_EXTENDED(&cr->cr)) { - struct llog_changelog_ext_rec *ext = - (struct llog_changelog_ext_rec *)rec; - - lustre_swab_lu_fid(&ext->cr.cr_sfid); - lustre_swab_lu_fid(&ext->cr.cr_spfid); - tail = &ext->cr_tail; - } else { - tail = &cr->cr_tail; + if (cr->cr.cr_flags & CLF_RENAME) { + struct changelog_ext_rename *rnm = + changelog_rec_rename(&cr->cr); + + lustre_swab_lu_fid(&rnm->cr_sfid); + lustre_swab_lu_fid(&rnm->cr_spfid); } - tail = (struct llog_rec_tail *)((char *)tail + + /* + * Because the tail follows a variable-length structure we need + * to compute its location at runtime + */ + tail = (struct llog_rec_tail *)((char *)&cr->cr + + changelog_rec_size(&cr->cr) + cr->cr.cr_namelen); break; } + case CHANGELOG_USER_REC: { struct llog_changelog_user_rec *cur = @@ -224,6 +227,7 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec) __swab32s(&lsr->lsr_uid_h); __swab32s(&lsr->lsr_gid); __swab32s(&lsr->lsr_gid_h); + __swab64s(&lsr->lsr_valid); tail = &lsr->lsr_tail; break; } @@ -343,7 +347,6 @@ void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) print_lustre_cfg(lcfg); } -EXPORT_SYMBOL(lustre_swab_lustre_cfg); /* used only for compatibility with old on-disk cfg_marker data */ struct cfg_marker32 { @@ -403,4 +406,3 @@ void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) __swab64s(&marker->cm_canceltime); } } -EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c index 279b625f1afe..852a5acfefab 100644 --- a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -96,6 +96,12 @@ static const char * const obd_connect_names[] = { "pingless", "flock_deadlock", "disp_stripe", + "open_by_fid", + "lfsck", + "unknown", + "unlink_close", + "unknown", + "dir_stripe", "unknown", NULL }; @@ -309,7 +315,7 @@ struct dentry *ldebugfs_add_simple(struct dentry *root, } EXPORT_SYMBOL_GPL(ldebugfs_add_simple); -static struct file_operations lprocfs_generic_fops = { }; +static const struct file_operations lprocfs_generic_fops = { }; int ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *list, @@ -615,7 +621,6 @@ void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } -EXPORT_SYMBOL(lprocfs_stats_collect); /** * Append a space separated list of current set flags to str. @@ -1043,7 +1048,6 @@ int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) } return rc; } -EXPORT_SYMBOL(lprocfs_stats_alloc_one); struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) @@ -1547,6 +1551,146 @@ void lprocfs_oh_clear(struct obd_histogram *oh) } EXPORT_SYMBOL(lprocfs_oh_clear); +int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + char kernbuf[64], *tmp, *errmsg; + unsigned long uid, gid; + int rc; + + if (count >= sizeof(kernbuf)) { + errmsg = "string too long"; + rc = -EINVAL; + goto failed_noprint; + } + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + rc = -EFAULT; + goto failed_noprint; + } + kernbuf[count] = '\0'; + + /* look for uid gid separator */ + tmp = strchr(kernbuf, ':'); + if (!tmp) { + errmsg = "needs uid:gid format"; + rc = -EINVAL; + goto failed; + } + *tmp = '\0'; + tmp++; + + /* parse uid */ + if (kstrtoul(kernbuf, 0, &uid) != 0) { + errmsg = "bad uid"; + rc = -EINVAL; + goto failed; + } + /* parse gid */ + if (kstrtoul(tmp, 0, &gid) != 0) { + errmsg = "bad gid"; + rc = -EINVAL; + goto failed; + } + + squash->rsi_uid = uid; + squash->rsi_gid = gid; + + LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", + name, squash->rsi_uid, squash->rsi_gid); + return count; + +failed: + if (tmp) { + tmp--; + *tmp = ':'; + } + CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", + name, kernbuf, errmsg, rc); + return rc; +failed_noprint: + CWARN("%s: failed to set root_squash due to %s, rc = %d\n", + name, errmsg, rc); + return rc; +} +EXPORT_SYMBOL(lprocfs_wr_root_squash); + +int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, + struct root_squash_info *squash, char *name) +{ + char *kernbuf = NULL, *errmsg; + struct list_head tmp; + int len = count; + int rc; + + if (count > 4096) { + errmsg = "string too long"; + rc = -EINVAL; + goto failed; + } + + kernbuf = kzalloc(count + 1, GFP_NOFS); + if (!kernbuf) { + errmsg = "no memory"; + rc = -ENOMEM; + goto failed; + } + + if (copy_from_user(kernbuf, buffer, count)) { + errmsg = "bad address"; + rc = -EFAULT; + goto failed; + } + kernbuf[count] = '\0'; + + if (count > 0 && kernbuf[count - 1] == '\n') + len = count - 1; + + if ((len == 4 && !strncmp(kernbuf, "NONE", len)) || + (len == 5 && !strncmp(kernbuf, "clear", len))) { + /* empty string is special case */ + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); + kfree(kernbuf); + return count; + } + + INIT_LIST_HEAD(&tmp); + if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { + errmsg = "can't parse"; + rc = -EINVAL; + goto failed; + } + LCONSOLE_INFO("%s: nosquash_nids set to %s\n", + name, kernbuf); + kfree(kernbuf); + kernbuf = NULL; + + down_write(&squash->rsi_sem); + if (!list_empty(&squash->rsi_nosquash_nids)) + cfs_free_nidlist(&squash->rsi_nosquash_nids); + list_splice(&tmp, &squash->rsi_nosquash_nids); + up_write(&squash->rsi_sem); + + return count; + +failed: + if (kernbuf) { + CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", + name, kernbuf, errmsg, rc); + kfree(kernbuf); + kernbuf = NULL; + } else { + CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", + name, errmsg, rc); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); + static ssize_t lustre_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 9b03059f34d6..054e567e6c8d 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -55,6 +55,34 @@ #include "../include/lu_ref.h" #include <linux/list.h> +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 128 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_LDISKFS_LIMIT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_ZFS_LIMIT 256 + +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +module_param(lu_cache_percent, int, 0644); +MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +module_param(lu_cache_nr, long, 0644); +MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); + static void lu_object_free(const struct lu_env *env, struct lu_object *o); static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); @@ -310,10 +338,10 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) struct cfs_hash_bd bd2; struct list_head dispose; int did_sth; - int start; + unsigned int start; int count; int bnr; - int i; + unsigned int i; if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) return 0; @@ -324,8 +352,13 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) * the dispose list, removing them from LRU and hash table. */ start = s->ls_purge_start; - bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + bnr = (nr == ~0) ? -1 : nr / (int)CFS_HASH_NBKT(s->ls_obj_hash) + 1; again: + /* + * It doesn't make any sense to make purge threads parallel, that can + * only bring troubles to us. See LU-5331. + */ + mutex_lock(&s->ls_purge_mutex); did_sth = 0; cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { if (i < start) @@ -371,6 +404,7 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) if (nr == 0) break; } + mutex_unlock(&s->ls_purge_mutex); if (nr != 0 && did_sth && start != 0) { start = 0; /* restart from the first bucket */ @@ -573,6 +607,27 @@ static struct lu_object *lu_object_find(const struct lu_env *env, return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); } +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because + * the calculation for the number of objects to reclaim is not covered by + * a lock the maximum number of objects is capped by LU_CACHE_MAX_ADJUST. + * This ensures that many concurrent threads will not accidentally purge + * the entire cache. + */ +static void lu_object_limit(const struct lu_env *env, struct lu_device *dev) +{ + __u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = cfs_hash_size_get(dev->ld_site->ls_obj_hash); + nr = (__u64)lu_cache_nr; + if (size > nr) + lu_site_purge(env, dev->ld_site, + min_t(__u64, size - nr, LU_CACHE_NR_MAX_ADJUST)); +} + static struct lu_object *lu_object_new(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, @@ -590,6 +645,9 @@ static struct lu_object *lu_object_new(const struct lu_env *env, cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + return o; } @@ -656,6 +714,9 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, if (likely(PTR_ERR(shadow) == -ENOENT)) { cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + return o; } @@ -706,13 +767,15 @@ struct lu_object *lu_object_find_slice(const struct lu_env *env, struct lu_object *obj; top = lu_object_find(env, dev, f, conf); - if (!IS_ERR(top)) { - obj = lu_object_locate(top->lo_header, dev->ld_type); - if (!obj) - lu_object_put(env, top); - } else { - obj = top; + if (IS_ERR(top)) + return top; + + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (unlikely(!obj)) { + lu_object_put(env, top); + obj = ERR_PTR(-ENOENT); } + return obj; } EXPORT_SYMBOL(lu_object_find_slice); @@ -726,34 +789,31 @@ int lu_device_type_init(struct lu_device_type *ldt) { int result = 0; + atomic_set(&ldt->ldt_device_nr, 0); INIT_LIST_HEAD(&ldt->ldt_linkage); if (ldt->ldt_ops->ldto_init) result = ldt->ldt_ops->ldto_init(ldt); - if (result == 0) + + if (!result) { + spin_lock(&obd_types_lock); list_add(&ldt->ldt_linkage, &lu_device_types); + spin_unlock(&obd_types_lock); + } + return result; } EXPORT_SYMBOL(lu_device_type_init); void lu_device_type_fini(struct lu_device_type *ldt) { + spin_lock(&obd_types_lock); list_del_init(&ldt->ldt_linkage); + spin_unlock(&obd_types_lock); if (ldt->ldt_ops->ldto_fini) ldt->ldt_ops->ldto_fini(ldt); } EXPORT_SYMBOL(lu_device_type_fini); -void lu_types_stop(void) -{ - struct lu_device_type *ldt; - - list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { - if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) - ldt->ldt_ops->ldto_stop(ldt); - } -} -EXPORT_SYMBOL(lu_types_stop); - /** * Global list of all sites on this node */ @@ -808,22 +868,14 @@ void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, } EXPORT_SYMBOL(lu_site_print); -enum { - LU_CACHE_PERCENT_MAX = 50, - LU_CACHE_PERCENT_DEFAULT = 20 -}; - -static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; -module_param(lu_cache_percent, int, 0644); -MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); - /** * Return desired hash table order. */ -static int lu_htable_order(void) +static unsigned long lu_htable_order(struct lu_device *top) { + unsigned long bits_max = LU_SITE_BITS_MAX; unsigned long cache_size; - int bits; + unsigned long bits; /* * Calculate hash table size, assuming that we want reasonable @@ -854,7 +906,7 @@ static int lu_htable_order(void) for (bits = 1; (1 << bits) < cache_size; ++bits) { ; } - return bits; + return clamp_t(typeof(bits), bits, LU_SITE_BITS_MIN, bits_max); } static unsigned lu_obj_hop_hash(struct cfs_hash *hs, @@ -930,28 +982,18 @@ static void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) /** * Initialize site \a s, with \a d as the top level device. */ -#define LU_SITE_BITS_MIN 12 -#define LU_SITE_BITS_MAX 19 -/** - * total 256 buckets, we don't want too many buckets because: - * - consume too much memory - * - avoid unbalanced LRU list - */ -#define LU_SITE_BKT_BITS 8 - int lu_site_init(struct lu_site *s, struct lu_device *top) { struct lu_site_bkt_data *bkt; struct cfs_hash_bd bd; + unsigned long bits; + unsigned long i; char name[16]; - int bits; - int i; memset(s, 0, sizeof(*s)); - bits = lu_htable_order(); - snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); - for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); - bits >= LU_SITE_BITS_MIN; bits--) { + mutex_init(&s->ls_purge_mutex); + snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name); + for (bits = lu_htable_order(top); bits >= LU_SITE_BITS_MIN; bits--) { s->ls_obj_hash = cfs_hash_create(name, bits, bits, bits - LU_SITE_BKT_BITS, sizeof(*bkt), 0, 0, @@ -959,13 +1001,14 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) CFS_HASH_SPIN_BKTLOCK | CFS_HASH_NO_ITEMREF | CFS_HASH_DEPTH | - CFS_HASH_ASSERT_EMPTY); + CFS_HASH_ASSERT_EMPTY | + CFS_HASH_COUNTER); if (s->ls_obj_hash) break; } if (!s->ls_obj_hash) { - CERROR("failed to create lu_site hash with bits: %d\n", bits); + CERROR("failed to create lu_site hash with bits: %lu\n", bits); return -ENOMEM; } @@ -1082,8 +1125,10 @@ EXPORT_SYMBOL(lu_device_put); */ int lu_device_init(struct lu_device *d, struct lu_device_type *t) { - if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start) + if (atomic_inc_return(&t->ldt_device_nr) == 1 && + t->ldt_ops->ldto_start) t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof(*d)); atomic_set(&d->ld_ref, 0); d->ld_type = t; @@ -1098,9 +1143,8 @@ EXPORT_SYMBOL(lu_device_init); */ void lu_device_fini(struct lu_device *d) { - struct lu_device_type *t; + struct lu_device_type *t = d->ld_type; - t = d->ld_type; if (d->ld_obd) { d->ld_obd->obd_lu_dev = NULL; d->ld_obd = NULL; @@ -1109,8 +1153,10 @@ void lu_device_fini(struct lu_device *d) lu_ref_fini(&d->ld_reference); LASSERTF(atomic_read(&d->ld_ref) == 0, "Refcount is %u\n", atomic_read(&d->ld_ref)); - LASSERT(t->ldt_device_nr > 0); - if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop) + LASSERT(atomic_read(&t->ldt_device_nr) > 0); + + if (atomic_dec_and_test(&t->ldt_device_nr) && + t->ldt_ops->ldto_stop) t->ldt_ops->ldto_stop(t); } EXPORT_SYMBOL(lu_device_fini); @@ -1254,7 +1300,6 @@ void lu_stack_fini(const struct lu_env *env, struct lu_device *top) } } } -EXPORT_SYMBOL(lu_stack_fini); enum { /** @@ -1281,7 +1326,7 @@ static unsigned key_set_version; int lu_context_key_register(struct lu_context_key *key) { int result; - int i; + unsigned int i; LASSERT(key->lct_init); LASSERT(key->lct_fini); @@ -1476,18 +1521,16 @@ void lu_context_key_quiesce(struct lu_context_key *key) ++key_set_version; } } -EXPORT_SYMBOL(lu_context_key_quiesce); void lu_context_key_revive(struct lu_context_key *key) { key->lct_tags &= ~LCT_QUIESCENT; ++key_set_version; } -EXPORT_SYMBOL(lu_context_key_revive); static void keys_fini(struct lu_context *ctx) { - int i; + unsigned int i; if (!ctx->lc_value) return; @@ -1501,7 +1544,7 @@ static void keys_fini(struct lu_context *ctx) static int keys_fill(struct lu_context *ctx) { - int i; + unsigned int i; LINVRNT(ctx->lc_value); for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { @@ -1614,7 +1657,7 @@ EXPORT_SYMBOL(lu_context_enter); */ void lu_context_exit(struct lu_context *ctx) { - int i; + unsigned int i; LINVRNT(ctx->lc_state == LCS_ENTERED); ctx->lc_state = LCS_LEFT; @@ -1642,7 +1685,6 @@ int lu_context_refill(struct lu_context *ctx) { return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); } -EXPORT_SYMBOL(lu_context_refill); /** * lu_ctx_tags/lu_ses_tags will be updated if there are new types of @@ -1696,7 +1738,7 @@ static void lu_site_stats_get(struct cfs_hash *hs, struct lu_site_stats *stats, int populated) { struct cfs_hash_bd bd; - int i; + unsigned int i; cfs_hash_for_each_bucket(hs, &bd, i) { struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); @@ -1940,3 +1982,73 @@ void lu_kmem_fini(struct lu_kmem_descr *caches) } } EXPORT_SYMBOL(lu_kmem_fini); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + kvfree(buf->lb_buf); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, size_t size) +{ + LASSERT(buf); + LASSERT(!buf->lb_buf); + LASSERT(!buf->lb_len); + buf->lb_buf = libcfs_kvzalloc(size, GFP_NOFS); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, size_t size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, size_t len) +{ + if (!buf->lb_buf && !buf->lb_len) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && buf->lb_buf) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. + * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, size_t len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + ptr = libcfs_kvzalloc(len, GFP_NOFS); + if (!ptr) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + kvfree(buf->lb_buf); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c index 082f530c527c..c9445e5ec271 100644 --- a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c +++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -130,7 +130,7 @@ void class_handle_unhash(struct portals_handle *h) } EXPORT_SYMBOL(class_handle_unhash); -void *class_handle2object(__u64 cookie) +void *class_handle2object(__u64 cookie, const void *owner) { struct handle_bucket *bucket; struct portals_handle *h; @@ -145,7 +145,7 @@ void *class_handle2object(__u64 cookie) rcu_read_lock(); list_for_each_entry_rcu(h, &bucket->head, h_link) { - if (h->h_cookie != cookie) + if (h->h_cookie != cookie || h->h_owner != owner) continue; spin_lock(&h->h_lock); @@ -164,8 +164,11 @@ EXPORT_SYMBOL(class_handle2object); void class_handle_free_cb(struct rcu_head *rcu) { - struct portals_handle *h = RCU2HANDLE(rcu); - void *ptr = (void *)(unsigned long)h->h_cookie; + struct portals_handle *h; + void *ptr; + + h = container_of(rcu, struct portals_handle, h_rcu); + ptr = (void *)(unsigned long)h->h_cookie; if (h->h_ops->hop_free) h->h_ops->hop_free(ptr, h->h_size); @@ -214,7 +217,7 @@ static int cleanup_all_handles(void) struct portals_handle *h; spin_lock(&handle_hash[i].lock); - list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + list_for_each_entry_rcu(h, &handle_hash[i].head, h_link) { CERROR("force clean handle %#llx addr %p ops %p\n", h->h_cookie, h, h->h_ops); diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c index 5974a9bf77c0..ffa740aa861c 100644 --- a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c +++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -139,7 +139,6 @@ int class_add_uuid(const char *uuid, __u64 nid) } return 0; } -EXPORT_SYMBOL(class_add_uuid); /* Delete the nids for one uuid if specified, otherwise delete all */ int class_del_uuid(const char *uuid) diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c index 0eab1236501b..bbed1b72d52e 100644 --- a/drivers/staging/lustre/lustre/obdclass/obd_config.c +++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -37,6 +37,7 @@ #define DEBUG_SUBSYSTEM S_CLASS #include "../include/obd_class.h" #include <linux/string.h> +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_log.h" #include "../include/lprocfs_status.h" #include "../include/lustre_param.h" @@ -237,7 +238,7 @@ static int class_attach(struct lustre_cfg *lcfg) /* recovery data */ init_waitqueue_head(&obd->obd_evict_inprogress_waitq); - llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + llog_group_init(&obd->obd_olg); obd->obd_conn_inprogress = 0; @@ -250,15 +251,6 @@ static int class_attach(struct lustre_cfg *lcfg) } memcpy(obd->obd_uuid.uuid, uuid, len); - /* do the attach */ - if (OBP(obd, attach)) { - rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg); - if (rc) { - rc = -EINVAL; - goto out; - } - } - /* Detach drops this */ spin_lock(&obd->obd_dev_lock); atomic_set(&obd->obd_refcount, 1); @@ -422,17 +414,12 @@ static int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) } /* Leave this on forever */ obd->obd_stopping = 1; - - /* wait for already-arrived-connections to finish. */ - while (obd->obd_conn_inprogress > 0) { - spin_unlock(&obd->obd_dev_lock); - - cond_resched(); - - spin_lock(&obd->obd_dev_lock); - } spin_unlock(&obd->obd_dev_lock); + while (obd->obd_conn_inprogress > 0) + yield(); + smp_rmb(); + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) switch (*flag) { @@ -526,11 +513,6 @@ void class_decref(struct obd_device *obd, const char *scope, const void *source) CERROR("Cleanup %s returned %d\n", obd->obd_name, err); } - if (OBP(obd, detach)) { - err = OBP(obd, detach)(obd); - if (err) - CERROR("Detach returned %d\n", err); - } class_release_dev(obd); } } @@ -756,7 +738,7 @@ static int process_param2_config(struct lustre_cfg *lcfg) } start = ktime_get(); - rc = call_usermodehelper(argv[0], argv, NULL, 1); + rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); end = ktime_get(); if (rc < 0) { @@ -1026,7 +1008,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, oldfs = get_fs(); set_fs(KERNEL_DS); - rc = (var->fops->write)(&fakefile, sval, + rc = var->fops->write(&fakefile, sval, vallen, NULL); set_fs(oldfs); } @@ -1060,8 +1042,6 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, } EXPORT_SYMBOL(class_process_proc_param); -extern int lustre_check_exclusion(struct super_block *sb, char *svname); - /** Parse a configuration llog, doing various manipulations on them * for various reasons, (modifications for compatibility, skip obsolete * records, change uuids, etc), then class_process_config() resulting @@ -1317,33 +1297,33 @@ static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, if (rc < 0) return rc; - ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + ptr += snprintf(ptr, end - ptr, "cmd=%05x ", lcfg->lcfg_command); if (lcfg->lcfg_flags) - ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + ptr += snprintf(ptr, end - ptr, "flags=%#08x ", lcfg->lcfg_flags); if (lcfg->lcfg_num) - ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + ptr += snprintf(ptr, end - ptr, "num=%#08x ", lcfg->lcfg_num); if (lcfg->lcfg_nid) { char nidstr[LNET_NIDSTR_SIZE]; libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); - ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ", + ptr += snprintf(ptr, end - ptr, "nid=%s(%#llx)\n ", nidstr, lcfg->lcfg_nid); } if (lcfg->lcfg_command == LCFG_MARKER) { struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); - ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + ptr += snprintf(ptr, end - ptr, "marker=%d(%#x)%s '%s'", marker->cm_step, marker->cm_flags, marker->cm_tgtname, marker->cm_comment); } else { int i; for (i = 0; i < lcfg->lcfg_bufcount; i++) { - ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + ptr += snprintf(ptr, end - ptr, "%d:%s ", i, lustre_cfg_string(lcfg, i)); } } diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c index aa84a50e9904..0d3a3b05a637 100644 --- a/drivers/staging/lustre/lustre/obdclass/obd_mount.c +++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -37,11 +37,11 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define D_MOUNT (D_SUPER | D_CONFIG/*|D_WARNING */) #define PRINT_CMD CDEBUG #include "../include/obd.h" -#include "../include/linux/lustre_compat25.h" +#include "../include/lustre_compat.h" #include "../include/obd_class.h" #include "../include/lustre/lustre_user.h" #include "../include/lustre_log.h" @@ -68,7 +68,7 @@ static void (*kill_super_cb)(struct super_block *sb); * this log, and is added to the mgc's list of logs to follow. */ int lustre_process_log(struct super_block *sb, char *logname, - struct config_llog_instance *cfg) + struct config_llog_instance *cfg) { struct lustre_cfg *lcfg; struct lustre_cfg_bufs *bufs; @@ -384,17 +384,15 @@ int lustre_start_mgc(struct super_block *sb) OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | OBD_CONNECT_LVB_TYPE; -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) +#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; -#else -#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" #endif if (lmd_is_client(lsi->lsi_lmd) && lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + rc = obd_connect(NULL, &exp, obd, &obd->obd_uuid, data, NULL); if (rc) { CERROR("connect failed %d\n", rc); goto out; @@ -670,7 +668,6 @@ int lustre_common_put_super(struct super_block *sb) } /* Drop a ref to the mounted disk */ lustre_put_lsi(sb); - lu_types_stop(); return rc; } EXPORT_SYMBOL(lustre_common_put_super); @@ -731,7 +728,7 @@ int lustre_check_exclusion(struct super_block *sb, char *svname) static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) { const char *s1 = ptr, *s2; - __u32 index, *exclude_list; + __u32 index = 0, *exclude_list; int rc = 0, devmax; /* The shortest an ost name can be is 8 chars: -OST0000. @@ -758,7 +755,7 @@ static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) exclude_list[lmd->lmd_exclude_count++] = index; else CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", - (uint)(s2-s1), s1, rc); + (uint)(s2 - s1), s1, rc); s1 = s2; /* now we are pointing at ':' (next exclude) * or ',' (end of excludes) @@ -880,7 +877,7 @@ static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) */ static int lmd_parse(char *options, struct lustre_mount_data *lmd) { - char *s1, *s2, *devname = NULL; + char *s1, *s2, *s3, *devname = NULL; struct lustre_mount_data *raw = (struct lustre_mount_data *)options; int rc = 0; @@ -913,6 +910,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) /* Skip whitespace and extra commas */ while (*s1 == ' ' || *s1 == ',') s1++; + s3 = s1; /* Client options are parsed in ll_options: eg. flock, * user_xattr, acl @@ -970,6 +968,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) rc = lmd_parse_mgssec(lmd, s1 + 7); if (rc) goto invalid; + s3 = s2; clear++; /* ost exclusion list */ } else if (strncmp(s1, "exclude=", 8) == 0) { @@ -990,10 +989,19 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) size_t length, params_length; char *tail = strchr(s1 + 6, ','); - if (!tail) + if (!tail) { length = strlen(s1); - else - length = tail - s1; + } else { + lnet_nid_t nid; + char *param_str = tail + 1; + int supplementary = 1; + + while (!class_parse_nid_quiet(param_str, &nid, + ¶m_str)) { + supplementary = 0; + } + length = param_str - s1 - supplementary; + } length -= 6; params_length = strlen(lmd->lmd_params); if (params_length + length + 1 >= LMD_PARAMS_MAXLEN) @@ -1001,6 +1009,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) strncat(lmd->lmd_params, s1 + 6, length); lmd->lmd_params[params_length + length] = '\0'; strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); + s3 = s1 + 6 + length; clear++; } else if (strncmp(s1, "osd=", 4) == 0) { rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); @@ -1097,7 +1106,7 @@ static int lustre_fill_super(struct super_block *sb, void *data, int silent) struct lustre_sb_info *lsi; int rc; - CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + CDEBUG(D_MOUNT | D_VFSTRACE, "VFS Op: sb %p\n", sb); lsi = lustre_init_lsi(sb); if (!lsi) @@ -1133,7 +1142,7 @@ static int lustre_fill_super(struct super_block *sb, void *data, int silent) } else { rc = lustre_start_mgc(sb); if (rc) { - lustre_put_lsi(sb); + lustre_common_put_super(sb); goto out; } /* Connect and start */ diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c index 8583a4a8c206..79104a66da96 100644 --- a/drivers/staging/lustre/lustre/obdclass/obdo.c +++ b/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -112,7 +112,7 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid) } EXPORT_SYMBOL(obdo_from_inode); -void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) { ioobj->ioo_oid = oa->o_oi; if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) @@ -125,7 +125,8 @@ void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) } EXPORT_SYMBOL(obdo_to_ioobj); -static void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid) +static void iattr_from_obdo(struct iattr *attr, const struct obdo *oa, + u32 valid) { valid &= oa->o_valid; @@ -152,12 +153,14 @@ static void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid) } #if 0 /* you shouldn't be able to change a file's type with setattr */ if (valid & OBD_MD_FLTYPE) { - attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_mode = (attr->ia_mode & ~S_IFMT) | + (oa->o_mode & S_IFMT); attr->ia_valid |= ATTR_MODE; } #endif if (valid & OBD_MD_FLMODE) { - attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_mode = (attr->ia_mode & S_IFMT) | + (oa->o_mode & ~S_IFMT); attr->ia_valid |= ATTR_MODE; if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) && !capable(CFS_CAP_FSETID)) @@ -173,7 +176,7 @@ static void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid) } } -void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid) +void md_from_obdo(struct md_op_data *op_data, const struct obdo *oa, u32 valid) { iattr_from_obdo(&op_data->op_attr, oa, valid); if (valid & OBD_MD_FLBLOCKS) { diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index 5b29c4a44fe5..505582ff4d1e 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -41,6 +41,7 @@ #include "../include/cl_object.h" #include "../include/lustre_fid.h" #include "../include/lustre_acl.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_net.h" #include "echo_internal.h" @@ -64,14 +65,14 @@ struct echo_object { struct echo_device *eo_dev; struct list_head eo_obj_chain; - struct lov_stripe_md *eo_lsm; + struct lov_oinfo *eo_oinfo; atomic_t eo_npages; int eo_deleted; }; struct echo_object_conf { struct cl_object_conf eoc_cl; - struct lov_stripe_md **eoc_md; + struct lov_oinfo **eoc_oinfo; }; struct echo_page { @@ -152,9 +153,6 @@ struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) } /** @} echo_helpers */ - -static struct echo_object *cl_echo_object_find(struct echo_device *d, - struct lov_stripe_md **lsm); static int cl_echo_object_put(struct echo_object *eco); static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, struct page **pages, int npages, int async); @@ -413,10 +411,13 @@ static int echo_object_init(const struct lu_env *env, struct lu_object *obj, cconf = lu2cl_conf(conf); econf = cl2echo_conf(cconf); - LASSERT(econf->eoc_md); - eco->eo_lsm = *econf->eoc_md; - /* clear the lsm pointer so that it won't get freed. */ - *econf->eoc_md = NULL; + LASSERT(econf->eoc_oinfo); + /* + * Transfer the oinfo pointer to eco that it won't be + * freed. + */ + eco->eo_oinfo = *econf->eoc_oinfo; + *econf->eoc_oinfo = NULL; eco->eo_dev = ed; atomic_set(&eco->eo_npages, 0); @@ -429,52 +430,6 @@ static int echo_object_init(const struct lu_env *env, struct lu_object *obj, return 0; } -/* taken from osc_unpackmd() */ -static int echo_alloc_memmd(struct echo_device *ed, - struct lov_stripe_md **lsmp) -{ - int lsm_size; - - /* If export is lov/osc then use their obd method */ - if (ed->ed_next) - return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp); - /* OFD has no unpackmd method, do everything here */ - lsm_size = lov_stripe_md_size(1); - - LASSERT(!*lsmp); - *lsmp = kzalloc(lsm_size, GFP_NOFS); - if (!*lsmp) - return -ENOMEM; - - (*lsmp)->lsm_oinfo[0] = kzalloc(sizeof(struct lov_oinfo), GFP_NOFS); - if (!(*lsmp)->lsm_oinfo[0]) { - kfree(*lsmp); - return -ENOMEM; - } - - loi_init((*lsmp)->lsm_oinfo[0]); - (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; - ostid_set_seq_echo(&(*lsmp)->lsm_oi); - - return lsm_size; -} - -static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp) -{ - int lsm_size; - - /* If export is lov/osc then use their obd method */ - if (ed->ed_next) - return obd_free_memmd(ed->ed_ec->ec_exp, lsmp); - /* OFD has no unpackmd method, do everything here */ - lsm_size = lov_stripe_md_size(1); - - kfree((*lsmp)->lsm_oinfo[0]); - kfree(*lsmp); - *lsmp = NULL; - return 0; -} - static void echo_object_free(const struct lu_env *env, struct lu_object *obj) { struct echo_object *eco = cl2echo_obj(lu2cl(obj)); @@ -489,8 +444,7 @@ static void echo_object_free(const struct lu_env *env, struct lu_object *obj) lu_object_fini(obj); lu_object_header_fini(obj->lo_header); - if (eco->eo_lsm) - echo_free_memmd(eco->eo_dev, &eco->eo_lsm); + kfree(eco->eo_oinfo); kmem_cache_free(echo_object_kmem, eco); } @@ -864,25 +818,21 @@ static struct lu_device_type echo_device_type = { */ /* Interfaces to echo client obd device */ -static struct echo_object *cl_echo_object_find(struct echo_device *d, - struct lov_stripe_md **lsmp) +static struct echo_object * +cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) { struct lu_env *env; struct echo_thread_info *info; struct echo_object_conf *conf; - struct lov_stripe_md *lsm; + struct lov_oinfo *oinfo = NULL; struct echo_object *eco; struct cl_object *obj; struct lu_fid *fid; int refcheck; int rc; - LASSERT(lsmp); - lsm = *lsmp; - LASSERT(lsm); - LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi)); - LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n", - POSTID(&lsm->lsm_oi)); + LASSERTF(ostid_id(oi), DOSTID "\n", POSTID(oi)); + LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID "\n", POSTID(oi)); /* Never return an object if the obd is to be freed. */ if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) @@ -895,16 +845,24 @@ static struct echo_object *cl_echo_object_find(struct echo_device *d, info = echo_env_info(env); conf = &info->eti_conf; if (d->ed_next) { - struct lov_oinfo *oinfo = lsm->lsm_oinfo[0]; + oinfo = kzalloc(sizeof(*oinfo), GFP_NOFS); + if (!oinfo) { + eco = ERR_PTR(-ENOMEM); + goto out; + } - LASSERT(oinfo); - oinfo->loi_oi = lsm->lsm_oi; + oinfo->loi_oi = *oi; conf->eoc_cl.u.coc_oinfo = oinfo; } - conf->eoc_md = lsmp; + + /* + * If echo_object_init() is successful then ownership of oinfo + * is transferred to the object. + */ + conf->eoc_oinfo = &oinfo; fid = &info->eti_fid; - rc = ostid_to_fid(fid, &lsm->lsm_oi, 0); + rc = ostid_to_fid(fid, (struct ost_id *)oi, 0); if (rc != 0) { eco = ERR_PTR(rc); goto out; @@ -927,6 +885,7 @@ static struct echo_object *cl_echo_object_find(struct echo_device *d, } out: + kfree(oinfo); cl_env_put(env, &refcheck); return eco; } @@ -1051,7 +1010,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, struct cl_io *io; struct cl_page *clp; struct lustre_handle lh = { 0 }; - int page_size = cl_page_size(obj); + size_t page_size = cl_page_size(obj); int refcheck; int rc; int i; @@ -1145,7 +1104,6 @@ static int echo_create_object(const struct lu_env *env, struct echo_device *ed, { struct echo_object *eco; struct echo_client_obd *ec = ed->ed_ec; - struct lov_stripe_md *lsm = NULL; int rc; int created = 0; @@ -1156,30 +1114,19 @@ static int echo_create_object(const struct lu_env *env, struct echo_device *ed, return -EINVAL; } - rc = echo_alloc_memmd(ed, &lsm); - if (rc < 0) { - CERROR("Cannot allocate md: rc = %d\n", rc); - goto failed; - } - - /* setup object ID here */ - lsm->lsm_oi = oa->o_oi; + if (!ostid_id(&oa->o_oi)) + ostid_set_id(&oa->o_oi, ++last_object_id); - if (ostid_id(&lsm->lsm_oi) == 0) - ostid_set_id(&lsm->lsm_oi, ++last_object_id); - - rc = obd_create(env, ec->ec_exp, oa, &lsm, oti); + rc = obd_create(env, ec->ec_exp, oa, oti); if (rc != 0) { CERROR("Cannot create objects: rc = %d\n", rc); goto failed; } created = 1; - /* See what object ID we were given */ - oa->o_oi = lsm->lsm_oi; oa->o_valid |= OBD_MD_FLID; - eco = cl_echo_object_find(ed, &lsm); + eco = cl_echo_object_find(ed, &oa->o_oi); if (IS_ERR(eco)) { rc = PTR_ERR(eco); goto failed; @@ -1190,9 +1137,7 @@ static int echo_create_object(const struct lu_env *env, struct echo_device *ed, failed: if (created && rc) - obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL); - if (lsm) - echo_free_memmd(ed, &lsm); + obd_destroy(env, ec->ec_exp, oa, oti); if (rc) CERROR("create object failed with: rc = %d\n", rc); return rc; @@ -1201,32 +1146,21 @@ static int echo_create_object(const struct lu_env *env, struct echo_device *ed, static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, struct obdo *oa) { - struct lov_stripe_md *lsm = NULL; struct echo_object *eco; int rc; - if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) { - /* disallow use of object id 0 */ - CERROR("No valid oid\n"); + if (!(oa->o_valid & OBD_MD_FLID) || !(oa->o_valid & OBD_MD_FLGROUP) || + !ostid_id(&oa->o_oi)) { + CERROR("invalid oid " DOSTID "\n", POSTID(&oa->o_oi)); return -EINVAL; } - rc = echo_alloc_memmd(ed, &lsm); - if (rc < 0) - return rc; - - lsm->lsm_oi = oa->o_oi; - if (!(oa->o_valid & OBD_MD_FLGROUP)) - ostid_set_seq_echo(&lsm->lsm_oi); - rc = 0; - eco = cl_echo_object_find(ed, &lsm); + eco = cl_echo_object_find(ed, &oa->o_oi); if (!IS_ERR(eco)) *ecop = eco; else rc = PTR_ERR(eco); - if (lsm) - echo_free_memmd(ed, &lsm); return rc; } @@ -1436,13 +1370,12 @@ static int echo_client_prep_commit(const struct lu_env *env, npages = tot_pages; for (i = 0; i < npages; i++, off += PAGE_SIZE) { - rnb[i].offset = off; - rnb[i].len = PAGE_SIZE; - rnb[i].flags = brw_flags; + rnb[i].rnb_offset = off; + rnb[i].rnb_len = PAGE_SIZE; + rnb[i].rnb_flags = brw_flags; } ioo.ioo_bufcnt = npages; - oti->oti_transno = 0; lpages = npages; ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages, @@ -1452,14 +1385,14 @@ static int echo_client_prep_commit(const struct lu_env *env, LASSERT(lpages == npages); for (i = 0; i < lpages; i++) { - struct page *page = lnb[i].page; + struct page *page = lnb[i].lnb_page; /* read past eof? */ - if (!page && lnb[i].rc == 0) + if (!page && lnb[i].lnb_rc == 0) continue; if (async) - lnb[i].flags |= OBD_BRW_ASYNC; + lnb[i].lnb_flags |= OBD_BRW_ASYNC; if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || (oa->o_valid & OBD_MD_FLFLAGS) == 0 || @@ -1469,13 +1402,13 @@ static int echo_client_prep_commit(const struct lu_env *env, if (rw == OBD_BRW_WRITE) echo_client_page_debug_setup(page, rw, ostid_id(&oa->o_oi), - rnb[i].offset, - rnb[i].len); + rnb[i].rnb_offset, + rnb[i].rnb_len); else echo_client_page_debug_check(page, ostid_id(&oa->o_oi), - rnb[i].offset, - rnb[i].len); + rnb[i].rnb_offset, + rnb[i].rnb_len); } ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, @@ -1613,8 +1546,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = echo_get_object(&eco, ed, oa); if (rc == 0) { - rc = obd_destroy(env, ec->ec_exp, oa, NULL, - &dummy_oti, NULL); + rc = obd_destroy(env, ec->ec_exp, oa, &dummy_oti); if (rc == 0) eco->eo_deleted = 1; echo_put_object(eco); diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h index f5034a253f6d..966414fd5424 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_internal.h +++ b/drivers/staging/lustre/lustre/obdecho/echo_internal.h @@ -33,9 +33,9 @@ /* The persistent object (i.e. actually stores stuff!) */ #define ECHO_PERSISTENT_OBJID 1ULL -#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) +#define ECHO_PERSISTENT_SIZE ((__u64)(1 << 20)) /* block size to use for data verification */ -#define OBD_ECHO_BLOCK_SIZE (4<<10) +#define OBD_ECHO_BLOCK_SIZE (4 << 10) #endif diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c index 7e83d395b998..f0062d44ee03 100644 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -119,6 +119,7 @@ static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, spin_lock(&cli->cl_loi_list_lock); cli->cl_max_rpcs_in_flight = val; + client_adjust_max_dirty(cli); spin_unlock(&cli->cl_loi_list_lock); return count; @@ -136,10 +137,10 @@ static ssize_t max_dirty_mb_show(struct kobject *kobj, int mult; spin_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max; + val = cli->cl_dirty_max_pages; spin_unlock(&cli->cl_loi_list_lock); - mult = 1 << 20; + mult = 1 << (20 - PAGE_SHIFT); return lprocfs_read_frac_helper(buf, PAGE_SIZE, val, mult); } @@ -166,7 +167,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, return -ERANGE; spin_lock(&cli->cl_loi_list_lock); - cli->cl_dirty_max = (u32)(pages_number << PAGE_SHIFT); + cli->cl_dirty_max_pages = pages_number; osc_wake_cache_waiters(cli); spin_unlock(&cli->cl_loi_list_lock); @@ -181,11 +182,11 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v) int shift = 20 - PAGE_SHIFT; seq_printf(m, - "used_mb: %d\n" - "busy_cnt: %d\n", - (atomic_read(&cli->cl_lru_in_list) + - atomic_read(&cli->cl_lru_busy)) >> shift, - atomic_read(&cli->cl_lru_busy)); + "used_mb: %ld\n" + "busy_cnt: %ld\n", + (atomic_long_read(&cli->cl_lru_in_list) + + atomic_long_read(&cli->cl_lru_busy)) >> shift, + atomic_long_read(&cli->cl_lru_busy)); return 0; } @@ -197,8 +198,10 @@ static ssize_t osc_cached_mb_seq_write(struct file *file, { struct obd_device *dev = ((struct seq_file *)file->private_data)->private; struct client_obd *cli = &dev->u.cli; - int pages_number, mult, rc; + long pages_number, rc; char kernbuf[128]; + int mult; + u64 val; if (count >= sizeof(kernbuf)) return -EINVAL; @@ -210,14 +213,18 @@ static ssize_t osc_cached_mb_seq_write(struct file *file, mult = 1 << (20 - PAGE_SHIFT); buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - kernbuf; - rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); if (rc) return rc; + if (val > LONG_MAX) + return -ERANGE; + pages_number = (long)val; + if (pages_number < 0) return -ERANGE; - rc = atomic_read(&cli->cl_lru_in_list) - pages_number; + rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; if (rc > 0) { struct lu_env *env; int refcheck; @@ -244,7 +251,7 @@ static ssize_t cur_dirty_bytes_show(struct kobject *kobj, int len; spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_dirty); + len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); spin_unlock(&cli->cl_loi_list_lock); return len; @@ -583,6 +590,7 @@ static ssize_t max_pages_per_rpc_store(struct kobject *kobj, } spin_lock(&cli->cl_loi_list_lock); cli->cl_max_pages_per_rpc = val; + client_adjust_max_dirty(cli); spin_unlock(&cli->cl_loi_list_lock); return count; @@ -596,13 +604,14 @@ static ssize_t unstable_stats_show(struct kobject *kobj, struct obd_device *dev = container_of(kobj, struct obd_device, obd_kobj); struct client_obd *cli = &dev->u.cli; - int pages, mb; + long pages; + int mb; - pages = atomic_read(&cli->cl_unstable_count); + pages = atomic_long_read(&cli->cl_unstable_count); mb = (pages * PAGE_SIZE) >> 20; - return sprintf(buf, "unstable_pages: %8d\n" - "unstable_mb: %8d\n", pages, mb); + return sprintf(buf, "unstable_pages: %20ld\n" + "unstable_mb: %10d\n", pages, mb); } LUSTRE_RO_ATTR(unstable_stats); diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c index d011135802d5..4bbe219add98 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -44,7 +44,7 @@ static int extent_debug; /* set it to be true for more debug */ static void osc_update_pending(struct osc_object *obj, int cmd, int delta); static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, - int state); + enum osc_extent_state state); static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int sent, int rc); static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, @@ -177,7 +177,7 @@ static int osc_extent_sanity_check0(struct osc_extent *ext, { struct osc_object *obj = ext->oe_obj; struct osc_async_page *oap; - int page_count; + size_t page_count; int rc = 0; if (!osc_object_is_locked(obj)) { @@ -632,7 +632,7 @@ static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) */ static struct osc_extent *osc_extent_find(const struct lu_env *env, struct osc_object *obj, pgoff_t index, - int *grants) + unsigned int *grants) { struct client_obd *cli = osc_cli(obj); struct osc_lock *olck; @@ -643,10 +643,10 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, struct osc_extent *found = NULL; pgoff_t chunk; pgoff_t max_end; - int max_pages; /* max_pages_per_rpc */ - int chunksize; + unsigned int max_pages; /* max_pages_per_rpc */ + unsigned int chunksize; int ppc_bits; /* pages per chunk bits */ - int chunk_mask; + pgoff_t chunk_mask; int rc; cur = osc_extent_alloc(obj); @@ -700,8 +700,8 @@ restart: if (!ext) ext = first_extent(obj); while (ext) { - loff_t ext_chk_start = ext->oe_start >> ppc_bits; - loff_t ext_chk_end = ext->oe_end >> ppc_bits; + pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; + pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; LASSERT(sanity_check_nolock(ext) == 0); if (chunk > ext_chk_end + 1) @@ -913,7 +913,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, return 0; } -static int extent_wait_cb(struct osc_extent *ext, int state) +static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state) { int ret; @@ -928,7 +928,7 @@ static int extent_wait_cb(struct osc_extent *ext, int state) * Wait for the extent's state to become @state. */ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, - int state) + enum osc_extent_state state) { struct osc_object *obj = ext->oe_obj; struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, @@ -958,8 +958,8 @@ static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); if (rc == -ETIMEDOUT) { OSC_EXTENT_DUMP(D_ERROR, ext, - "%s: wait ext to %d timedout, recovery in progress?\n", - osc_export(obj)->exp_obd->obd_name, state); + "%s: wait ext to %u timedout, recovery in progress?\n", + osc_export(obj)->exp_obd->obd_name, state); lwi = LWI_INTR(NULL, NULL); rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), @@ -1099,7 +1099,7 @@ static int osc_extent_make_ready(const struct lu_env *env, struct osc_async_page *oap; struct osc_async_page *last = NULL; struct osc_object *obj = ext->oe_obj; - int page_count = 0; + unsigned int page_count = 0; int rc; /* we're going to grab page lock, so object lock must not be taken. */ @@ -1140,9 +1140,11 @@ static int osc_extent_make_ready(const struct lu_env *env, * the size of file. */ if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { - last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); - LASSERT(last->oap_count > 0); - LASSERT(last->oap_page_off + last->oap_count <= PAGE_SIZE); + int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + + LASSERT(last_oap_count > 0); + LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE); + last->oap_count = last_oap_count; spin_lock(&last->oap_lock); last->oap_async_flags |= ASYNC_COUNT_STABLE; spin_unlock(&last->oap_lock); @@ -1174,7 +1176,8 @@ static int osc_extent_make_ready(const struct lu_env *env, * called to expand the extent for the same IO. To expand the extent, the * page index must be in the same or next chunk of ext->oe_end. */ -static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, + unsigned int *grants) { struct osc_object *obj = ext->oe_obj; struct client_obd *cli = osc_cli(obj); @@ -1183,7 +1186,7 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) pgoff_t chunk = index >> ppc_bits; pgoff_t end_chunk; pgoff_t end_index; - int chunksize = 1 << cli->cl_chunkbits; + unsigned int chunksize = 1 << cli->cl_chunkbits; int rc = 0; LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); @@ -1361,7 +1364,7 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, if (rc == 0 && srvlock) { struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; - int bytes = oap->oap_count; + size_t bytes = oap->oap_count; if (crt == CRT_READ) stats->os_lockless_reads += bytes; @@ -1383,18 +1386,16 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, #define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do { \ struct client_obd *__tmp = (cli); \ - CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %d/%d " \ - "unstable_pages: %d/%d dropped: %ld avail: %ld, " \ - "reserved: %ld, flight: %d } lru {in list: %d, " \ - "left: %d, waiters: %d }" fmt, \ + CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, reserved: %ld, flight: %d }" \ + "lru {in list: %ld, left: %ld, waiters: %d }" fmt "\n", \ __tmp->cl_import->imp_obd->obd_name, \ - __tmp->cl_dirty, __tmp->cl_dirty_max, \ - atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \ - atomic_read(&obd_unstable_pages), obd_max_dirty_pages, \ + __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ + atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ - atomic_read(&__tmp->cl_lru_in_list), \ - atomic_read(&__tmp->cl_lru_busy), \ + atomic_long_read(&__tmp->cl_lru_in_list), \ + atomic_long_read(&__tmp->cl_lru_busy), \ atomic_read(&__tmp->cl_lru_shrinkers), ##args); \ } while (0) @@ -1404,8 +1405,8 @@ static void osc_consume_write_grant(struct client_obd *cli, { assert_spin_locked(&cli->cl_loi_list_lock); LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); - atomic_inc(&obd_dirty_pages); - cli->cl_dirty += PAGE_SIZE; + atomic_long_inc(&obd_dirty_pages); + cli->cl_dirty_pages++; pga->flag |= OBD_BRW_FROM_GRANT; CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", PAGE_SIZE, pga, pga->pg); @@ -1424,12 +1425,12 @@ static void osc_release_write_grant(struct client_obd *cli, } pga->flag &= ~OBD_BRW_FROM_GRANT; - atomic_dec(&obd_dirty_pages); - cli->cl_dirty -= PAGE_SIZE; + atomic_long_dec(&obd_dirty_pages); + cli->cl_dirty_pages--; if (pga->flag & OBD_BRW_NOCACHE) { pga->flag &= ~OBD_BRW_NOCACHE; - atomic_dec(&obd_dirty_transit_pages); - cli->cl_dirty_transit -= PAGE_SIZE; + atomic_long_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit--; } } @@ -1494,11 +1495,11 @@ static void osc_unreserve_grant(struct client_obd *cli, static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, unsigned int lost_grant) { - int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + unsigned long grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; spin_lock(&cli->cl_loi_list_lock); - atomic_sub(nr_pages, &obd_dirty_pages); - cli->cl_dirty -= nr_pages << PAGE_SHIFT; + atomic_long_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty_pages -= nr_pages; cli->cl_lost_grant += lost_grant; if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { /* borrow some grant from truncate to avoid the case that @@ -1511,7 +1512,7 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, spin_unlock(&cli->cl_loi_list_lock); CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", lost_grant, cli->cl_lost_grant, - cli->cl_avail_grant, cli->cl_dirty); + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT); } /** @@ -1535,19 +1536,18 @@ static int osc_enter_cache_try(struct client_obd *cli, { int rc; - OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes); + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); rc = osc_reserve_grant(cli, bytes); if (rc < 0) return 0; - if (cli->cl_dirty + PAGE_SIZE <= cli->cl_dirty_max && - atomic_read(&obd_unstable_pages) + 1 + - atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) { + if (cli->cl_dirty_pages <= cli->cl_dirty_max_pages && + atomic_long_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { osc_consume_write_grant(cli, &oap->oap_brw_page); if (transient) { - cli->cl_dirty_transit += PAGE_SIZE; - atomic_inc(&obd_dirty_transit_pages); + cli->cl_dirty_transit++; + atomic_long_inc(&obd_dirty_transit_pages); oap->oap_brw_flags |= OBD_BRW_NOCACHE; } rc = 1; @@ -1581,11 +1581,13 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc = oap->oap_obj; struct lov_oinfo *loi = osc->oo_oinfo; struct osc_cache_waiter ocw; - struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, - LWI_ON_SIGNAL_NOOP, NULL); + struct l_wait_info lwi; int rc = -EDQUOT; - OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes); + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(AT_OFF ? obd_timeout : at_max), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes); spin_lock(&cli->cl_loi_list_lock); @@ -1593,14 +1595,16 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, * of queued writes and create a discontiguous rpc stream */ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || - cli->cl_dirty_max < PAGE_SIZE || - cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { + !cli->cl_dirty_max_pages || cli->cl_ar.ar_force_sync || + loi->loi_ar.ar_force_sync) { + OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n"); rc = -EDQUOT; goto out; } /* Hopefully normal case - cache space and write credits available */ if (osc_enter_cache_try(cli, oap, bytes, 0)) { + OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n"); rc = 0; goto out; } @@ -1615,7 +1619,7 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, init_waitqueue_head(&ocw.ocw_waitq); ocw.ocw_oap = oap; ocw.ocw_grant = bytes; - while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { + while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) { list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); ocw.ocw_rc = 0; spin_unlock(&cli->cl_loi_list_lock); @@ -1629,32 +1633,49 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, spin_lock(&cli->cl_loi_list_lock); - /* l_wait_event is interrupted by signal, or timed out */ if (rc < 0) { - if (rc == -ETIMEDOUT) { - OSC_DUMP_GRANT(D_ERROR, cli, - "try to reserve %d.\n", bytes); - osc_extent_tree_dump(D_ERROR, osc); - rc = -EDQUOT; - } - + /* l_wait_event is interrupted by signal, or timed out */ list_del_init(&ocw.ocw_entry); - goto out; + break; } - LASSERT(list_empty(&ocw.ocw_entry)); rc = ocw.ocw_rc; if (rc != -EDQUOT) - goto out; + break; if (osc_enter_cache_try(cli, oap, bytes, 0)) { rc = 0; - goto out; + break; } } + + switch (rc) { + case 0: + OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n"); + break; + case -ETIMEDOUT: + OSC_DUMP_GRANT(D_CACHE, cli, + "timeout, fall back to sync i/o\n"); + osc_extent_tree_dump(D_CACHE, osc); + /* fall back to synchronous I/O */ + rc = -EDQUOT; + break; + case -EINTR: + /* Ensures restartability - LU-3581 */ + OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n"); + rc = -ERESTARTSYS; + break; + case -EDQUOT: + OSC_DUMP_GRANT(D_CACHE, cli, + "no grant space, fall back to sync i/o\n"); + break; + default: + CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived due to %d, fall back to sync i/o\n", + cli->cl_import->imp_obd->obd_name, &ocw, rc); + break; + } out: spin_unlock(&cli->cl_loi_list_lock); - OSC_DUMP_GRANT(D_CACHE, cli, "returned %d.\n", rc); return rc; } @@ -1670,19 +1691,17 @@ void osc_wake_cache_waiters(struct client_obd *cli) ocw->ocw_rc = -EDQUOT; /* we can't dirty more */ - if ((cli->cl_dirty + PAGE_SIZE > cli->cl_dirty_max) || - (atomic_read(&obd_unstable_pages) + 1 + - atomic_read(&obd_dirty_pages) > obd_max_dirty_pages)) { - CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n", - cli->cl_dirty, - cli->cl_dirty_max, obd_max_dirty_pages); + if ((cli->cl_dirty_pages > cli->cl_dirty_max_pages) || + (atomic_long_read(&obd_dirty_pages) + 1 > + obd_max_dirty_pages)) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %ld\n", + cli->cl_dirty_pages, cli->cl_dirty_max_pages, + obd_max_dirty_pages); goto wakeup; } - ocw->ocw_rc = 0; - if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) - ocw->ocw_rc = -EDQUOT; - + if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = 0; wakeup: CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); @@ -1843,97 +1862,6 @@ static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, ar->ar_force_sync = 0; } -/** - * Performs "unstable" page accounting. This function balances the - * increment operations performed in osc_inc_unstable_pages. It is - * registered as the RPC request callback, and is executed when the - * bulk RPC is committed on the server. Thus at this point, the pages - * involved in the bulk transfer are no longer considered unstable. - */ -void osc_dec_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - int page_count = desc->bd_iov_count; - int i; - - /* No unstable page tracking */ - if (!cli->cl_cache) - return; - - LASSERT(page_count >= 0); - - for (i = 0; i < page_count; i++) - dec_node_page_state(desc->bd_iov[i].kiov_page, - NR_UNSTABLE_NFS); - - atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); - LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); - - atomic_sub(page_count, &cli->cl_unstable_count); - LASSERT(atomic_read(&cli->cl_unstable_count) >= 0); - - atomic_sub(page_count, &obd_unstable_pages); - LASSERT(atomic_read(&obd_unstable_pages) >= 0); - - spin_lock(&req->rq_lock); - req->rq_committed = 1; - req->rq_unstable = 0; - spin_unlock(&req->rq_lock); - - wake_up_all(&cli->cl_cache->ccc_unstable_waitq); -} - -/* "unstable" page accounting. See: osc_dec_unstable_pages. */ -void osc_inc_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - long page_count = desc->bd_iov_count; - int i; - - /* No unstable page tracking */ - if (!cli->cl_cache) - return; - - LASSERT(page_count >= 0); - - for (i = 0; i < page_count; i++) - inc_node_page_state(desc->bd_iov[i].kiov_page, - NR_UNSTABLE_NFS); - - LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); - atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); - - LASSERT(atomic_read(&cli->cl_unstable_count) >= 0); - atomic_add(page_count, &cli->cl_unstable_count); - - LASSERT(atomic_read(&obd_unstable_pages) >= 0); - atomic_add(page_count, &obd_unstable_pages); - - spin_lock(&req->rq_lock); - - /* - * If the request has already been committed (i.e. brw_commit - * called via rq_commit_cb), we need to undo the unstable page - * increments we just performed because rq_commit_cb wont be - * called again. Otherwise, just set the commit callback so the - * unstable page accounting is properly updated when the request - * is committed - */ - if (req->rq_committed) { - /* Drop lock before calling osc_dec_unstable_pages */ - spin_unlock(&req->rq_lock); - osc_dec_unstable_pages(req); - spin_lock(&req->rq_lock); - } else { - req->rq_unstable = 1; - req->rq_commit_cb = osc_dec_unstable_pages; - } - - spin_unlock(&req->rq_lock); -} - /* this must be called holding the loi list lock to give coverage to exit_cache, * async_flag maintenance, and oap_request */ @@ -1945,9 +1873,6 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, __u64 xid = 0; if (oap->oap_request) { - if (!rc) - osc_inc_unstable_pages(oap->oap_request); - xid = ptlrpc_req_xid(oap->oap_request); ptlrpc_req_finished(oap->oap_request); oap->oap_request = NULL; @@ -1979,7 +1904,7 @@ static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, */ static int try_to_add_extent_for_io(struct client_obd *cli, struct osc_extent *ext, struct list_head *rpclist, - int *pc, unsigned int *max_pages) + unsigned int *pc, unsigned int *max_pages) { struct osc_extent *tmp; struct osc_async_page *oap = list_first_entry(&ext->oe_pages, @@ -2032,12 +1957,13 @@ static int try_to_add_extent_for_io(struct client_obd *cli, * 5. Traverse the extent tree from the 1st extent; * 6. Above steps exit if there is no space in this RPC. */ -static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) +static unsigned int get_write_extents(struct osc_object *obj, + struct list_head *rpclist) { struct client_obd *cli = osc_cli(obj); struct osc_extent *ext; struct osc_extent *temp; - int page_count = 0; + unsigned int page_count = 0; unsigned int max_pages = cli->cl_max_pages_per_rpc; LASSERT(osc_object_is_locked(obj)); @@ -2175,7 +2101,7 @@ osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, struct osc_extent *ext; struct osc_extent *next; LIST_HEAD(rpclist); - int page_count = 0; + unsigned int page_count = 0; unsigned int max_pages = cli->cl_max_pages_per_rpc; int rc = 0; @@ -2390,7 +2316,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, struct client_obd *cli = oap->oap_cli; struct osc_object *osc = oap->oap_obj; pgoff_t index; - int grants = 0; + unsigned int grants = 0, tmp; int brw_flags = OBD_BRW_ASYNC; int cmd = OBD_BRW_WRITE; int need_release = 0; @@ -2434,9 +2360,6 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, return rc; } - if (osc_over_unstable_soft_limit(cli)) - brw_flags |= OBD_BRW_SOFT_SYNC; - oap->oap_cmd = cmd; oap->oap_page_off = ops->ops_from; oap->oap_count = ops->ops_to - ops->ops_from; @@ -2476,7 +2399,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, grants = 0; need_release = 1; } else if (ext->oe_end < index) { - int tmp = grants; + tmp = grants; /* try to expand this extent */ rc = osc_extent_expand(ext, index, &tmp); if (rc < 0) { @@ -2501,7 +2424,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, } if (!ext) { - int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; /* try to find new extent to cover this page */ LASSERT(!oio->oi_active); @@ -2645,7 +2568,7 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, goto out; spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + oap->oap_async_flags |= ASYNC_READY | ASYNC_URGENT; spin_unlock(&oap->oap_lock); if (memory_pressure_get()) diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h index c8c3f1ca77be..9c8de15c309c 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -64,7 +64,7 @@ struct osc_io { /** true if this io is lockless. */ unsigned int oi_lockless; /** how many LRU pages are reserved for this IO */ - int oi_lru_reserved; + unsigned long oi_lru_reserved; /** active extents, we know how many bytes is going to be written, * so having an active extent will prevent it from being fragmented @@ -389,7 +389,7 @@ extern struct lu_device_type osc_device_type; extern struct lu_context_key osc_key; extern struct lu_context_key osc_session_key; -#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) +#define OSC_FLAGS (ASYNC_URGENT | ASYNC_READY) int osc_lock_init(const struct lu_env *env, struct cl_object *obj, struct cl_lock *lock, @@ -608,7 +608,7 @@ struct osc_extent { /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ struct list_head oe_link; /** state of this extent */ - unsigned int oe_state; + enum osc_extent_state oe_state; /** flags for this extent. */ unsigned int oe_intree:1, /** 0 is write, 1 is read */ diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h index 7a27f0961955..67fe0a254991 100644 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ b/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -71,7 +71,6 @@ struct osc_async_page { struct client_obd *oap_cli; struct osc_object *oap_obj; - struct ldlm_lock *oap_ldlm_lock; spinlock_t oap_lock; }; @@ -134,9 +133,9 @@ int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, struct list_head *ext_list, int cmd); -int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - int target, bool force); -int osc_lru_reclaim(struct client_obd *cli); +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force); +long osc_lru_reclaim(struct client_obd *cli); unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); @@ -198,7 +197,7 @@ int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); void osc_inc_unstable_pages(struct ptlrpc_request *req); void osc_dec_unstable_pages(struct ptlrpc_request *req); -int osc_over_unstable_soft_limit(struct client_obd *cli); +bool osc_over_unstable_soft_limit(struct client_obd *cli); struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, struct osc_object *obj, pgoff_t index, diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c index 6e3dcd38913f..8a559cbcdd0c 100644 --- a/drivers/staging/lustre/lustre/osc/osc_io.c +++ b/drivers/staging/lustre/lustre/osc/osc_io.c @@ -109,11 +109,11 @@ static int osc_io_submit(const struct lu_env *env, struct cl_page_list *qin = &queue->c2_qin; struct cl_page_list *qout = &queue->c2_qout; - int queued = 0; + unsigned int queued = 0; int result = 0; int cmd; int brw_flags; - int max_pages; + unsigned int max_pages; LASSERT(qin->pl_nr > 0); @@ -163,14 +163,19 @@ static int osc_io_submit(const struct lu_env *env, continue; } - cl_page_list_move(qout, qin, page); spin_lock(&oap->oap_lock); - oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags = ASYNC_URGENT | ASYNC_READY; oap->oap_async_flags |= ASYNC_COUNT_STABLE; spin_unlock(&oap->oap_lock); osc_page_submit(env, opg, crt, brw_flags); list_add_tail(&oap->oap_pending_item, &list); + + if (page->cp_sync_io) + cl_page_list_move(qout, qin, page); + else /* async IO */ + cl_page_list_del(env, qin, page); + if (++queued == max_pages) { queued = 0; result = osc_queue_sync_pages(env, osc, &list, cmd, @@ -195,7 +200,7 @@ static int osc_io_submit(const struct lu_env *env, * Expand stripe KMS if necessary. */ static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, unsigned to) + struct cl_object *obj, pgoff_t idx, size_t to) { struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; struct cl_attr *attr = &osc_env_info(env)->oti_attr; @@ -228,7 +233,7 @@ static void osc_page_touch_at(const struct lu_env *env, attr->cat_size = kms; valid |= CAT_SIZE; } - cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } @@ -314,8 +319,8 @@ static int osc_io_rw_iter_init(const struct lu_env *env, struct osc_object *osc = cl2osc(ios->cis_obj); struct client_obd *cli = osc_cli(osc); unsigned long c; - unsigned int npages; - unsigned int max_pages; + unsigned long npages; + unsigned long max_pages; if (cl_io_is_append(io)) return 0; @@ -328,15 +333,15 @@ static int osc_io_rw_iter_init(const struct lu_env *env, if (npages > max_pages) npages = max_pages; - c = atomic_read(cli->cl_lru_left); + c = atomic_long_read(cli->cl_lru_left); if (c < npages && osc_lru_reclaim(cli) > 0) - c = atomic_read(cli->cl_lru_left); + c = atomic_long_read(cli->cl_lru_left); while (c >= npages) { - if (c == atomic_cmpxchg(cli->cl_lru_left, c, c - npages)) { + if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { oio->oi_lru_reserved = npages; break; } - c = atomic_read(cli->cl_lru_left); + c = atomic_long_read(cli->cl_lru_left); } return 0; @@ -350,7 +355,7 @@ static void osc_io_rw_iter_fini(const struct lu_env *env, struct client_obd *cli = osc_cli(osc); if (oio->oi_lru_reserved > 0) { - atomic_add(oio->oi_lru_reserved, cli->cl_lru_left); + atomic_long_add(oio->oi_lru_reserved, cli->cl_lru_left); oio->oi_lru_reserved = 0; } oio->oi_write_osclock = NULL; @@ -364,7 +369,7 @@ static int osc_io_fault_start(const struct lu_env *env, io = ios->cis_io; fio = &io->u.ci_fault; - CDEBUG(D_INFO, "%lu %d %d\n", + CDEBUG(D_INFO, "%lu %d %zu\n", fio->ft_index, fio->ft_writable, fio->ft_nob); /* * If mapping is writeable, adjust kms to cover this page, @@ -471,18 +476,21 @@ static int osc_io_setattr_start(const struct lu_env *env, attr->cat_ctime = lvb->lvb_ctime; cl_valid |= CAT_CTIME; } - result = cl_object_attr_set(env, obj, attr, cl_valid); + result = cl_object_attr_update(env, obj, attr, + cl_valid); } cl_object_attr_unlock(obj); } memset(oa, 0, sizeof(*oa)); if (result == 0) { oa->o_oi = loi->loi_oi; + obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); + oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; oa->o_mtime = attr->cat_mtime; oa->o_atime = attr->cat_atime; oa->o_ctime = attr->cat_ctime; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | - OBD_MD_FLCTIME | OBD_MD_FLMTIME; + oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME; if (ia_valid & ATTR_SIZE) { oa->o_size = size; oa->o_blocks = OBD_OBJECT_EOF; @@ -559,7 +567,7 @@ static int osc_io_read_start(const struct lu_env *env, if (!slice->cis_io->ci_noatime) { cl_object_attr_lock(obj); attr->cat_atime = ktime_get_real_seconds(); - rc = cl_object_attr_set(env, obj, attr, CAT_ATIME); + rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); cl_object_attr_unlock(obj); } return rc; @@ -576,7 +584,7 @@ static int osc_io_write_start(const struct lu_env *env, cl_object_attr_lock(obj); attr->cat_ctime = ktime_get_real_seconds(); attr->cat_mtime = attr->cat_ctime; - rc = cl_object_attr_set(env, obj, attr, CAT_MTIME | CAT_CTIME); + rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); cl_object_attr_unlock(obj); return rc; diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c index 717d3ffb6789..39a8a5851603 100644 --- a/drivers/staging/lustre/lustre/osc/osc_lock.c +++ b/drivers/staging/lustre/lustre/osc/osc_lock.c @@ -222,7 +222,7 @@ static void osc_lock_lvb_update(const struct lu_env *env, ldlm_lock_allow_match_locked(dlmlock); } - cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } @@ -467,7 +467,7 @@ static int osc_dlm_blocking_ast0(const struct lu_env *env, */ attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); - cl_object_attr_set(env, obj, attr, CAT_KMS); + cl_object_attr_update(env, obj, attr, CAT_KMS); cl_object_attr_unlock(obj); unlock_res_and_lock(dlmlock); diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c index d211d1905e83..aae3a2d4243f 100644 --- a/drivers/staging/lustre/lustre/osc/osc_object.c +++ b/drivers/staging/lustre/lustre/osc/osc_object.c @@ -159,8 +159,8 @@ static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, return 0; } -static int osc_attr_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned valid) +static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned int valid) { struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; struct ost_lvb *lvb = &oinfo->loi_lvb; @@ -195,7 +195,6 @@ static int osc_object_glimpse(const struct lu_env *env, static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) { - LASSERT(lock->l_granted_mode == lock->l_req_mode); if (lock->l_ast_data == data) lock->l_ast_data = NULL; return LDLM_ITER_CONTINUE; @@ -262,7 +261,7 @@ static const struct cl_object_operations osc_ops = { .coo_lock_init = osc_lock_init, .coo_io_init = osc_io_init, .coo_attr_get = osc_attr_get, - .coo_attr_set = osc_attr_set, + .coo_attr_update = osc_attr_update, .coo_glimpse = osc_object_glimpse, .coo_prune = osc_object_prune }; diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c index 355f496a2093..2a7a70aa9e80 100644 --- a/drivers/staging/lustre/lustre/osc/osc_page.c +++ b/drivers/staging/lustre/lustre/osc/osc_page.c @@ -323,32 +323,6 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj, return result; } -int osc_over_unstable_soft_limit(struct client_obd *cli) -{ - long obd_upages, obd_dpages, osc_upages; - - /* Can't check cli->cl_unstable_count, therefore, no soft limit */ - if (!cli) - return 0; - - obd_upages = atomic_read(&obd_unstable_pages); - obd_dpages = atomic_read(&obd_dirty_pages); - - osc_upages = atomic_read(&cli->cl_unstable_count); - - /* - * obd_max_dirty_pages is the max number of (dirty + unstable) - * pages allowed at any given time. To simulate an unstable page - * only limit, we subtract the current number of dirty pages - * from this max. This difference is roughly the amount of pages - * currently available for unstable pages. Thus, the soft limit - * is half of that difference. Check osc_upages to ensure we don't - * set SOFT_SYNC for OSCs without any outstanding unstable pages. - */ - return osc_upages && - obd_upages >= (obd_max_dirty_pages - obd_dpages) / 2; -} - /** * Helper function called by osc_io_submit() for every page in an immediate * transfer (i.e., transferred synchronously). @@ -368,9 +342,6 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg, oap->oap_count = opg->ops_to - opg->ops_from; oap->oap_brw_flags = brw_flags | OBD_BRW_SYNC; - if (osc_over_unstable_soft_limit(oap->oap_cli)) - oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; - if (capable(CFS_CAP_SYS_RESOURCE)) { oap->oap_brw_flags |= OBD_BRW_NOQUOTA; oap->oap_cmd |= OBD_BRW_NOQUOTA; @@ -409,7 +380,7 @@ static const int lru_shrink_max = 8 << (20 - PAGE_SHIFT); /* 8M */ static int osc_cache_too_much(struct client_obd *cli) { struct cl_client_cache *cache = cli->cl_cache; - int pages = atomic_read(&cli->cl_lru_in_list); + long pages = atomic_long_read(&cli->cl_lru_in_list); unsigned long budget; budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2); @@ -417,7 +388,7 @@ static int osc_cache_too_much(struct client_obd *cli) /* if it's going to run out LRU slots, we should free some, but not * too much to maintain fairness among OSCs. */ - if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { + if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { if (pages >= budget) return lru_shrink_max; else if (pages >= budget / 2) @@ -444,7 +415,7 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) { LIST_HEAD(lru); struct osc_async_page *oap; - int npages = 0; + long npages = 0; list_for_each_entry(oap, plist, oap_pending_item) { struct osc_page *opg = oap2osc_page(oap); @@ -460,8 +431,8 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) if (npages > 0) { spin_lock(&cli->cl_lru_list_lock); list_splice_tail(&lru, &cli->cl_lru_list); - atomic_sub(npages, &cli->cl_lru_busy); - atomic_add(npages, &cli->cl_lru_in_list); + atomic_long_sub(npages, &cli->cl_lru_busy); + atomic_long_add(npages, &cli->cl_lru_in_list); spin_unlock(&cli->cl_lru_list_lock); /* XXX: May set force to be true for better performance */ @@ -472,9 +443,9 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg) { - LASSERT(atomic_read(&cli->cl_lru_in_list) > 0); + LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0); list_del_init(&opg->ops_lru); - atomic_dec(&cli->cl_lru_in_list); + atomic_long_dec(&cli->cl_lru_in_list); } /** @@ -488,12 +459,12 @@ static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) if (!list_empty(&opg->ops_lru)) { __osc_lru_del(cli, opg); } else { - LASSERT(atomic_read(&cli->cl_lru_busy) > 0); - atomic_dec(&cli->cl_lru_busy); + LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); + atomic_long_dec(&cli->cl_lru_busy); } spin_unlock(&cli->cl_lru_list_lock); - atomic_inc(cli->cl_lru_left); + atomic_long_inc(cli->cl_lru_left); /* this is a great place to release more LRU pages if * this osc occupies too many LRU pages and kernel is * stealing one of them. @@ -518,7 +489,7 @@ static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) spin_lock(&cli->cl_lru_list_lock); __osc_lru_del(cli, opg); spin_unlock(&cli->cl_lru_list_lock); - atomic_inc(&cli->cl_lru_busy); + atomic_long_inc(&cli->cl_lru_busy); } } @@ -540,10 +511,32 @@ static void discard_pagevec(const struct lu_env *env, struct cl_io *io, } /** + * Check if a cl_page can be released, i.e, it's not being used. + * + * If unstable account is turned on, bulk transfer may hold one refcount + * for recovery so we need to check vmpage refcount as well; otherwise, + * even we can destroy cl_page but the corresponding vmpage can't be reused. + */ +static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) +{ + if (cl_page_in_use_noref(page)) + return true; + + if (cli->cl_cache->ccc_unstable_check) { + struct page *vmpage = cl_page_vmpage(page); + + /* vmpage have two known users: cl_page and VM page cache */ + if (page_count(vmpage) - page_mapcount(vmpage) > 2) + return true; + } + return false; +} + +/** * Drop @target of pages from LRU at most. */ -int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - int target, bool force) +long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, + long target, bool force) { struct cl_io *io; struct cl_object *clobj = NULL; @@ -551,12 +544,12 @@ int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, struct osc_page *opg; struct osc_page *temp; int maxscan = 0; - int count = 0; + long count = 0; int index = 0; int rc = 0; - LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0); - if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0) + LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); + if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) return 0; if (!force) { @@ -575,7 +568,7 @@ int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, io = &osc_env_info(env)->oti_io; spin_lock(&cli->cl_lru_list_lock); - maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list)); + maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); list_for_each_entry_safe(opg, temp, &cli->cl_lru_list, ops_lru) { struct cl_page *page; bool will_free = false; @@ -584,7 +577,7 @@ int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, break; page = opg->ops_cl.cpl_page; - if (cl_page_in_use_noref(page)) { + if (lru_page_busy(cli, page)) { list_move_tail(&opg->ops_lru, &cli->cl_lru_list); continue; } @@ -620,7 +613,7 @@ int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, } if (cl_page_own_try(env, io, page) == 0) { - if (!cl_page_in_use_noref(page)) { + if (!lru_page_busy(cli, page)) { /* remove it from lru list earlier to avoid * lock contention */ @@ -663,24 +656,19 @@ int osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, atomic_dec(&cli->cl_lru_shrinkers); if (count > 0) { - atomic_add(count, cli->cl_lru_left); + atomic_long_add(count, cli->cl_lru_left); wake_up_all(&osc_lru_waitq); } return count > 0 ? count : rc; } -static inline int max_to_shrink(struct client_obd *cli) -{ - return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max); -} - -int osc_lru_reclaim(struct client_obd *cli) +long osc_lru_reclaim(struct client_obd *cli) { struct cl_env_nest nest; struct lu_env *env; struct cl_client_cache *cache = cli->cl_cache; int max_scans; - int rc = 0; + long rc = 0; LASSERT(cache); @@ -693,15 +681,15 @@ int osc_lru_reclaim(struct client_obd *cli) if (rc == -EBUSY) rc = 0; - CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n", + CDEBUG(D_CACHE, "%s: Free %ld pages from own LRU: %p.\n", cli->cl_import->imp_obd->obd_name, rc, cli); goto out; } - CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n", + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld, busy: %ld.\n", cli->cl_import->imp_obd->obd_name, cli, - atomic_read(&cli->cl_lru_in_list), - atomic_read(&cli->cl_lru_busy)); + atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy)); /* Reclaim LRU slots from other client_obd as it can't free enough * from its own. This should rarely happen. @@ -717,10 +705,10 @@ int osc_lru_reclaim(struct client_obd *cli) cli = list_entry(cache->ccc_lru.next, struct client_obd, cl_lru_osc); - CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n", + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", cli->cl_import->imp_obd->obd_name, cli, - atomic_read(&cli->cl_lru_in_list), - atomic_read(&cli->cl_lru_busy)); + atomic_long_read(&cli->cl_lru_in_list), + atomic_long_read(&cli->cl_lru_busy)); list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); if (osc_cache_too_much(cli) > 0) { @@ -737,11 +725,18 @@ int osc_lru_reclaim(struct client_obd *cli) out: cl_env_nested_put(&nest, env); - CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n", + CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", cli->cl_import->imp_obd->obd_name, cli, rc); return rc; } +/** + * osc_lru_reserve() is called to reserve an LRU slot for a cl_page. + * + * Usually the LRU slots are reserved in osc_io_iter_rw_init(). + * Only in the case that the LRU slots are in extreme shortage, it should + * have reserved enough slots for an IO. + */ static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, struct osc_page *opg) { @@ -758,8 +753,8 @@ static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, goto out; } - LASSERT(atomic_read(cli->cl_lru_left) >= 0); - while (!atomic_add_unless(cli->cl_lru_left, -1, 0)) { + LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); + while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { /* run out of LRU spaces, try to drop some by itself */ rc = osc_lru_reclaim(cli); if (rc < 0) @@ -770,7 +765,7 @@ static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, cond_resched(); rc = l_wait_event(osc_lru_waitq, - atomic_read(cli->cl_lru_left) > 0, + atomic_long_read(cli->cl_lru_left) > 0, &lwi); if (rc < 0) @@ -779,7 +774,7 @@ static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, out: if (rc >= 0) { - atomic_inc(&cli->cl_lru_busy); + atomic_long_inc(&cli->cl_lru_busy); opg->ops_in_lru = 1; rc = 0; } @@ -787,4 +782,151 @@ out: return rc; } +/** + * Atomic operations are expensive. We accumulate the accounting for the + * same page pgdat to get better performance. + * In practice this can work pretty good because the pages in the same RPC + * are likely from the same page zone. + */ +static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, + int factor) +{ + int page_count = desc->bd_iov_count; + pg_data_t *last = NULL; + int count = 0; + int i; + + for (i = 0; i < page_count; i++) { + pg_data_t *pgdat = page_pgdat(desc->bd_iov[i].bv_page); + + if (likely(pgdat == last)) { + ++count; + continue; + } + + if (count > 0) { + mod_node_page_state(pgdat, NR_UNSTABLE_NFS, + factor * count); + count = 0; + } + last = pgdat; + ++count; + } + if (count > 0) + mod_node_page_state(last, NR_UNSTABLE_NFS, factor * count); +} + +static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +{ + unstable_page_accounting(desc, 1); +} + +static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) +{ + unstable_page_accounting(desc, -1); +} + +/** + * Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. + * + * If this function is called, the request should have been committed + * or req:rq_unstable must have been set; it implies that the unstable + * statistic have been added. + */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + int page_count = desc->bd_iov_count; + long unstable_count; + + LASSERT(page_count >= 0); + dec_unstable_page_accounting(desc); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_unstable_count); + LASSERT(unstable_count >= 0); + + unstable_count = atomic_long_sub_return(page_count, + &cli->cl_cache->ccc_unstable_nr); + LASSERT(unstable_count >= 0); + if (!unstable_count) + wake_up_all(&cli->cl_cache->ccc_unstable_waitq); + + if (osc_cache_too_much(cli)) + (void)ptlrpcd_queue_work(cli->cl_lru_work); +} + +/** + * "unstable" page accounting. See: osc_dec_unstable_pages. + */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + long page_count = desc->bd_iov_count; + + /* No unstable page tracking */ + if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) + return; + + add_unstable_page_accounting(desc); + atomic_long_add(page_count, &cli->cl_unstable_count); + atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + /* + * If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. + */ + spin_lock(&req->rq_lock); + if (unlikely(req->rq_committed)) { + spin_unlock(&req->rq_lock); + + osc_dec_unstable_pages(req); + } else { + req->rq_unstable = 1; + spin_unlock(&req->rq_lock); + } +} + +/** + * Check if it piggybacks SOFT_SYNC flag to OST from this OSC. + * This function will be called by every BRW RPC so it's critical + * to make this function fast. + */ +bool osc_over_unstable_soft_limit(struct client_obd *cli) +{ + long unstable_nr, osc_unstable_count; + + /* Can't check cli->cl_unstable_count, therefore, no soft limit */ + if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) + return false; + + osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); + unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); + + CDEBUG(D_CACHE, + "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", + cli->cl_import->imp_obd->obd_name, cli, + unstable_nr, osc_unstable_count); + + /* + * If the LRU slots are in shortage - 25% remaining AND this OSC + * has one full RPC window of unstable pages, it's a good chance + * to piggyback a SOFT_SYNC flag. + * Please notice that the OST won't take immediate response for the + * SOFT_SYNC request so active OSCs will have more chance to carry + * the flag, this is reasonable. + */ + return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && + osc_unstable_count > cli->cl_max_pages_per_rpc * + cli->cl_max_rpcs_in_flight; +} + /** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c index 536b868ff776..749781f022e2 100644 --- a/drivers/staging/lustre/lustre/osc/osc_request.c +++ b/drivers/staging/lustre/lustre/osc/osc_request.c @@ -41,6 +41,7 @@ #include "../include/lustre_ha.h" #include "../include/lprocfs_status.h" +#include "../include/lustre/lustre_ioctl.h" #include "../include/lustre_debug.h" #include "../include/lustre_param.h" #include "../include/lustre_fid.h" @@ -102,36 +103,6 @@ static void osc_release_ppga(struct brw_page **ppga, u32 count); static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *data, int rc); -/* Pack OSC object metadata for disk storage (LE byte order). */ -static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, - struct lov_stripe_md *lsm) -{ - int lmm_size; - - lmm_size = sizeof(**lmmp); - if (!lmmp) - return lmm_size; - - if (*lmmp && !lsm) { - kfree(*lmmp); - *lmmp = NULL; - return 0; - } else if (unlikely(lsm && ostid_id(&lsm->lsm_oi) == 0)) { - return -EBADF; - } - - if (!*lmmp) { - *lmmp = kzalloc(lmm_size, GFP_NOFS); - if (!*lmmp) - return -ENOMEM; - } - - if (lsm) - ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi); - - return lmm_size; -} - /* Unpack OSC object metadata from disk storage (LE byte order). */ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, struct lov_mds_md *lmm, int lmm_bytes) @@ -189,7 +160,7 @@ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; else - (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + (*lsmp)->lsm_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; return lsm_size; } @@ -427,24 +398,16 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, oinfo->oi_cb_up, oinfo, rqset); } -static int osc_real_create(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md **ea, - struct obd_trans_info *oti) +static int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct obd_trans_info *oti) { struct ptlrpc_request *req; struct ost_body *body; - struct lov_stripe_md *lsm; int rc; LASSERT(oa); - LASSERT(ea); - - lsm = *ea; - if (!lsm) { - rc = obd_alloc_memmd(exp, &lsm); - if (rc < 0) - return rc; - } + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); if (!req) { @@ -490,21 +453,10 @@ static int osc_real_create(struct obd_export *exp, struct obdo *oa, oa->o_blksize = cli_brw_size(exp->exp_obd); oa->o_valid |= OBD_MD_FLBLKSZ; - /* XXX LOV STACKING: the lsm that is passed to us from LOV does not - * have valid lsm_oinfo data structs, so don't go touching that. - * This needs to be fixed in a big way. - */ - lsm->lsm_oi = oa->o_oi; - *ea = lsm; - - if (oti) { - oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); - - if (oa->o_valid & OBD_MD_FLCOOKIE) { - if (!oti->oti_logcookies) - oti_alloc_cookies(oti, 1); - *oti->oti_logcookies = oa->o_lcookie; - } + if (oti && oa->o_valid & OBD_MD_FLCOOKIE) { + if (!oti->oti_logcookies) + oti->oti_logcookies = &oti->oti_onecookie; + *oti->oti_logcookies = oa->o_lcookie; } CDEBUG(D_HA, "transno: %lld\n", @@ -512,8 +464,6 @@ static int osc_real_create(struct obd_export *exp, struct obdo *oa, out_req: ptlrpc_req_finished(req); out: - if (rc && !*ea) - obd_free_memmd(exp, &lsm); return rc; } @@ -649,7 +599,7 @@ static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, ostid_build_res_name(&oa->o_oi, &res_id); res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (!res) + if (IS_ERR(res)) return 0; LDLM_RESOURCE_ADDREF(res); @@ -689,30 +639,6 @@ static int osc_can_send_destroy(struct client_obd *cli) return 0; } -static int osc_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md **ea, - struct obd_trans_info *oti) -{ - int rc = 0; - - LASSERT(oa); - LASSERT(ea); - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - - if ((oa->o_valid & OBD_MD_FLFLAGS) && - oa->o_flags == OBD_FL_RECREATE_OBJS) { - return osc_real_create(exp, oa, ea, oti); - } - - if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) - return osc_real_create(exp, oa, ea, oti); - - /* we should not get here anymore */ - LBUG(); - - return rc; -} - /* Destroy requests can be async always on the client, and we don't even really * care about the return code since the client cannot do anything at all about * a destroy failure. @@ -725,8 +651,7 @@ static int osc_create(const struct lu_env *env, struct obd_export *exp, * cookies to the MDS after committing destroy transactions. */ static int osc_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *ea, - struct obd_trans_info *oti, struct obd_export *md_export) + struct obdo *oa, struct obd_trans_info *oti) { struct client_obd *cli = &exp->exp_obd->u.cli; struct ptlrpc_request *req; @@ -794,42 +719,44 @@ static int osc_destroy(const struct lu_env *env, struct obd_export *exp, static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, long writing_bytes) { - u32 bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT; + u32 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; LASSERT(!(oa->o_valid & bits)); oa->o_valid |= bits; spin_lock(&cli->cl_loi_list_lock); - oa->o_dirty = cli->cl_dirty; - if (unlikely(cli->cl_dirty - cli->cl_dirty_transit > - cli->cl_dirty_max)) { + oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; + if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > + cli->cl_dirty_max_pages)) { CERROR("dirty %lu - %lu > dirty_max %lu\n", - cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); + cli->cl_dirty_pages, cli->cl_dirty_transit, + cli->cl_dirty_max_pages); oa->o_undirty = 0; - } else if (unlikely(atomic_read(&obd_unstable_pages) + - atomic_read(&obd_dirty_pages) - - atomic_read(&obd_dirty_transit_pages) > - (long)(obd_max_dirty_pages + 1))) { + } else if (unlikely(atomic_long_read(&obd_dirty_pages) - + atomic_long_read(&obd_dirty_transit_pages) > + (obd_max_dirty_pages + 1))) { /* The atomic_read() allowing the atomic_inc() are * not covered by a lock thus they may safely race and trip * this CERROR() unless we add in a small fudge factor (+1). */ - CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n", + CERROR("%s: dirty %ld + %ld > system dirty_max %lu\n", cli->cl_import->imp_obd->obd_name, - atomic_read(&obd_unstable_pages), - atomic_read(&obd_dirty_pages), - atomic_read(&obd_dirty_transit_pages), + atomic_long_read(&obd_dirty_pages), + atomic_long_read(&obd_dirty_transit_pages), obd_max_dirty_pages); oa->o_undirty = 0; - } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) { + } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > + 0x7fffffff)) { CERROR("dirty %lu - dirty_max %lu too big???\n", - cli->cl_dirty, cli->cl_dirty_max); + cli->cl_dirty_pages, cli->cl_dirty_max_pages); oa->o_undirty = 0; } else { - long max_in_flight = (cli->cl_max_pages_per_rpc << - PAGE_SHIFT)* - (cli->cl_max_rpcs_in_flight + 1); - oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); + unsigned long max_in_flight; + + max_in_flight = (cli->cl_max_pages_per_rpc << PAGE_SHIFT) * + (cli->cl_max_rpcs_in_flight + 1); + oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_SHIFT, + max_in_flight); } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; oa->o_dropped = cli->cl_lost_grant; @@ -1029,22 +956,24 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) { /* * ocd_grant is the total grant amount we're expect to hold: if we've - * been evicted, it's the new avail_grant amount, cl_dirty will drop - * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty. + * been evicted, it's the new avail_grant amount, cl_dirty_pages will + * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + + * dirty. * * race is tolerable here: if we're evicted, but imp_state already - * left EVICTED state, then cl_dirty must be 0 already. + * left EVICTED state, then cl_dirty_pages must be 0 already. */ spin_lock(&cli->cl_loi_list_lock); if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) cli->cl_avail_grant = ocd->ocd_grant; else - cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; + cli->cl_avail_grant = ocd->ocd_grant - + (cli->cl_dirty_pages << PAGE_SHIFT); if (cli->cl_avail_grant < 0) { CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, - ocd->ocd_grant, cli->cl_dirty); + ocd->ocd_grant, cli->cl_dirty_pages << PAGE_SHIFT); /* workaround for servers which do not have the patch from * LU-2679 */ @@ -1181,7 +1110,7 @@ static u32 osc_checksum_bulk(int nob, u32 pg_count, } while (nob > 0 && pg_count > 0) { - int count = pga[i]->count > nob ? nob : pga[i]->count; + unsigned int count = pga[i]->count > nob ? nob : pga[i]->count; /* corrupt the data before we compute the checksum, to * simulate an OST->client data error @@ -1191,7 +1120,7 @@ static u32 osc_checksum_bulk(int nob, u32 pg_count, unsigned char *ptr = kmap(pga[i]->pg); int off = pga[i]->off & ~PAGE_MASK; - memcpy(ptr + off, "bad1", min(4, nob)); + memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob)); kunmap(pga[i]->pg); } cfs_crypto_hash_update_page(hdesc, pga[i]->pg, @@ -1335,11 +1264,11 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli, if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; - niobuf->len += pg->count; + niobuf->rnb_len += pg->count; } else { - niobuf->offset = pg->off; - niobuf->len = pg->count; - niobuf->flags = pg->flag; + niobuf->rnb_offset = pg->off; + niobuf->rnb_len = pg->count; + niobuf->rnb_flags = pg->flag; } pg_prev = pg; } @@ -1418,6 +1347,11 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli, INIT_LIST_HEAD(&aa->aa_oaps); *reqp = req; + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + CDEBUG(D_RPCTRACE, "brw rpc %p - object " DOSTID " offset %lld<>%lld\n", + req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, + niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); + return 0; out: @@ -1463,7 +1397,8 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, POSTID(&oa->o_oi), pga[0]->off, - pga[page_count-1]->off + pga[page_count-1]->count - 1); + pga[page_count - 1]->off + + pga[page_count - 1]->count - 1); CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n", client_cksum, client_cksum_type, server_cksum, cksum_type, new_cksum); @@ -1565,7 +1500,8 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) char *router = ""; enum cksum_type cksum_type; - cksum_type = cksum_type_unpack(body->oa.o_valid&OBD_MD_FLFLAGS ? + cksum_type = cksum_type_unpack(body->oa.o_valid & + OBD_MD_FLFLAGS ? body->oa.o_flags : 0); client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, aa->aa_ppga, OST_READ, @@ -1794,7 +1730,8 @@ static int brw_interpret(const struct lu_env *env, if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - loff_t last_off = last->oap_count + last->oap_obj_off; + loff_t last_off = last->oap_count + last->oap_obj_off + + last->oap_page_off; /* Change file size if this is an out of quota or * direct IO write and it extends the file size @@ -1812,11 +1749,14 @@ static int brw_interpret(const struct lu_env *env, } if (valid != 0) - cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); } kmem_cache_free(obdo_cachep, aa->aa_oa); + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) + osc_inc_unstable_pages(req); + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { list_del_init(&ext->oe_link); osc_extent_finish(env, ext, 1, rc); @@ -1847,21 +1787,21 @@ static int brw_interpret(const struct lu_env *env, static void brw_commit(struct ptlrpc_request *req) { - spin_lock(&req->rq_lock); /* * If osc_inc_unstable_pages (via osc_extent_finish) races with * this called via the rq_commit_cb, I need to ensure * osc_dec_unstable_pages is still called. Otherwise unstable * pages may be leaked. */ - if (req->rq_unstable) { + spin_lock(&req->rq_lock); + if (unlikely(req->rq_unstable)) { + req->rq_unstable = 0; spin_unlock(&req->rq_lock); osc_dec_unstable_pages(req); - spin_lock(&req->rq_lock); } else { req->rq_committed = 1; + spin_unlock(&req->rq_lock); } - spin_unlock(&req->rq_lock); } /** @@ -1881,13 +1821,13 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *tmp; struct cl_req *clerq = NULL; enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; - struct ldlm_lock *lock = NULL; struct cl_req_attr *crattr = NULL; u64 starting_offset = OBD_OBJECT_EOF; u64 ending_offset = 0; int mpflag = 0; int mem_tight = 0; int page_count = 0; + bool soft_sync = false; int i; int rc; struct ost_body *body; @@ -1915,6 +1855,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, } } + soft_sync = osc_over_unstable_soft_limit(cli); if (mem_tight) mpflag = cfs_memory_pressure_get_and_set(); @@ -1947,10 +1888,11 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, rc = PTR_ERR(clerq); goto out; } - lock = oap->oap_ldlm_lock; } if (mem_tight) oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + if (soft_sync) + oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; pga[i] = &oap->oap_brw_page; pga[i]->off = oap->oap_obj_off + oap->oap_page_off; CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", @@ -1964,10 +1906,6 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, LASSERT(clerq); crattr->cra_oa = oa; cl_req_attr_set(env, clerq, crattr, ~0ULL); - if (lock) { - oa->o_handle = lock->l_remote_handle; - oa->o_valid |= OBD_MD_FLHANDLE; - } rc = cl_req_prep(env, clerq); if (rc != 0) { @@ -1998,7 +1936,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); crattr->cra_oa = &body->oa; cl_req_attr_set(env, clerq, crattr, - OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); + OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME); lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); @@ -2044,7 +1982,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, } spin_unlock(&cli->cl_loi_list_lock); - DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%dw in flight", page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); @@ -2116,27 +2054,6 @@ static int osc_set_data_with_check(struct lustre_handle *lockh, return set; } -/* find any ldlm lock of the inode in osc - * return 0 not find - * 1 find one - * < 0 error - */ -static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, - ldlm_iterator_t replace, void *data) -{ - struct ldlm_res_id res_id; - struct obd_device *obd = class_exp2obd(exp); - int rc = 0; - - ostid_build_res_name(&lsm->lsm_oi, &res_id); - rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); - if (rc == LDLM_ITER_STOP) - return 1; - if (rc == LDLM_ITER_CONTINUE) - return 0; - return rc; -} - static int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall, void *cookie, struct lustre_handle *lockh, enum ldlm_mode mode, @@ -2586,71 +2503,6 @@ static int osc_statfs(const struct lu_env *env, struct obd_export *exp, return rc; } -/* Retrieve object striping information. - * - * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating - * the maximum number of OST indices which will fit in the user buffer. - * lmm_magic must be LOV_MAGIC (we only use 1 slot here). - */ -static int osc_getstripe(struct lov_stripe_md *lsm, - struct lov_user_md __user *lump) -{ - /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ - struct lov_user_md_v3 lum, *lumk; - struct lov_user_ost_data_v1 *lmm_objects; - int rc = 0, lum_size; - - if (!lsm) - return -ENODATA; - - /* we only need the header part from user space to get lmm_magic and - * lmm_stripe_count, (the header part is common to v1 and v3) - */ - lum_size = sizeof(struct lov_user_md_v1); - if (copy_from_user(&lum, lump, lum_size)) - return -EFAULT; - - if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && - (lum.lmm_magic != LOV_USER_MAGIC_V3)) - return -EINVAL; - - /* lov_user_md_vX and lov_mds_md_vX must have the same size */ - LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1)); - LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3)); - LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0])); - - /* we can use lov_mds_md_size() to compute lum_size - * because lov_user_md_vX and lov_mds_md_vX have the same size - */ - if (lum.lmm_stripe_count > 0) { - lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic); - lumk = kzalloc(lum_size, GFP_NOFS); - if (!lumk) - return -ENOMEM; - - if (lum.lmm_magic == LOV_USER_MAGIC_V1) - lmm_objects = - &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); - else - lmm_objects = &(lumk->lmm_objects[0]); - lmm_objects->l_ost_oi = lsm->lsm_oi; - } else { - lum_size = lov_mds_md_size(0, lum.lmm_magic); - lumk = &lum; - } - - lumk->lmm_oi = lsm->lsm_oi; - lumk->lmm_stripe_count = 1; - - if (copy_to_user(lump, lumk, lum_size)) - rc = -EFAULT; - - if (lumk != &lum) - kfree(lumk); - - return rc; -} - static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void __user *uarg) { @@ -2664,57 +2516,6 @@ static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, return -EINVAL; } switch (cmd) { - case OBD_IOC_LOV_GET_CONFIG: { - char *buf; - struct lov_desc *desc; - struct obd_uuid uuid; - - buf = NULL; - len = 0; - if (obd_ioctl_getdata(&buf, &len, uarg)) { - err = -EINVAL; - goto out; - } - - data = (struct obd_ioctl_data *)buf; - - if (sizeof(*desc) > data->ioc_inllen1) { - obd_ioctl_freedata(buf, len); - err = -EINVAL; - goto out; - } - - if (data->ioc_inllen2 < sizeof(uuid)) { - obd_ioctl_freedata(buf, len); - err = -EINVAL; - goto out; - } - - desc = (struct lov_desc *)data->ioc_inlbuf1; - desc->ld_tgt_count = 1; - desc->ld_active_tgt_count = 1; - desc->ld_default_stripe_count = 1; - desc->ld_default_stripe_size = 0; - desc->ld_default_stripe_offset = 0; - desc->ld_pattern = 0; - memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid)); - - memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid)); - - err = copy_to_user(uarg, buf, len); - if (err) - err = -EFAULT; - obd_ioctl_freedata(buf, len); - goto out; - } - case LL_IOC_LOV_SETSTRIPE: - err = obd_alloc_memmd(exp, karg); - if (err > 0) - err = 0; - goto out; - case LL_IOC_LOV_GETSTRIPE: - err = osc_getstripe(karg, uarg); - goto out; case OBD_IOC_CLIENT_RECOVER: err = ptlrpc_recover_import(obd->u.cli.cl_import, data->ioc_inlbuf1, 0); @@ -2749,51 +2550,7 @@ static int osc_get_info(const struct lu_env *env, struct obd_export *exp, if (!vallen || !val) return -EFAULT; - if (KEY_IS(KEY_LOCK_TO_STRIPE)) { - __u32 *stripe = val; - *vallen = sizeof(*stripe); - *stripe = 0; - return 0; - } else if (KEY_IS(KEY_LAST_ID)) { - struct ptlrpc_request *req; - u64 *reply; - char *tmp; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_OST_GET_INFO_LAST_ID); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, - RCL_CLIENT, keylen); - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); - memcpy(tmp, key, keylen); - - req->rq_no_delay = 1; - req->rq_no_resend = 1; - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID); - if (!reply) { - rc = -EPROTO; - goto out; - } - - *((u64 *)val) = *reply; -out: - ptlrpc_req_finished(req); - return rc; - } else if (KEY_IS(KEY_FIEMAP)) { + if (KEY_IS(KEY_FIEMAP)) { struct ll_fiemap_info_key *fm_key = key; struct ldlm_res_id res_id; ldlm_policy_data_t policy; @@ -2931,11 +2688,11 @@ static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { struct client_obd *cli = &obd->u.cli; - int nr = atomic_read(&cli->cl_lru_in_list) >> 1; - int target = *(int *)val; + long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1; + long target = *(long *)val; nr = osc_lru_shrink(env, cli, min(nr, target), true); - *(int *)val -= nr; + *(long *)val -= nr; return 0; } @@ -3014,8 +2771,9 @@ static int osc_reconnect(const struct lu_env *env, long lost_grant; spin_lock(&cli->cl_loi_list_lock); - data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?: - 2 * cli_brw_size(obd); + data->ocd_grant = (cli->cl_avail_grant + + (cli->cl_dirty_pages << PAGE_SHIFT)) ?: + 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; spin_unlock(&cli->cl_loi_list_lock); @@ -3346,7 +3104,6 @@ static struct obd_ops osc_obd_ops = { .disconnect = osc_disconnect, .statfs = osc_statfs, .statfs_async = osc_statfs_async, - .packmd = osc_packmd, .unpackmd = osc_unpackmd, .create = osc_create, .destroy = osc_destroy, @@ -3354,7 +3111,6 @@ static struct obd_ops osc_obd_ops = { .getattr_async = osc_getattr_async, .setattr = osc_setattr, .setattr_async = osc_setattr_async, - .find_cbdata = osc_find_cbdata, .iocontrol = osc_iocontrol, .get_info = osc_get_info, .set_info_async = osc_set_info_async, diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c index d4463d7c81d2..8c51d51a678b 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/client.c +++ b/drivers/staging/lustre/lustre/ptlrpc/client.c @@ -45,6 +45,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req); static int ptlrpcd_check_work(struct ptlrpc_request *req); +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async); /** * Initialize passed in client structure \a cl. @@ -89,7 +90,6 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) return c; } -EXPORT_SYMBOL(ptlrpc_uuid_to_connection); /** * Allocate and initialize new bulk descriptor on the sender. @@ -202,7 +202,7 @@ void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) if (unpin) { for (i = 0; i < desc->bd_iov_count; i++) - put_page(desc->bd_iov[i].kiov_page); + put_page(desc->bd_iov[i].bv_page); } kfree(desc); @@ -283,8 +283,8 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) } /* Adjust expected network latency */ -static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time) +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) { unsigned int nl, oldnl; struct imp_at *at; @@ -364,31 +364,37 @@ static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) } rc = unpack_reply(early_req); - if (rc == 0) { - /* Expecting to increase the service time estimate here */ - ptlrpc_at_adj_service(req, - lustre_msg_get_timeout(early_req->rq_repmsg)); - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(early_req->rq_repmsg)); - } - - sptlrpc_cli_finish_early_reply(early_req); - - if (rc != 0) { + if (rc) { + sptlrpc_cli_finish_early_reply(early_req); spin_lock(&req->rq_lock); return rc; } - /* Adjust the local timeout for this req */ - ptlrpc_at_set_req_timeout(req); + /* + * Use new timeout value just to adjust the local value for this + * request, don't include it into at_history. It is unclear yet why + * service time increased and should it be counted or skipped, e.g. + * that can be recovery case or some error or server, the real reply + * will add all new data if it is worth to add. + */ + req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + + /* Network latency can be adjusted, it is pure network delays */ + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); + + sptlrpc_cli_finish_early_reply(early_req); spin_lock(&req->rq_lock); olddl = req->rq_deadline; /* - * server assumes it now has rq_timeout from when it sent the - * early reply, so client should give it at least that long. + * server assumes it now has rq_timeout from when the request + * arrived, so the client should give it at least that long. + * since we don't know the arrival time we'll use the original + * sent time */ - req->rq_deadline = ktime_get_real_seconds() + req->rq_timeout + + req->rq_deadline = req->rq_sent + req->rq_timeout + ptlrpc_at_get_net_latency(req); DEBUG_REQ(D_ADAPTTO, req, @@ -884,7 +890,6 @@ struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, return set; } -EXPORT_SYMBOL(ptlrpc_prep_fcset); /** * Wind down and free request set structure previously allocated with @@ -1004,7 +1009,6 @@ void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, wake_up(&pc->pc_partners[i]->pc_set->set_waitq); } } -EXPORT_SYMBOL(ptlrpc_set_add_new_req); /** * Based on the current state of the import, determine if the request @@ -1035,8 +1039,8 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, *status = -EIO; } else if (ptlrpc_send_limit_expired(req)) { /* probably doesn't need to be a D_ERROR after initial testing */ - DEBUG_REQ(D_ERROR, req, "send limit expired "); - *status = -EIO; + DEBUG_REQ(D_HA, req, "send limit expired "); + *status = -ETIMEDOUT; } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && imp->imp_state == LUSTRE_IMP_CONNECTING) { /* allow CONNECT even if import is invalid */ @@ -1073,36 +1077,42 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, } /** - * Decide if the error message regarding provided request \a req - * should be printed to the console or not. - * Makes it's decision on request status and other properties. - * Returns 1 to print error on the system console or 0 if not. + * Decide if the error message should be printed to the console or not. + * Makes its decision based on request type, status, and failure frequency. + * + * \param[in] req request that failed and may need a console message + * + * \retval false if no message should be printed + * \retval true if console message should be printed */ -static int ptlrpc_console_allow(struct ptlrpc_request *req) +static bool ptlrpc_console_allow(struct ptlrpc_request *req) { __u32 opc; - int err; LASSERT(req->rq_reqmsg); opc = lustre_msg_get_opc(req->rq_reqmsg); - /* - * Suppress particular reconnect errors which are to be expected. No - * errors are suppressed for the initial connection on an import - */ - if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && - (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + /* Suppress particular reconnect errors which are to be expected. */ + if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { + int err; + /* Suppress timed out reconnect requests */ - if (req->rq_timedout) - return 0; + if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || + req->rq_timedout) + return false; - /* Suppress unavailable/again reconnect requests */ + /* + * Suppress most unavailable/again reconnect requests, but + * print occasionally so it is clear client is trying to + * connect to a server where no target is running. + */ err = lustre_msg_get_status(req->rq_repmsg); - if (err == -ENODEV || err == -EAGAIN) - return 0; + if ((err == -ENODEV || err == -EAGAIN) && + req->rq_import->imp_conn_cnt % 30 != 20) + return false; } - return 1; + return true; } /** @@ -1116,14 +1126,14 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) err = lustre_msg_get_status(req->rq_repmsg); if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { struct obd_import *imp = req->rq_import; + lnet_nid_t nid = imp->imp_connection->c_peer.nid; __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); if (ptlrpc_console_allow(req)) - LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", + LCONSOLE_ERROR_MSG(0x011, "%s: operation %s to node %s failed: rc = %d\n", imp->imp_obd->obd_name, - libcfs_nid2str( - imp->imp_connection->c_peer.nid), - ll_opcode2str(opc), err); + ll_opcode2str(opc), + libcfs_nid2str(nid), err); return err < 0 ? err : -EINVAL; } @@ -1280,7 +1290,7 @@ static int after_reply(struct ptlrpc_request *req) * some reason. Try to reconnect, and if that fails, punt to * the upcall. */ - if (ll_rpc_recoverable_error(rc)) { + if (ptlrpc_recoverable_error(rc)) { if (req->rq_send_state != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { return rc; @@ -1628,8 +1638,10 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) req->rq_waiting || req->rq_wait_ctx) { int status; - if (!ptlrpc_unregister_reply(req, 1)) + if (!ptlrpc_unregister_reply(req, 1)) { + ptlrpc_unregister_bulk(req, 1); continue; + } spin_lock(&imp->imp_lock); if (ptlrpc_import_delay_req(imp, req, @@ -1995,7 +2007,6 @@ int ptlrpc_expired_set(void *data) */ return 1; } -EXPORT_SYMBOL(ptlrpc_expired_set); /** * Sets rq_intr flag in \a req under spinlock. @@ -2012,7 +2023,7 @@ EXPORT_SYMBOL(ptlrpc_mark_interrupted); * Interrupts (sets interrupted flag) all uncompleted requests in * a set \a data. Callback for l_wait_event for interruptible waits. */ -void ptlrpc_interrupted_set(void *data) +static void ptlrpc_interrupted_set(void *data) { struct ptlrpc_request_set *set = data; struct list_head *tmp; @@ -2030,7 +2041,6 @@ void ptlrpc_interrupted_set(void *data) ptlrpc_mark_interrupted(req); } } -EXPORT_SYMBOL(ptlrpc_interrupted_set); /** * Get the smallest timeout in the set; this does NOT set a timeout. @@ -2074,7 +2084,6 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) } return timeout; } -EXPORT_SYMBOL(ptlrpc_set_next_timeout); /** * Send all unset request from the set and then wait until all @@ -2325,7 +2334,7 @@ EXPORT_SYMBOL(ptlrpc_req_xid); * The request owner (i.e. the thread doing the I/O) must call... * Returns 0 on success or 1 if unregistering cannot be made. */ -int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) { int rc; wait_queue_head_t *wq; @@ -2390,7 +2399,6 @@ int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) } return 0; } -EXPORT_SYMBOL(ptlrpc_unregister_reply); static void ptlrpc_free_request(struct ptlrpc_request *req) { @@ -2451,7 +2459,8 @@ void ptlrpc_free_committed(struct obd_import *imp) imp->imp_obd->obd_name, imp->imp_peer_committed_transno, imp->imp_generation); - if (imp->imp_generation != imp->imp_last_generation_checked) + if (imp->imp_generation != imp->imp_last_generation_checked || + !imp->imp_last_transno_checked) skip_committed_list = false; imp->imp_last_transno_checked = imp->imp_peer_committed_transno; @@ -2499,6 +2508,9 @@ free_req: if (req->rq_import_generation < imp->imp_generation) { DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); ptlrpc_free_request(req); + } else if (!req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "free closed open request"); + ptlrpc_free_request(req); } } } @@ -2541,7 +2553,6 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } -EXPORT_SYMBOL(ptlrpc_resend_req); /** * Grab additional reference on a request \a req @@ -2610,7 +2621,6 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, list_add(&req->rq_replay_list, &imp->imp_replay_list); } -EXPORT_SYMBOL(ptlrpc_retain_replayable_request); /** * Send request and wait until it completes. @@ -2783,7 +2793,6 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) ptlrpcd_add_req(req); return 0; } -EXPORT_SYMBOL(ptlrpc_replay_req); /** * Aborts all in-flight request on import \a imp sending and delayed lists @@ -2843,7 +2852,6 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_unlock(&imp->imp_lock); } -EXPORT_SYMBOL(ptlrpc_abort_inflight); /** * Abort all uncompleted requests in request set \a set @@ -2929,7 +2937,6 @@ __u64 ptlrpc_next_xid(void) return next; } -EXPORT_SYMBOL(ptlrpc_next_xid); /** * Get a glimpse at what next xid value might have been. diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c index 177a379da9fa..7b020d60c9e5 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/connection.c +++ b/drivers/staging/lustre/lustre/ptlrpc/connection.c @@ -82,7 +82,6 @@ out: libcfs_nid2str(conn->c_peer.nid)); return conn; } -EXPORT_SYMBOL(ptlrpc_connection_get); int ptlrpc_connection_put(struct ptlrpc_connection *conn) { @@ -118,7 +117,6 @@ int ptlrpc_connection_put(struct ptlrpc_connection *conn) return rc; } -EXPORT_SYMBOL(ptlrpc_connection_put); struct ptlrpc_connection * ptlrpc_connection_addref(struct ptlrpc_connection *conn) @@ -130,7 +128,6 @@ ptlrpc_connection_addref(struct ptlrpc_connection *conn) return conn; } -EXPORT_SYMBOL(ptlrpc_connection_addref); int ptlrpc_connection_init(void) { @@ -146,13 +143,11 @@ int ptlrpc_connection_init(void) return 0; } -EXPORT_SYMBOL(ptlrpc_connection_init); void ptlrpc_connection_fini(void) { cfs_hash_putref(conn_hash); } -EXPORT_SYMBOL(ptlrpc_connection_fini); /* * Hash operations for net_peer<->connection diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c index b1ce72511509..283dfb296d35 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/events.c +++ b/drivers/staging/lustre/lustre/ptlrpc/events.c @@ -543,7 +543,7 @@ static int ptlrpc_ni_init(void) rc = LNetNIInit(pid); if (rc < 0) { CDEBUG(D_NET, "Can't init network interface: %d\n", rc); - return -ENOENT; + return rc; } /* CAVEAT EMPTOR: how we process portals events is _radically_ @@ -561,7 +561,7 @@ static int ptlrpc_ni_init(void) CERROR("Failed to allocate event queue: %d\n", rc); LNetNIFini(); - return -ENOMEM; + return rc; } int ptlrpc_init_portals(void) @@ -570,7 +570,7 @@ int ptlrpc_init_portals(void) if (rc != 0) { CERROR("network initialisation failed\n"); - return -EIO; + return rc; } rc = ptlrpcd_addref(); if (rc == 0) diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c index 3292e6ea0102..a23d0a05b574 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/import.c +++ b/drivers/staging/lustre/lustre/ptlrpc/import.c @@ -307,7 +307,8 @@ void ptlrpc_invalidate_import(struct obd_import *imp) */ lwi = LWI_TIMEOUT_INTERVAL( cfs_timeout_cap(cfs_time_seconds(timeout)), - (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + (timeout > 1) ? cfs_time_seconds(1) : + cfs_time_seconds(1) / 2, NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), @@ -424,7 +425,6 @@ void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) ptlrpc_pinger_force(imp); } } -EXPORT_SYMBOL(ptlrpc_fail_import); int ptlrpc_reconnect_import(struct obd_import *imp) { @@ -698,7 +698,8 @@ int ptlrpc_connect_import(struct obd_import *imp) request->rq_send_state = LUSTRE_IMP_CONNECTING; /* Allow a slightly larger reply for future growth compatibility */ req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, - sizeof(struct obd_connect_data)+16*sizeof(__u64)); + sizeof(struct obd_connect_data) + + 16 * sizeof(__u64)); ptlrpc_request_set_replen(request); request->rq_interpret_reply = ptlrpc_connect_interpret; @@ -750,6 +751,153 @@ static int ptlrpc_busy_reconnect(int rc) return (rc == -EBUSY) || (rc == -EAGAIN); } +static int ptlrpc_connect_set_flags(struct obd_import *imp, + struct obd_connect_data *ocd, + u64 old_connect_flags, + struct obd_export *exp, int init_connect) +{ + struct client_obd *cli = &imp->imp_obd->u.cli; + static bool warned; + + if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && + !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { + LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %#llx, replied %#llx\n", + imp->imp_obd->obd_name, + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + return -EPROTO; + } + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list); + imp->imp_last_success_conn = imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* + * Sigh, some compilers do not like #ifdef in the middle + * of macro arguments + */ + const char *older = "older than client. Consider upgrading server"; + const char *newer = "newer than client. Consider recompiling application"; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? + newer : older, LUSTRE_VERSION_STRING); + warned = true; + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) + /* + * Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 + */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + !strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME))) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#endif + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* + * We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. The server masked off + * the checksum types it doesn't support + */ + if (!(ocd->ocd_cksum_types & cksum_types_supported_client())) { + LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + cksum_types_supported_client()); + cli->cl_checksum = 0; + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* + * The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility + */ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + + client_adjust_max_dirty(cli); + + /* + * Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. + */ + if (old_connect_flags != exp_connect_flags(exp) || init_connect) { + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", + imp->imp_obd->obd_name, ocd->ocd_connect_flags); + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* + * We need a per-message support flag, because + * a. we don't know if the incoming connect reply + * supports AT or not (in reply_in_callback) + * until we unpack it. + * b. failovered server means export and flags are gone + * (in ptlrpc_send_reply). + * Can only be set when we know AT is supported at + * both ends + */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + else + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + return 0; +} + /** * interpret_reply callback for connect RPCs. * Looks into returned status of connect operation and decides @@ -762,7 +910,6 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, { struct ptlrpc_connect_async_args *aa = data; struct obd_import *imp = request->rq_import; - struct client_obd *cli = &imp->imp_obd->u.cli; struct lustre_handle old_hdl; __u64 old_connect_flags; int msg_flags; @@ -842,7 +989,21 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, old_connect_flags = exp_connect_flags(exp); exp->exp_connect_data = *ocd; imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + + /* + * The net statistics after (re-)connect is not valid anymore, + * because may reflect other routing, etc. + */ + at_init(&imp->imp_at.iat_net_latency, 0, 0); + ptlrpc_at_adj_net_latency(request, + lustre_msg_get_service_time(request->rq_repmsg)); + + /* Import flags should be updated before waking import at FULL state */ + rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, + aa->pcaa_initial_connect); class_export_put(exp); + if (rc) + goto out; obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); @@ -987,151 +1148,13 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, finish: rc = ptlrpc_import_recovery_state_machine(imp); - if (rc != 0) { - if (rc == -ENOTCONN) { - CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - ptlrpc_connect_import(imp); - imp->imp_connect_tried = 1; - return 0; - } - } else { - static bool warned; - - spin_lock(&imp->imp_lock); - list_del(&imp->imp_conn_current->oic_item); - list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list); - imp->imp_last_success_conn = - imp->imp_conn_current->oic_last_attempt; - - spin_unlock(&imp->imp_lock); - - if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && - !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { - LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n", - imp->imp_obd->obd_name, - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_connect_flags_orig, - ocd->ocd_connect_flags); - rc = -EPROTO; - goto out; - } - - if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version > LUSTRE_VERSION_CODE + - LUSTRE_VERSION_OFFSET_WARN || - ocd->ocd_version < LUSTRE_VERSION_CODE - - LUSTRE_VERSION_OFFSET_WARN)) { - /* Sigh, some compilers do not like #ifdef in the middle - * of macro arguments - */ - const char *older = "older than client. Consider upgrading server"; - const char *newer = "newer than client. Consider recompiling application"; - - LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", - obd2cli_tgt(imp->imp_obd), - OBD_OCD_VERSION_MAJOR(ocd->ocd_version), - OBD_OCD_VERSION_MINOR(ocd->ocd_version), - OBD_OCD_VERSION_PATCH(ocd->ocd_version), - OBD_OCD_VERSION_FIX(ocd->ocd_version), - ocd->ocd_version > LUSTRE_VERSION_CODE ? - newer : older, LUSTRE_VERSION_STRING); - warned = true; - } - -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) - /* Check if server has LU-1252 fix applied to not always swab - * the IR MNE entries. Do this only once per connection. This - * fixup is version-limited, because we don't want to carry the - * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we - * need interop with unpatched 2.2 servers. For newer servers, - * the client will do MNE swabbing only as needed. LU-1644 - */ - if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && - OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && - strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME) == 0)) - imp->imp_need_mne_swab = 1; - else /* clear if server was upgraded since last connect */ - imp->imp_need_mne_swab = 0; -#else -#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" -#endif - - if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { - /* We sent to the server ocd_cksum_types with bits set - * for algorithms we understand. The server masked off - * the checksum types it doesn't support - */ - if ((ocd->ocd_cksum_types & - cksum_types_supported_client()) == 0) { - LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", - obd2cli_tgt(imp->imp_obd), - ocd->ocd_cksum_types, - cksum_types_supported_client()); - cli->cl_checksum = 0; - cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; - } else { - cli->cl_supp_cksum_types = ocd->ocd_cksum_types; - } - } else { - /* The server does not support OBD_CONNECT_CKSUM. - * Enforce ADLER for backward compatibility - */ - cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; - } - cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); - - if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - cli->cl_max_pages_per_rpc = - min(ocd->ocd_brw_size >> PAGE_SHIFT, - cli->cl_max_pages_per_rpc); - else if (imp->imp_connect_op == MDS_CONNECT || - imp->imp_connect_op == MGS_CONNECT) - cli->cl_max_pages_per_rpc = 1; - - /* Reset ns_connect_flags only for initial connect. It might be - * changed in while using FS and if we reset it in reconnect - * this leads to losing user settings done before such as - * disable lru_resize, etc. - */ - if (old_connect_flags != exp_connect_flags(exp) || - aa->pcaa_initial_connect) { - CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", - imp->imp_obd->obd_name, ocd->ocd_connect_flags); - imp->imp_obd->obd_namespace->ns_connect_flags = - ocd->ocd_connect_flags; - imp->imp_obd->obd_namespace->ns_orig_connect_flags = - ocd->ocd_connect_flags; - } - - if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && - (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) - /* We need a per-message support flag, because - * a. we don't know if the incoming connect reply - * supports AT or not (in reply_in_callback) - * until we unpack it. - * b. failovered server means export and flags are gone - * (in ptlrpc_send_reply). - * Can only be set when we know AT is supported at - * both ends - */ - imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; - else - imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; - - if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && - (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) - imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; - else - imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; - - LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && - (cli->cl_max_pages_per_rpc > 0)); + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + imp->imp_connect_tried = 1; + return 0; } out: @@ -1497,10 +1520,13 @@ EXPORT_SYMBOL(ptlrpc_disconnect_import); /* Adaptive Timeout utils */ extern unsigned int at_min, at_max, at_history; -/* Bin into timeslices using AT_BINS bins. - * This gives us a max of the last binlimit*AT_BINS secs without the storage, - * but still smoothing out a return to normalcy from a slow response. - * (E.g. remember the maximum latency in each minute of the last 4 minutes.) +/* + *Update at_current with the specified value (bounded by at_min and at_max), + * as well as the AT history "bins". + * - Bin into timeslices using AT_BINS bins. + * - This gives us a max of the last at_history seconds without the storage, + * but still smoothing out a return to normalcy from a slow response. + * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) */ int at_measured(struct adaptive_timeout *at, unsigned int val) { diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c index ab5d85174245..839ef3e80c1a 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/layout.c +++ b/drivers/staging/lustre/lustre/ptlrpc/layout.c @@ -667,11 +667,8 @@ static struct req_format *req_formats[] = { &RQF_MDS_SYNC, &RQF_MDS_CLOSE, &RQF_MDS_RELEASE_CLOSE, - &RQF_MDS_PIN, - &RQF_MDS_UNPIN, &RQF_MDS_READPAGE, &RQF_MDS_WRITEPAGE, - &RQF_MDS_IS_SUBDIR, &RQF_MDS_DONE_WRITING, &RQF_MDS_REINT, &RQF_MDS_REINT_CREATE, @@ -1116,9 +1113,9 @@ EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); struct req_format { const char *rf_name; - int rf_idx; + size_t rf_idx; struct { - int nr; + size_t nr; const struct req_msg_field **d; } rf_fields[RCL_NR]; }; @@ -1389,15 +1386,6 @@ struct req_format RQF_MDS_RELEASE_CLOSE = mdt_release_close_client, mds_last_unlink_server); EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE); -struct req_format RQF_MDS_PIN = - DEFINE_REQ_FMT0("MDS_PIN", - mdt_body_capa, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_PIN); - -struct req_format RQF_MDS_UNPIN = - DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty); -EXPORT_SYMBOL(RQF_MDS_UNPIN); - struct req_format RQF_MDS_DONE_WRITING = DEFINE_REQ_FMT0("MDS_DONE_WRITING", mdt_close_client, mdt_body_only); @@ -1448,11 +1436,6 @@ struct req_format RQF_MDS_WRITEPAGE = mdt_body_capa, mdt_body_only); EXPORT_SYMBOL(RQF_MDS_WRITEPAGE); -struct req_format RQF_MDS_IS_SUBDIR = - DEFINE_REQ_FMT0("MDS_IS_SUBDIR", - mdt_body_only, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR); - struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", llog_origin_handle_create_client, llogd_body_only); @@ -1572,9 +1555,9 @@ EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); */ int req_layout_init(void) { - int i; - int j; - int k; + size_t i; + size_t j; + size_t k; struct req_format *rf = NULL; for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { @@ -1616,7 +1599,7 @@ EXPORT_SYMBOL(req_layout_fini); */ static void req_capsule_init_area(struct req_capsule *pill) { - int i; + size_t i; for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { pill->rc_area[RCL_CLIENT][i] = -1; @@ -1667,8 +1650,7 @@ EXPORT_SYMBOL(req_capsule_fini); static int __req_format_is_sane(const struct req_format *fmt) { - return - 0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) && + return fmt->rf_idx < ARRAY_SIZE(req_formats) && req_formats[fmt->rf_idx] == fmt; } @@ -1702,11 +1684,11 @@ EXPORT_SYMBOL(req_capsule_set); * variable-sized fields. The field sizes come from the declared \a rmf_size * field of a \a pill's \a rc_fmt's RMF's. */ -int req_capsule_filled_sizes(struct req_capsule *pill, - enum req_location loc) +size_t req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) { const struct req_format *fmt = pill->rc_fmt; - int i; + size_t i; for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { if (pill->rc_area[loc][i] == -1) { @@ -1761,11 +1743,11 @@ EXPORT_SYMBOL(req_capsule_server_pack); * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill * corresponding to the given RMF (\a field). */ -static int __req_capsule_offset(const struct req_capsule *pill, +static u32 __req_capsule_offset(const struct req_capsule *pill, const struct req_msg_field *field, enum req_location loc) { - int offset; + u32 offset; offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", pill->rc_fmt->rf_name, @@ -1869,10 +1851,10 @@ static void *__req_capsule_get(struct req_capsule *pill, const struct req_format *fmt; struct lustre_msg *msg; void *value; - int len; - int offset; + u32 len; + u32 offset; - void *(*getter)(struct lustre_msg *m, int n, int minlen); + void *(*getter)(struct lustre_msg *m, u32 n, u32 minlen); static const char *rcl_names[RCL_NR] = { [RCL_CLIENT] = "client", @@ -1899,20 +1881,20 @@ static void *__req_capsule_get(struct req_capsule *pill, */ len = lustre_msg_buflen(msg, offset); if ((len % field->rmf_size) != 0) { - CERROR("%s: array field size mismatch %d modulo %d != 0 (%d)\n", + CERROR("%s: array field size mismatch %d modulo %u != 0 (%d)\n", field->rmf_name, len, field->rmf_size, loc); return NULL; } } else if (pill->rc_area[loc][offset] != -1) { len = pill->rc_area[loc][offset]; } else { - len = max(field->rmf_size, 0); + len = max_t(typeof(field->rmf_size), field->rmf_size, 0); } value = getter(msg, offset, len); if (!value) { DEBUG_REQ(D_ERROR, pill->rc_req, - "Wrong buffer for field `%s' (%d of %d) in format `%s': %d vs. %d (%s)\n", + "Wrong buffer for field `%s' (%u of %u) in format `%s': %u vs. %u (%s)\n", field->rmf_name, offset, lustre_msg_bufcount(msg), fmt->rf_name, lustre_msg_buflen(msg, offset), len, rcl_names[loc]); @@ -1958,7 +1940,7 @@ EXPORT_SYMBOL(req_capsule_client_swab_get); */ void *req_capsule_client_sized_get(struct req_capsule *pill, const struct req_msg_field *field, - int len) + u32 len) { req_capsule_set_size(pill, field, RCL_CLIENT, len); return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); @@ -1999,7 +1981,7 @@ EXPORT_SYMBOL(req_capsule_server_swab_get); */ void *req_capsule_server_sized_get(struct req_capsule *pill, const struct req_msg_field *field, - int len) + u32 len) { req_capsule_set_size(pill, field, RCL_SERVER, len); return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); @@ -2008,7 +1990,7 @@ EXPORT_SYMBOL(req_capsule_server_sized_get); void *req_capsule_server_sized_swab_get(struct req_capsule *pill, const struct req_msg_field *field, - int len, void *swabber) + u32 len, void *swabber) { req_capsule_set_size(pill, field, RCL_SERVER, len); return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); @@ -2024,23 +2006,25 @@ EXPORT_SYMBOL(req_capsule_server_sized_swab_get); */ void req_capsule_set_size(struct req_capsule *pill, const struct req_msg_field *field, - enum req_location loc, int size) + enum req_location loc, u32 size) { LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); - if ((size != field->rmf_size) && + if ((size != (u32)field->rmf_size) && (field->rmf_size != -1) && !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && (size > 0)) { + u32 rmf_size = (u32)field->rmf_size; + if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && - (size % field->rmf_size != 0)) { - CERROR("%s: array field size mismatch %d %% %d != 0 (%d)\n", - field->rmf_name, size, field->rmf_size, loc); + (size % rmf_size != 0)) { + CERROR("%s: array field size mismatch %u %% %u != 0 (%d)\n", + field->rmf_name, size, rmf_size, loc); LBUG(); } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && - size < field->rmf_size) { - CERROR("%s: field size mismatch %d != %d (%d)\n", - field->rmf_name, size, field->rmf_size, loc); + size < rmf_size) { + CERROR("%s: field size mismatch %u != %u (%d)\n", + field->rmf_name, size, rmf_size, loc); LBUG(); } } @@ -2057,7 +2041,7 @@ EXPORT_SYMBOL(req_capsule_set_size); * actually sets the size in pill.rc_area[loc][offset], but this function * returns the message buflen[offset], maybe we should use another name. */ -int req_capsule_get_size(const struct req_capsule *pill, +u32 req_capsule_get_size(const struct req_capsule *pill, const struct req_msg_field *field, enum req_location loc) { @@ -2075,7 +2059,7 @@ EXPORT_SYMBOL(req_capsule_get_size); * * See also req_capsule_set_size(). */ -int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) +u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) { return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, pill->rc_fmt->rf_fields[loc].nr, @@ -2090,10 +2074,11 @@ int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) * This function should not be used for formats which contain variable size * fields. */ -int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, +u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, enum req_location loc) { - int size, i = 0; + size_t i = 0; + u32 size; /* * This function should probably LASSERT() that fmt has no fields with @@ -2103,7 +2088,7 @@ int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, * we do. */ size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); - if (size < 0) + if (!size) return size; for (; i < fmt->rf_fields[loc].nr; ++i) @@ -2135,7 +2120,7 @@ int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) { int i; - int j; + size_t j; const struct req_format *old; @@ -2193,7 +2178,7 @@ static int req_capsule_field_present(const struct req_capsule *pill, const struct req_msg_field *field, enum req_location loc) { - int offset; + u32 offset; LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); LASSERT(req_capsule_has_field(pill, field, loc)); @@ -2210,12 +2195,11 @@ static int req_capsule_field_present(const struct req_capsule *pill, */ void req_capsule_shrink(struct req_capsule *pill, const struct req_msg_field *field, - unsigned int newlen, - enum req_location loc) + u32 newlen, enum req_location loc) { const struct req_format *fmt; struct lustre_msg *msg; - int len; + u32 len; int offset; fmt = pill->rc_fmt; @@ -2228,7 +2212,7 @@ void req_capsule_shrink(struct req_capsule *pill, msg = __req_msg(pill, loc); len = lustre_msg_buflen(msg, offset); - LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n", + LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n", fmt->rf_name, field->rmf_name, len, newlen); if (loc == RCL_CLIENT) diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c index bc93b75744e1..9bad57d65db4 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -191,7 +191,7 @@ ptlrpc_ldebugfs_register(struct dentry *root, char *dir, LASSERT(!*debugfs_root_ret); LASSERT(!*stats_ret); - svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES, + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, 0); if (!svc_stats) return; @@ -937,7 +937,7 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) static int ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) { - static struct seq_operations sops = { + static const struct seq_operations sops = { .start = ptlrpc_lprocfs_svc_req_history_start, .stop = ptlrpc_lprocfs_svc_req_history_stop, .next = ptlrpc_lprocfs_svc_req_history_next, diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c index 11ec82545347..9c937398a085 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c +++ b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c @@ -295,7 +295,6 @@ int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) } return 0; } -EXPORT_SYMBOL(ptlrpc_unregister_bulk); static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) { @@ -398,7 +397,8 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) lustre_msg_set_status(req->rq_repmsg, ptlrpc_status_hton(req->rq_status)); lustre_msg_set_opc(req->rq_repmsg, - req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + req->rq_reqmsg ? + lustre_msg_get_opc(req->rq_reqmsg) : 0); target_pack_pool_reply(req); @@ -433,7 +433,6 @@ out: ptlrpc_connection_put(conn); return rc; } -EXPORT_SYMBOL(ptlrpc_send_reply); int ptlrpc_reply(struct ptlrpc_request *req) { @@ -441,7 +440,6 @@ int ptlrpc_reply(struct ptlrpc_request *req) return 0; return ptlrpc_send_reply(req, 0); } -EXPORT_SYMBOL(ptlrpc_reply); /** * For request \a req send an error reply back. Create empty @@ -468,13 +466,11 @@ int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) rc = ptlrpc_send_reply(req, may_be_difficult); return rc; } -EXPORT_SYMBOL(ptlrpc_send_error); int ptlrpc_error(struct ptlrpc_request *req) { return ptlrpc_send_error(req, 0); } -EXPORT_SYMBOL(ptlrpc_error); /** * Send request \a request. @@ -490,7 +486,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) struct ptlrpc_connection *connection; lnet_handle_me_t reply_me_h; lnet_md_t reply_md; - struct obd_device *obd = request->rq_import->imp_obd; + struct obd_import *imp = request->rq_import; + struct obd_device *obd = imp->imp_obd; if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) return 0; @@ -503,7 +500,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) */ LASSERT(!request->rq_receiving_reply); LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && - (request->rq_import->imp_state == LUSTRE_IMP_FULL))); + (imp->imp_state == LUSTRE_IMP_FULL))); if (unlikely(obd && obd->obd_fail)) { CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", @@ -516,15 +513,22 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) return -ENODEV; } - connection = request->rq_import->imp_connection; + connection = imp->imp_connection; lustre_msg_set_handle(request->rq_reqmsg, - &request->rq_import->imp_remote_handle); + &imp->imp_remote_handle); lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); - lustre_msg_set_conn_cnt(request->rq_reqmsg, - request->rq_import->imp_conn_cnt); - lustre_msghdr_set_flags(request->rq_reqmsg, - request->rq_import->imp_msghdr_flags); + lustre_msg_set_conn_cnt(request->rq_reqmsg, imp->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, imp->imp_msghdr_flags); + + /** + * For enabled AT all request should have AT_SUPPORT in the + * FULL import state when OBD_CONNECT_AT is set + */ + LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || + (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || + !(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_AT)); if (request->rq_resend) lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); @@ -628,7 +632,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) ptlrpc_request_addref(request); if (obd && obd->obd_svc_stats) lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, - atomic_read(&request->rq_import->imp_inflight)); + atomic_read(&imp->imp_inflight)); OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); @@ -640,7 +644,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) request->rq_deadline = request->rq_sent + request->rq_timeout + ptlrpc_at_get_net_latency(request); - ptlrpc_pinger_sending_on_import(request->rq_import); + ptlrpc_pinger_sending_on_import(imp); DEBUG_REQ(D_INFO, request, "send flg=%x", lustre_msg_get_flags(request->rq_reqmsg)); diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c index b514f18fae50..871768511e8c 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c +++ b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c @@ -50,36 +50,34 @@ #include "ptlrpc_internal.h" -static inline int lustre_msg_hdr_size_v2(int count) +static inline u32 lustre_msg_hdr_size_v2(u32 count) { return cfs_size_round(offsetof(struct lustre_msg_v2, lm_buflens[count])); } -int lustre_msg_hdr_size(__u32 magic, int count) +u32 lustre_msg_hdr_size(__u32 magic, u32 count) { switch (magic) { case LUSTRE_MSG_MAGIC_V2: return lustre_msg_hdr_size_v2(count); default: LASSERTF(0, "incorrect message magic: %08x\n", magic); - return -EINVAL; + return 0; } } -EXPORT_SYMBOL(lustre_msg_hdr_size); void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, - int index) + u32 index) { if (inout) lustre_set_req_swabbed(req, index); else lustre_set_rep_swabbed(req, index); } -EXPORT_SYMBOL(ptlrpc_buf_set_swabbed); int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - int index) + u32 index) { if (inout) return (ptlrpc_req_need_swab(req) && @@ -88,12 +86,11 @@ int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, return (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)); } -EXPORT_SYMBOL(ptlrpc_buf_need_swab); /* early reply size */ -int lustre_msg_early_size(void) +u32 lustre_msg_early_size(void) { - static int size; + static u32 size; if (!size) { /* Always reply old ptlrpc_body_v2 to keep interoperability @@ -111,9 +108,9 @@ int lustre_msg_early_size(void) } EXPORT_SYMBOL(lustre_msg_early_size); -int lustre_msg_size_v2(int count, __u32 *lengths) +u32 lustre_msg_size_v2(int count, __u32 *lengths) { - int size; + u32 size; int i; size = lustre_msg_hdr_size_v2(count); @@ -131,7 +128,7 @@ EXPORT_SYMBOL(lustre_msg_size_v2); * target then the first buffer will be stripped because the ptlrpc * data is part of the lustre_msg_v1 header. b=14043 */ -int lustre_msg_size(__u32 magic, int count, __u32 *lens) +u32 lustre_msg_size(__u32 magic, int count, __u32 *lens) { __u32 size[] = { sizeof(struct ptlrpc_body) }; @@ -148,15 +145,14 @@ int lustre_msg_size(__u32 magic, int count, __u32 *lens) return lustre_msg_size_v2(count, lens); default: LASSERTF(0, "incorrect message magic: %08x\n", magic); - return -EINVAL; + return 0; } } -EXPORT_SYMBOL(lustre_msg_size); /* This is used to determine the size of a buffer that was already packed * and will correctly handle the different message formats. */ -int lustre_packed_msg_size(struct lustre_msg *msg) +u32 lustre_packed_msg_size(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -166,7 +162,6 @@ int lustre_packed_msg_size(struct lustre_msg *msg) return 0; } } -EXPORT_SYMBOL(lustre_packed_msg_size); void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, char **bufs) @@ -227,7 +222,6 @@ int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, /* only use new format, we don't need to be compatible with 1.4 */ return lustre_pack_request_v2(req, count, lens, bufs); } -EXPORT_SYMBOL(lustre_pack_request); #if RS_DEBUG LIST_HEAD(ptlrpc_rs_debug_lru); @@ -369,7 +363,6 @@ int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); return rc; } -EXPORT_SYMBOL(lustre_pack_reply_flags); int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, char **bufs) @@ -378,11 +371,9 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, } EXPORT_SYMBOL(lustre_pack_reply); -void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, u32 n, u32 min_size) { - int i, offset, buflen, bufcount; - - LASSERT(n >= 0); + u32 i, offset, buflen, bufcount; bufcount = m->lm_bufcount; if (unlikely(n >= bufcount)) { @@ -406,7 +397,7 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) return (char *)m + offset; } -void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) +void *lustre_msg_buf(struct lustre_msg *m, u32 n, u32 min_size) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -419,7 +410,7 @@ void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) } EXPORT_SYMBOL(lustre_msg_buf); -static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment, +static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, u32 segment, unsigned int newlen, int move_data) { char *tail = NULL, *newpos; @@ -493,7 +484,6 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs) sptlrpc_svc_free_rs(rs); } -EXPORT_SYMBOL(lustre_free_reply_state); static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) { @@ -581,7 +571,6 @@ int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) } return rc; } -EXPORT_SYMBOL(ptlrpc_unpack_req_msg); int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) { @@ -594,7 +583,6 @@ int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) } return rc; } -EXPORT_SYMBOL(ptlrpc_unpack_rep_msg); static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, const int inout, int offset) @@ -647,7 +635,7 @@ int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) } } -static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n) +static inline u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, u32 n) { if (n >= m->lm_bufcount) return 0; @@ -662,14 +650,14 @@ static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n) * * returns zero for non-existent message indices */ -int lustre_msg_buflen(struct lustre_msg *m, int n) +u32 lustre_msg_buflen(struct lustre_msg *m, u32 n) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V2: return lustre_msg_buflen_v2(m, n); default: CERROR("incorrect message magic: %08x\n", m->lm_magic); - return -EINVAL; + return 0; } } EXPORT_SYMBOL(lustre_msg_buflen); @@ -677,23 +665,22 @@ EXPORT_SYMBOL(lustre_msg_buflen); /* NB return the bufcount for lustre_msg_v2 format, so if message is packed * in V1 format, the result is one bigger. (add struct ptlrpc_body). */ -int lustre_msg_bufcount(struct lustre_msg *m) +u32 lustre_msg_bufcount(struct lustre_msg *m) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V2: return m->lm_bufcount; default: CERROR("incorrect message magic: %08x\n", m->lm_magic); - return -EINVAL; + return 0; } } -EXPORT_SYMBOL(lustre_msg_bufcount); -char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) +char *lustre_msg_string(struct lustre_msg *m, u32 index, u32 max_len) { /* max_len == 0 means the string should fill the buffer */ char *str; - int slen, blen; + u32 slen, blen; switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V2: @@ -731,11 +718,10 @@ char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) return str; } -EXPORT_SYMBOL(lustre_msg_string); /* Wrap up the normal fixed length cases */ -static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index, - int min_size, void *swabber) +static inline void *__lustre_swab_buf(struct lustre_msg *msg, u32 index, + u32 min_size, void *swabber) { void *ptr = NULL; @@ -804,7 +790,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) } EXPORT_SYMBOL(lustre_msg_get_flags); -void lustre_msg_add_flags(struct lustre_msg *msg, int flags) +void lustre_msg_add_flags(struct lustre_msg *msg, u32 flags) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -820,7 +806,7 @@ void lustre_msg_add_flags(struct lustre_msg *msg, int flags) } EXPORT_SYMBOL(lustre_msg_add_flags); -void lustre_msg_set_flags(struct lustre_msg *msg, int flags) +void lustre_msg_set_flags(struct lustre_msg *msg, u32 flags) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -834,9 +820,8 @@ void lustre_msg_set_flags(struct lustre_msg *msg, int flags) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_set_flags); -void lustre_msg_clear_flags(struct lustre_msg *msg, int flags) +void lustre_msg_clear_flags(struct lustre_msg *msg, u32 flags) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -868,9 +853,8 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) return 0; } } -EXPORT_SYMBOL(lustre_msg_get_op_flags); -void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags) +void lustre_msg_add_op_flags(struct lustre_msg *msg, u32 flags) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -903,7 +887,6 @@ struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) return NULL; } } -EXPORT_SYMBOL(lustre_msg_get_handle); __u32 lustre_msg_get_type(struct lustre_msg *msg) { @@ -924,7 +907,7 @@ __u32 lustre_msg_get_type(struct lustre_msg *msg) } EXPORT_SYMBOL(lustre_msg_get_type); -void lustre_msg_add_version(struct lustre_msg *msg, int version) +void lustre_msg_add_version(struct lustre_msg *msg, u32 version) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: { @@ -938,7 +921,6 @@ void lustre_msg_add_version(struct lustre_msg *msg, int version) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_add_version); __u32 lustre_msg_get_opc(struct lustre_msg *msg) { @@ -1055,7 +1037,6 @@ __u64 lustre_msg_get_slv(struct lustre_msg *msg) return -EINVAL; } } -EXPORT_SYMBOL(lustre_msg_get_slv); void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) { @@ -1075,7 +1056,6 @@ void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) return; } } -EXPORT_SYMBOL(lustre_msg_set_slv); __u32 lustre_msg_get_limit(struct lustre_msg *msg) { @@ -1094,7 +1074,6 @@ __u32 lustre_msg_get_limit(struct lustre_msg *msg) return -EINVAL; } } -EXPORT_SYMBOL(lustre_msg_get_limit); void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) { @@ -1114,7 +1093,6 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) return; } } -EXPORT_SYMBOL(lustre_msg_set_limit); __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) { @@ -1145,7 +1123,6 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) return 0; } } -EXPORT_SYMBOL(lustre_msg_get_magic); __u32 lustre_msg_get_timeout(struct lustre_msg *msg) { @@ -1203,8 +1180,9 @@ __u32 lustre_msg_calc_cksum(struct lustre_msg *msg) unsigned int hsize = 4; cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, - lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF), - NULL, 0, (unsigned char *)&crc, &hsize); + lustre_msg_buflen(msg, + MSG_PTLRPC_BODY_OFF), + NULL, 0, (unsigned char *)&crc, &hsize); return crc; } default: @@ -1227,7 +1205,6 @@ void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_set_handle); void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) { @@ -1243,7 +1220,6 @@ void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_set_type); void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) { @@ -1259,7 +1235,6 @@ void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_set_opc); void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) { @@ -1326,7 +1301,6 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); } } -EXPORT_SYMBOL(lustre_msg_set_conn_cnt); void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) { @@ -1377,7 +1351,7 @@ void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); if (jobid) - memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE); + memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); else if (pb->pb_jobid[0] == '\0') lustre_get_jobid(pb->pb_jobid); return; @@ -1491,7 +1465,6 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) */ CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); } -EXPORT_SYMBOL(lustre_swab_ptlrpc_body); void lustre_swab_connect(struct obd_connect_data *ocd) { @@ -1591,7 +1564,6 @@ void lustre_swab_obd_statfs(struct obd_statfs *os) CLASSERT(offsetof(typeof(*os), os_spare8) != 0); CLASSERT(offsetof(typeof(*os), os_spare9) != 0); } -EXPORT_SYMBOL(lustre_swab_obd_statfs); void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) { @@ -1599,33 +1571,28 @@ void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) __swab32s(&ioo->ioo_max_brw); __swab32s(&ioo->ioo_bufcnt); } -EXPORT_SYMBOL(lustre_swab_obd_ioobj); void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) { - __swab64s(&nbr->offset); - __swab32s(&nbr->len); - __swab32s(&nbr->flags); + __swab64s(&nbr->rnb_offset); + __swab32s(&nbr->rnb_len); + __swab32s(&nbr->rnb_flags); } -EXPORT_SYMBOL(lustre_swab_niobuf_remote); void lustre_swab_ost_body(struct ost_body *b) { lustre_swab_obdo(&b->oa); } -EXPORT_SYMBOL(lustre_swab_ost_body); void lustre_swab_ost_last_id(u64 *id) { __swab64s(id); } -EXPORT_SYMBOL(lustre_swab_ost_last_id); void lustre_swab_generic_32s(__u32 *val) { __swab32s(val); } -EXPORT_SYMBOL(lustre_swab_generic_32s); void lustre_swab_gl_desc(union ldlm_gl_desc *desc) { @@ -1674,37 +1641,36 @@ EXPORT_SYMBOL(lustre_swab_lquota_lvb); void lustre_swab_mdt_body(struct mdt_body *b) { - lustre_swab_lu_fid(&b->fid1); - lustre_swab_lu_fid(&b->fid2); + lustre_swab_lu_fid(&b->mbo_fid1); + lustre_swab_lu_fid(&b->mbo_fid2); /* handle is opaque */ - __swab64s(&b->valid); - __swab64s(&b->size); - __swab64s(&b->mtime); - __swab64s(&b->atime); - __swab64s(&b->ctime); - __swab64s(&b->blocks); - __swab64s(&b->ioepoch); - __swab64s(&b->t_state); - __swab32s(&b->fsuid); - __swab32s(&b->fsgid); - __swab32s(&b->capability); - __swab32s(&b->mode); - __swab32s(&b->uid); - __swab32s(&b->gid); - __swab32s(&b->flags); - __swab32s(&b->rdev); - __swab32s(&b->nlink); - CLASSERT(offsetof(typeof(*b), unused2) != 0); - __swab32s(&b->suppgid); - __swab32s(&b->eadatasize); - __swab32s(&b->aclsize); - __swab32s(&b->max_mdsize); - __swab32s(&b->max_cookiesize); - __swab32s(&b->uid_h); - __swab32s(&b->gid_h); - CLASSERT(offsetof(typeof(*b), padding_5) != 0); -} -EXPORT_SYMBOL(lustre_swab_mdt_body); + __swab64s(&b->mbo_valid); + __swab64s(&b->mbo_size); + __swab64s(&b->mbo_mtime); + __swab64s(&b->mbo_atime); + __swab64s(&b->mbo_ctime); + __swab64s(&b->mbo_blocks); + __swab64s(&b->mbo_ioepoch); + __swab64s(&b->mbo_t_state); + __swab32s(&b->mbo_fsuid); + __swab32s(&b->mbo_fsgid); + __swab32s(&b->mbo_capability); + __swab32s(&b->mbo_mode); + __swab32s(&b->mbo_uid); + __swab32s(&b->mbo_gid); + __swab32s(&b->mbo_flags); + __swab32s(&b->mbo_rdev); + __swab32s(&b->mbo_nlink); + CLASSERT(offsetof(typeof(*b), mbo_unused2) != 0); + __swab32s(&b->mbo_suppgid); + __swab32s(&b->mbo_eadatasize); + __swab32s(&b->mbo_aclsize); + __swab32s(&b->mbo_max_mdsize); + __swab32s(&b->mbo_max_cookiesize); + __swab32s(&b->mbo_uid_h); + __swab32s(&b->mbo_gid_h); + CLASSERT(offsetof(typeof(*b), mbo_padding_5) != 0); +} void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) { @@ -1713,7 +1679,6 @@ void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) __swab32s(&b->flags); CLASSERT(offsetof(typeof(*b), padding) != 0); } -EXPORT_SYMBOL(lustre_swab_mdt_ioepoch); void lustre_swab_mgs_target_info(struct mgs_target_info *mti) { @@ -1729,11 +1694,10 @@ void lustre_swab_mgs_target_info(struct mgs_target_info *mti) for (i = 0; i < MTI_NIDS_MAX; i++) __swab64s(&mti->mti_nids[i]); } -EXPORT_SYMBOL(lustre_swab_mgs_target_info); void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) { - int i; + __u8 i; __swab64s(&entry->mne_version); __swab32s(&entry->mne_instance); @@ -1760,14 +1724,12 @@ void lustre_swab_mgs_config_body(struct mgs_config_body *body) __swab32s(&body->mcb_units); __swab16s(&body->mcb_type); } -EXPORT_SYMBOL(lustre_swab_mgs_config_body); void lustre_swab_mgs_config_res(struct mgs_config_res *body) { __swab64s(&body->mcr_offset); __swab64s(&body->mcr_size); } -EXPORT_SYMBOL(lustre_swab_mgs_config_res); static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i) { @@ -1800,7 +1762,6 @@ void lustre_swab_obd_quotactl(struct obd_quotactl *q) lustre_swab_obd_dqinfo(&q->qc_dqinfo); lustre_swab_obd_dqblk(&q->qc_dqblk); } -EXPORT_SYMBOL(lustre_swab_obd_quotactl); void lustre_swab_fid2path(struct getinfo_fid2path *gf) { @@ -1822,7 +1783,7 @@ static void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent) void lustre_swab_fiemap(struct ll_user_fiemap *fiemap) { - int i; + __u32 i; __swab64s(&fiemap->fm_start); __swab64s(&fiemap->fm_length); @@ -1834,7 +1795,6 @@ void lustre_swab_fiemap(struct ll_user_fiemap *fiemap) for (i = 0; i < fiemap->fm_mapped_extents; i++) lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); } -EXPORT_SYMBOL(lustre_swab_fiemap); void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) { @@ -1863,7 +1823,6 @@ void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); }; -EXPORT_SYMBOL(lustre_swab_mdt_rec_reint); void lustre_swab_lov_desc(struct lov_desc *ld) { @@ -1878,18 +1837,42 @@ void lustre_swab_lov_desc(struct lov_desc *ld) } EXPORT_SYMBOL(lustre_swab_lov_desc); -static void print_lum(struct lov_user_md *lum) +/* This structure is always in little-endian */ +static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) +{ + int i; + + __swab32s(&lmm1->lmv_magic); + __swab32s(&lmm1->lmv_stripe_count); + __swab32s(&lmm1->lmv_master_mdt_index); + __swab32s(&lmm1->lmv_hash_type); + __swab32s(&lmm1->lmv_layout_version); + for (i = 0; i < lmm1->lmv_stripe_count; i++) + lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); +} + +void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) +{ + switch (lmm->lmv_magic) { + case LMV_MAGIC_V1: + lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); + break; + default: + break; + } +} +EXPORT_SYMBOL(lustre_swab_lmv_mds_md); + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) { - CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); - CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); - CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern); - CDEBUG(D_OTHER, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi)); - CDEBUG(D_OTHER, "\tlmm_object_gr: %llu\n", lmm_oi_seq(&lum->lmm_oi)); - CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); - CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); - CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", - lum->lmm_stripe_offset); + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + CLASSERT(offsetof(typeof(*lum), lum_padding1)); } +EXPORT_SYMBOL(lustre_swab_lmv_user_md); static void lustre_swab_lmm_oi(struct ost_id *oi) { @@ -1905,7 +1888,6 @@ static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) __swab32s(&lum->lmm_stripe_size); __swab16s(&lum->lmm_stripe_count); __swab16s(&lum->lmm_stripe_offset); - print_lum(lum); } void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) @@ -1941,9 +1923,9 @@ void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, int i; for (i = 0; i < stripe_count; i++) { - lustre_swab_ost_id(&(lod[i].l_ost_oi)); - __swab32s(&(lod[i].l_ost_gen)); - __swab32s(&(lod[i].l_ost_idx)); + lustre_swab_ost_id(&lod[i].l_ost_oi); + __swab32s(&lod[i].l_ost_gen); + __swab32s(&lod[i].l_ost_idx); } } EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); @@ -1973,7 +1955,6 @@ void lustre_swab_ldlm_intent(struct ldlm_intent *i) { __swab64s(&i->opc); } -EXPORT_SYMBOL(lustre_swab_ldlm_intent); static void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) { @@ -1997,7 +1978,6 @@ void lustre_swab_ldlm_request(struct ldlm_request *rq) __swab32s(&rq->lock_count); /* lock_handle[] opaque */ } -EXPORT_SYMBOL(lustre_swab_ldlm_request); void lustre_swab_ldlm_reply(struct ldlm_reply *r) { @@ -2008,7 +1988,6 @@ void lustre_swab_ldlm_reply(struct ldlm_reply *r) __swab64s(&r->lock_policy_res1); __swab64s(&r->lock_policy_res2); } -EXPORT_SYMBOL(lustre_swab_ldlm_reply); /* Dump functions */ void dump_ioo(struct obd_ioobj *ioo) @@ -2018,14 +1997,12 @@ void dump_ioo(struct obd_ioobj *ioo) POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, ioo->ioo_bufcnt); } -EXPORT_SYMBOL(dump_ioo); void dump_rniobuf(struct niobuf_remote *nb) { CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", - nb->offset, nb->len, nb->flags); + nb->rnb_offset, nb->rnb_len, nb->rnb_flags); } -EXPORT_SYMBOL(dump_rniobuf); static void dump_obdo(struct obdo *oa) { @@ -2093,13 +2070,11 @@ void dump_ost_body(struct ost_body *ob) { dump_obdo(&ob->oa); } -EXPORT_SYMBOL(dump_ost_body); void dump_rcs(__u32 *rc) { CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); } -EXPORT_SYMBOL(dump_rcs); static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) { @@ -2184,14 +2159,12 @@ void lustre_swab_lustre_capa(struct lustre_capa *c) __swab32s(&c->lc_timeout); __swab32s(&c->lc_expiry); } -EXPORT_SYMBOL(lustre_swab_lustre_capa); void lustre_swab_hsm_user_state(struct hsm_user_state *state) { __swab32s(&state->hus_states); __swab32s(&state->hus_archive_id); } -EXPORT_SYMBOL(lustre_swab_hsm_user_state); void lustre_swab_hsm_state_set(struct hsm_state_set *hss) { @@ -2214,14 +2187,12 @@ void lustre_swab_hsm_current_action(struct hsm_current_action *action) __swab32s(&action->hca_action); lustre_swab_hsm_extent(&action->hca_location); } -EXPORT_SYMBOL(lustre_swab_hsm_current_action); void lustre_swab_hsm_user_item(struct hsm_user_item *hui) { lustre_swab_lu_fid(&hui->hui_fid); lustre_swab_hsm_extent(&hui->hui_extent); } -EXPORT_SYMBOL(lustre_swab_hsm_user_item); void lustre_swab_layout_intent(struct layout_intent *li) { @@ -2230,7 +2201,6 @@ void lustre_swab_layout_intent(struct layout_intent *li) __swab64s(&li->li_start); __swab64s(&li->li_end); } -EXPORT_SYMBOL(lustre_swab_layout_intent); void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) { @@ -2241,7 +2211,6 @@ void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) __swab16s(&hpk->hpk_flags); __swab16s(&hpk->hpk_errval); } -EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel); void lustre_swab_hsm_request(struct hsm_request *hr) { @@ -2251,7 +2220,6 @@ void lustre_swab_hsm_request(struct hsm_request *hr) __swab32s(&hr->hr_itemcount); __swab32s(&hr->hr_data_len); } -EXPORT_SYMBOL(lustre_swab_hsm_request); void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) { @@ -2264,4 +2232,3 @@ void lustre_swab_close_data(struct close_data *cd) lustre_swab_lu_fid(&cd->cd_fid); __swab64s(&cd->cd_data_version); } -EXPORT_SYMBOL(lustre_swab_close_data); diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c index 6c820e944171..5b9fb11c0b6b 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/pers.c +++ b/drivers/staging/lustre/lustre/ptlrpc/pers.c @@ -64,9 +64,9 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, { lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count]; - kiov->kiov_page = page; - kiov->kiov_offset = pageoffset; - kiov->kiov_len = len; + kiov->bv_page = page; + kiov->bv_offset = pageoffset; + kiov->bv_len = len; desc->bd_iov_count++; } diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c index c0529d808d81..5504fc2363ac 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/pinger.c +++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c @@ -340,7 +340,6 @@ void ptlrpc_pinger_sending_on_import(struct obd_import *imp) { ptlrpc_update_next_ping(imp, 0); } -EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import); void ptlrpc_pinger_commit_expected(struct obd_import *imp) { diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h index a9831fab80f3..f14d193287da 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h +++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h @@ -53,6 +53,8 @@ int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); int ptlrpcd_start(struct ptlrpcd_ctl *pc); /* client.c */ +void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time); struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, unsigned type, unsigned portal); int ptlrpc_request_cache_init(void); @@ -60,6 +62,11 @@ void ptlrpc_request_cache_fini(void); struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); void ptlrpc_request_cache_free(struct ptlrpc_request *req); void ptlrpc_init_xid(void); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); +int ptlrpc_expired_set(void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +void ptlrpc_resend_req(struct ptlrpc_request *request); /* events.c */ int ptlrpc_init_portals(void); @@ -268,7 +275,7 @@ void sptlrpc_conf_fini(void); int sptlrpc_init(void); void sptlrpc_fini(void); -static inline int ll_rpc_recoverable_error(int rc) +static inline bool ptlrpc_recoverable_error(int rc) { return (rc == -ENOTCONN || rc == -ENODEV); } diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c index 0a374b6c2f71..1f55d642aa75 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c +++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c @@ -412,7 +412,7 @@ static int ptlrpcd(void *arg) * an argument, describing its "scope". */ rc = lu_context_init(&env.le_ctx, - LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF); + LCT_CL_THREAD | LCT_REMEMBER | LCT_NOREF); if (rc == 0) { rc = lu_context_init(env.le_ses, LCT_SESSION | LCT_REMEMBER | LCT_NOREF); @@ -567,7 +567,7 @@ int ptlrpcd_start(struct ptlrpcd_ctl *pc) * ptlrpcd thread (or a thread-set) has to be given an argument, * describing its "scope". */ - rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER); + rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD | LCT_REMEMBER); if (rc != 0) goto out; diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c index 718b3a8d61c6..405faf0dc9fc 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/recover.c +++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c @@ -201,7 +201,6 @@ int ptlrpc_resend(struct obd_import *imp) return 0; } -EXPORT_SYMBOL(ptlrpc_resend); /** * Go through all requests in delayed list and wake their threads @@ -221,7 +220,6 @@ void ptlrpc_wake_delayed(struct obd_import *imp) } spin_unlock(&imp->imp_lock); } -EXPORT_SYMBOL(ptlrpc_wake_delayed); void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) { diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c index dbd819fa6b75..5d3995d5c69a 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -311,6 +311,19 @@ static int import_sec_check_expire(struct obd_import *imp) return sptlrpc_import_sec_adapt(imp, NULL, NULL); } +/** + * Get and validate the client side ptlrpc security facilities from + * \a imp. There is a race condition on client reconnect when the import is + * being destroyed while there are outstanding client bound requests. In + * this case do not output any error messages if import secuity is not + * found. + * + * \param[in] imp obd import associated with client + * \param[out] sec client side ptlrpc security + * + * \retval 0 if security retrieved successfully + * \retval -ve errno if there was a problem + */ static int import_sec_validate_get(struct obd_import *imp, struct ptlrpc_sec **sec) { @@ -323,9 +336,11 @@ static int import_sec_validate_get(struct obd_import *imp, } *sec = sptlrpc_import_sec_ref(imp); + /* Only output an error when the import is still active */ if (!*sec) { - CERROR("import %p (%s) with no sec\n", - imp, ptlrpc_import_state_name(imp->imp_state)); + if (list_empty(&imp->imp_zombie_chain)) + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); return -EACCES; } @@ -499,7 +514,7 @@ static int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) newctx, newctx->cc_flags); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); } else { /* * it's possible newctx == oldctx if we're switching @@ -718,8 +733,9 @@ again: req->rq_restart = 0; spin_unlock(&req->rq_lock); - lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout, - ctx_refresh_interrupt, req); + lwi = LWI_TIMEOUT_INTR(msecs_to_jiffies(timeout * MSEC_PER_SEC), + ctx_refresh_timeout, ctx_refresh_interrupt, + req); rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); /* diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c index 5f4d79718589..b2cc5ea6cb93 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -139,7 +139,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "cache missing: %lu\n" "low free mark: %lu\n" "max waitqueue depth: %u\n" - "max wait time: %ld/%u\n", + "max wait time: %ld/%lu\n", totalram_pages, PAGES_PER_POOL, page_pools.epp_max_pages, @@ -158,7 +158,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) page_pools.epp_st_lowfree, page_pools.epp_st_max_wqlen, page_pools.epp_st_max_wait, - HZ); + msecs_to_jiffies(MSEC_PER_SEC)); spin_unlock(&page_pools.epp_lock); @@ -326,12 +326,12 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) LASSERT(page_pools.epp_pools[p_idx]); for (i = 0; i < desc->bd_iov_count; i++) { - LASSERT(desc->bd_enc_iov[i].kiov_page); + LASSERT(desc->bd_enc_iov[i].bv_page); LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); LASSERT(!page_pools.epp_pools[p_idx][g_idx]); page_pools.epp_pools[p_idx][g_idx] = - desc->bd_enc_iov[i].kiov_page; + desc->bd_enc_iov[i].bv_page; if (++g_idx == PAGES_PER_POOL) { p_idx++; @@ -348,7 +348,6 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) kfree(desc->bd_enc_iov); desc->bd_enc_iov = NULL; } -EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); static inline void enc_pools_alloc(void) { @@ -432,12 +431,13 @@ void sptlrpc_enc_pool_fini(void) if (page_pools.epp_st_access > 0) { CDEBUG(D_SEC, - "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%d\n", + "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%ld\n", page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, page_pools.epp_st_shrinks, page_pools.epp_st_access, page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, HZ); + page_pools.epp_st_max_wait, + msecs_to_jiffies(MSEC_PER_SEC)); } } @@ -456,13 +456,11 @@ const char *sptlrpc_get_hash_name(__u8 hash_alg) { return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); } -EXPORT_SYMBOL(sptlrpc_get_hash_name); __u8 sptlrpc_get_hash_alg(const char *algname) { return cfs_crypto_hash_alg(algname); } -EXPORT_SYMBOL(sptlrpc_get_hash_alg); int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) { @@ -522,9 +520,10 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); for (i = 0; i < desc->bd_iov_count; i++) { - cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page, - desc->bd_iov[i].kiov_offset & ~PAGE_MASK, - desc->bd_iov[i].kiov_len); + cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].bv_page, + desc->bd_iov[i].bv_offset & + ~PAGE_MASK, + desc->bd_iov[i].bv_len); } if (hashsize > buflen) { @@ -542,4 +541,3 @@ int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, return err; } -EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c index c14035479c5f..2181a85efd49 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c @@ -58,7 +58,6 @@ enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) CERROR("unknown target %p(%s)\n", obd, type); return LUSTRE_SP_ANY; } -EXPORT_SYMBOL(sptlrpc_target_sec_part); /**************************************** * user supplied flavor string parsing * diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c index 9b9801ece582..8ffd000eafac 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c @@ -71,7 +71,6 @@ void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); } -EXPORT_SYMBOL(sptlrpc_gc_add_sec); void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) { @@ -95,7 +94,6 @@ void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); } -EXPORT_SYMBOL(sptlrpc_gc_del_sec); static void sec_process_ctx_list(void) { @@ -182,7 +180,8 @@ again: /* check ctx list again before sleep */ sec_process_ctx_list(); - lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL); + lwi = LWI_TIMEOUT(msecs_to_jiffies(SEC_GC_INTERVAL * MSEC_PER_SEC), + NULL, NULL); l_wait_event(thread->t_ctl_waitq, thread_is_stopping(thread) || thread_is_signal(thread), diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c index 5c4590b0c521..cd305bcb334a 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c @@ -154,13 +154,13 @@ static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) unsigned int off, i; for (i = 0; i < desc->bd_iov_count; i++) { - if (desc->bd_iov[i].kiov_len == 0) + if (desc->bd_iov[i].bv_len == 0) continue; - ptr = kmap(desc->bd_iov[i].kiov_page); - off = desc->bd_iov[i].kiov_offset & ~PAGE_MASK; + ptr = kmap(desc->bd_iov[i].bv_page); + off = desc->bd_iov[i].bv_offset & ~PAGE_MASK; ptr[off] ^= 0x1; - kunmap(desc->bd_iov[i].kiov_page); + kunmap(desc->bd_iov[i].bv_page); return; } } @@ -249,9 +249,12 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) unsigned int hsize = 4; cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, - lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), - lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), - NULL, 0, (unsigned char *)&cksum, &hsize); + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, + 0), + lustre_msg_buflen(msg, + PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, + &hsize); if (cksum != msg->lm_cksum) { CDEBUG(D_SEC, "early reply checksum mismatch: %08x != %08x\n", @@ -349,11 +352,11 @@ int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, /* fix the actual data size */ for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { - if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) { - desc->bd_iov[i].kiov_len = + if (desc->bd_iov[i].bv_len + nob > desc->bd_nob_transferred) { + desc->bd_iov[i].bv_len = desc->bd_nob_transferred - nob; } - nob += desc->bd_iov[i].kiov_len; + nob += desc->bd_iov[i].bv_len; } rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, @@ -869,9 +872,12 @@ int plain_authorize(struct ptlrpc_request *req) unsigned int hsize = 4; cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, - lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), - lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), - NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, + 0), + lustre_msg_buflen(msg, + PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, + &hsize); req->rq_reply_off = 0; } diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c index 4788c4940c2a..72f39308eebb 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/service.c +++ b/drivers/staging/lustre/lustre/ptlrpc/service.c @@ -1005,6 +1005,10 @@ ptlrpc_at_remove_timed(struct ptlrpc_request *req) array->paa_count--; } +/* + * Attempt to extend the request deadline by sending an early reply to the + * client. + */ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) { struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; @@ -1039,24 +1043,26 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) return -ENOSYS; } - /* Fake our processing time into the future to ask the clients - * for some extra amount of time + /* + * We want to extend the request deadline by at_extra seconds, + * so we set our service estimate to reflect how much time has + * passed since this request arrived plus an additional + * at_extra seconds. The client will calculate the new deadline + * based on this service estimate (plus some additional time to + * account for network latency). See ptlrpc_at_recv_early_reply */ at_measured(&svcpt->scp_at_estimate, at_extra + ktime_get_real_seconds() - req->rq_arrival_time.tv_sec); + newdl = req->rq_arrival_time.tv_sec + at_get(&svcpt->scp_at_estimate); /* Check to see if we've actually increased the deadline - * we may be past adaptive_max */ - if (req->rq_deadline >= req->rq_arrival_time.tv_sec + - at_get(&svcpt->scp_at_estimate)) { + if (req->rq_deadline >= newdl) { DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%lld), not sending early reply\n", - olddl, req->rq_arrival_time.tv_sec + - at_get(&svcpt->scp_at_estimate) - - ktime_get_real_seconds()); + olddl, newdl - ktime_get_real_seconds()); return -ETIMEDOUT; } - newdl = ktime_get_real_seconds() + at_get(&svcpt->scp_at_estimate); reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); if (!reqcopy) @@ -1982,11 +1988,12 @@ ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, cond_resched(); l_wait_event_exclusive_head(svcpt->scp_waitq, - ptlrpc_thread_stopping(thread) || - ptlrpc_server_request_incoming(svcpt) || - ptlrpc_server_request_pending(svcpt, false) || - ptlrpc_rqbd_pending(svcpt) || - ptlrpc_at_check(svcpt), &lwi); + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, + false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); if (ptlrpc_thread_stopping(thread)) return -EINTR; @@ -2049,7 +2056,7 @@ static int ptlrpc_main(void *arg) } rc = lu_context_init(&env->le_ctx, - svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + svc->srv_ctx_tags | LCT_REMEMBER | LCT_NOREF); if (rc) goto out_srv_fini; @@ -2349,7 +2356,7 @@ static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) while (!list_empty(&zombie)) { thread = list_entry(zombie.next, - struct ptlrpc_thread, t_link); + struct ptlrpc_thread, t_link); list_del(&thread->t_link); kfree(thread); } @@ -2398,7 +2405,6 @@ int ptlrpc_start_threads(struct ptlrpc_service *svc) ptlrpc_stop_all_threads(svc); return rc; } -EXPORT_SYMBOL(ptlrpc_start_threads); int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) { @@ -2539,8 +2545,8 @@ int ptlrpc_hr_init(void) LASSERT(hrp->hrp_nthrs > 0); hrp->hrp_thrs = kzalloc_node(hrp->hrp_nthrs * sizeof(*hrt), GFP_NOFS, - cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, - i)); + cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, + i)); if (!hrp->hrp_thrs) { rc = -ENOMEM; goto out; @@ -2593,7 +2599,8 @@ static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) NULL, NULL); rc = l_wait_event(svcpt->scp_waitq, - atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); + atomic_read(&svcpt->scp_nreps_difficult) == 0, + &lwi); if (rc == 0) break; CWARN("Unexpectedly long timeout %s %p\n", @@ -2639,7 +2646,7 @@ ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) * event with its 'unlink' flag set for each posted rqbd */ list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, - rqbd_list) { + rqbd_list) { rc = LNetMDUnlink(rqbd->rqbd_md_h); LASSERT(rc == 0 || rc == -ENOENT); } diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c index 6cc2b2edf3fc..e5945e2ccc49 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c +++ b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c @@ -190,28 +190,30 @@ void lustre_assert_wire_constants(void) (long long)REINT_SETXATTR); LASSERTF(REINT_RMENTRY == 8, "found %lld\n", (long long)REINT_RMENTRY); - LASSERTF(REINT_MAX == 9, "found %lld\n", + LASSERTF(REINT_MIGRATE == 9, "found %lld\n", + (long long)REINT_MIGRATE); + LASSERTF(REINT_MAX == 10, "found %lld\n", (long long)REINT_MAX); LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)DISP_IT_EXECD); + (unsigned)DISP_IT_EXECD); LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)DISP_LOOKUP_EXECD); + (unsigned)DISP_LOOKUP_EXECD); LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)DISP_LOOKUP_NEG); + (unsigned)DISP_LOOKUP_NEG); LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned)DISP_LOOKUP_POS); + (unsigned)DISP_LOOKUP_POS); LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)DISP_OPEN_CREATE); + (unsigned)DISP_OPEN_CREATE); LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)DISP_OPEN_OPEN); + (unsigned)DISP_OPEN_OPEN); LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", - (unsigned)DISP_ENQ_COMPLETE); + (unsigned)DISP_ENQ_COMPLETE); LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", - (unsigned)DISP_ENQ_OPEN_REF); + (unsigned)DISP_ENQ_OPEN_REF); LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", - (unsigned)DISP_ENQ_CREATE_REF); + (unsigned)DISP_ENQ_CREATE_REF); LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", - (unsigned)DISP_OPEN_LOCK); + (unsigned)DISP_OPEN_LOCK); LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", (long long)MDS_STATUS_CONN); LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", @@ -219,55 +221,55 @@ void lustre_assert_wire_constants(void) LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n", (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES); LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)MF_SOM_CHANGE); + (unsigned)MF_SOM_CHANGE); LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)MF_EPOCH_OPEN); + (unsigned)MF_EPOCH_OPEN); LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)MF_EPOCH_CLOSE); + (unsigned)MF_EPOCH_CLOSE); LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned)MF_MDC_CANCEL_FID1); + (unsigned)MF_MDC_CANCEL_FID1); LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)MF_MDC_CANCEL_FID2); + (unsigned)MF_MDC_CANCEL_FID2); LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MF_MDC_CANCEL_FID3); + (unsigned)MF_MDC_CANCEL_FID3); LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)MF_MDC_CANCEL_FID4); + (unsigned)MF_MDC_CANCEL_FID4); LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned)MF_SOM_AU); + (unsigned)MF_SOM_AU); LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)MF_GETATTR_LOCK); + (unsigned)MF_GETATTR_LOCK); LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MODE); + (long long)MDS_ATTR_MODE); LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_UID); + (long long)MDS_ATTR_UID); LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_GID); + (long long)MDS_ATTR_GID); LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_SIZE); + (long long)MDS_ATTR_SIZE); LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATIME); + (long long)MDS_ATTR_ATIME); LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MTIME); + (long long)MDS_ATTR_MTIME); LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_CTIME); + (long long)MDS_ATTR_CTIME); LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATIME_SET); + (long long)MDS_ATTR_ATIME_SET); LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MTIME_SET); + (long long)MDS_ATTR_MTIME_SET); LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_FORCE); + (long long)MDS_ATTR_FORCE); LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATTR_FLAG); + (long long)MDS_ATTR_ATTR_FLAG); LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_KILL_SUID); + (long long)MDS_ATTR_KILL_SUID); LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_KILL_SGID); + (long long)MDS_ATTR_KILL_SGID); LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_CTIME_SET); + (long long)MDS_ATTR_CTIME_SET); LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_FROM_OPEN); + (long long)MDS_ATTR_FROM_OPEN); LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_BLOCKS); + (long long)MDS_ATTR_BLOCKS); LASSERTF(FLD_QUERY == 900, "found %lld\n", (long long)FLD_QUERY); LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", @@ -418,15 +420,15 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LMAI_RELEASED); + (unsigned)LMAI_RELEASED); LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LMAC_HSM); + (unsigned)LMAC_HSM); LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)LMAC_SOM); + (unsigned)LMAC_SOM); LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)LMAC_NOT_IN_OI); + (unsigned)LMAC_NOT_IN_OI); LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned)LMAC_FID_ON_OST); + (unsigned)LMAC_FID_ON_OST); /* Checks for struct ost_id */ LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", @@ -452,35 +454,35 @@ void lustre_assert_wire_constants(void) LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", (long long)FID_SEQ_IGIF); LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IGIF_MAX); + (long long)FID_SEQ_IGIF_MAX); LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IDIF); + (long long)FID_SEQ_IDIF); LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IDIF_MAX); + (long long)FID_SEQ_IDIF_MAX); LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_START); + (long long)FID_SEQ_START); LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_LOCAL_FILE); + (long long)FID_SEQ_LOCAL_FILE); LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_DOT_LUSTRE); + (long long)FID_SEQ_DOT_LUSTRE); LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_SPECIAL); + (long long)FID_SEQ_SPECIAL); LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_QUOTA); + (long long)FID_SEQ_QUOTA); LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_QUOTA_GLB); + (long long)FID_SEQ_QUOTA_GLB); LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_ROOT); + (long long)FID_SEQ_ROOT); LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_NORMAL); + (long long)FID_SEQ_NORMAL); LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_LOV_DEFAULT); + (long long)FID_SEQ_LOV_DEFAULT); LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)FID_OID_SPECIAL_BFL); + (unsigned)FID_OID_SPECIAL_BFL); LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)FID_OID_DOT_LUSTRE); + (unsigned)FID_OID_DOT_LUSTRE); LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)FID_OID_DOT_LUSTRE_OBF); + (unsigned)FID_OID_DOT_LUSTRE_OBF); /* Checks for struct lu_dirent */ LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", @@ -510,11 +512,11 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LUDA_FID); + (unsigned)LUDA_FID); LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)LUDA_TYPE); + (unsigned)LUDA_TYPE); LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)LUDA_64BITHASH); + (unsigned)LUDA_64BITHASH); /* Checks for struct luda_type */ LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", @@ -602,9 +604,9 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2); + LUSTRE_MSG_MAGIC_V2); LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2_SWABBED); + LUSTRE_MSG_MAGIC_V2_SWABBED); /* Checks for struct ptlrpc_body */ LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", @@ -682,7 +684,7 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding)); LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding)); - CLASSERT(JOBSTATS_JOBID_SIZE == 32); + CLASSERT(LUSTRE_JOBID_SIZE == 32); LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", @@ -780,61 +782,61 @@ void lustre_assert_wire_constants(void) LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", (long long)MSG_PTLRPC_HEADER_OFF); LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", - PTLRPC_MSG_VERSION); + PTLRPC_MSG_VERSION); LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", - LUSTRE_VERSION_MASK); + LUSTRE_VERSION_MASK); LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", - LUSTRE_OBD_VERSION); + LUSTRE_OBD_VERSION); LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", - LUSTRE_MDS_VERSION); + LUSTRE_MDS_VERSION); LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", - LUSTRE_OST_VERSION); + LUSTRE_OST_VERSION); LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", - LUSTRE_DLM_VERSION); + LUSTRE_DLM_VERSION); LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", - LUSTRE_LOG_VERSION); + LUSTRE_LOG_VERSION); LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", - LUSTRE_MGS_VERSION); + LUSTRE_MGS_VERSION); LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", (long long)MSGHDR_AT_SUPPORT); LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", (long long)MSGHDR_CKSUM_INCOMPAT18); LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", - (unsigned)MSG_OP_FLAG_MASK); + (unsigned)MSG_OP_FLAG_MASK); LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", (long long)MSG_OP_FLAG_SHIFT); LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", - (unsigned)MSG_GEN_FLAG_MASK); + (unsigned)MSG_GEN_FLAG_MASK); LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)MSG_LAST_REPLAY); + (unsigned)MSG_LAST_REPLAY); LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)MSG_RESENT); + (unsigned)MSG_RESENT); LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)MSG_REPLAY); + (unsigned)MSG_REPLAY); LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)MSG_DELAY_REPLAY); + (unsigned)MSG_DELAY_REPLAY); LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MSG_VERSION_REPLAY); + (unsigned)MSG_VERSION_REPLAY); LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)MSG_REQ_REPLAY_DONE); + (unsigned)MSG_REQ_REPLAY_DONE); LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned)MSG_LOCK_REPLAY_DONE); + (unsigned)MSG_LOCK_REPLAY_DONE); LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_RECOVERING); + (unsigned)MSG_CONNECT_RECOVERING); LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_RECONNECT); + (unsigned)MSG_CONNECT_RECONNECT); LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_REPLAYABLE); + (unsigned)MSG_CONNECT_REPLAYABLE); LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_LIBCLIENT); + (unsigned)MSG_CONNECT_LIBCLIENT); LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_INITIAL); + (unsigned)MSG_CONNECT_INITIAL); LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_ASYNC); + (unsigned)MSG_CONNECT_ASYNC); LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_NEXT_VER); + (unsigned)MSG_CONNECT_NEXT_VER); LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)MSG_CONNECT_TRANSNO); + (unsigned)MSG_CONNECT_TRANSNO); /* Checks for struct obd_connect_data */ LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", @@ -1069,12 +1071,18 @@ void lustre_assert_wire_constants(void) "found 0x%.16llxULL\n", OBD_CONNECT_FLOCK_DEAD); LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, "found 0x%.16llxULL\n", OBD_CONNECT_OPEN_BY_FID); + LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LFSCK); + LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UNLINK_CLOSE); + LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_DIR_STRIPE); LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_CRC32); + (unsigned)OBD_CKSUM_CRC32); LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_ADLER); + (unsigned)OBD_CKSUM_ADLER); LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned)OBD_CKSUM_CRC32C); + (unsigned)OBD_CKSUM_CRC32C); /* Checks for struct obdo */ LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", @@ -1346,7 +1354,7 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); - CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0); + CLASSERT(LOV_MAGIC_V1 == (0x0BD10000 | 0x0BD0)); /* Checks for struct lov_mds_md_v3 */ LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", @@ -1375,7 +1383,7 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); - CLASSERT(LOV_MAXPOOLNAME == 16); + CLASSERT(LOV_MAXPOOLNAME == 15); LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n", (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16])); LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n", @@ -1384,15 +1392,64 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); - CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0); + CLASSERT(LOV_MAGIC_V3 == (0x0BD30000 | 0x0BD0)); LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_RAID0); + (unsigned)LOV_PATTERN_RAID0); LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_RAID1); + (unsigned)LOV_PATTERN_RAID1); LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_FIRST); + (unsigned)LOV_PATTERN_FIRST); LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", - (unsigned)LOV_PATTERN_CMOBD); + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct lmv_mds_md_v1 */ + LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", + (long long)(int)sizeof(struct lmv_mds_md_v1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3)); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[16]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[16])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[16])); + LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n", + (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0])); + LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0])); + CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); + CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); + CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); + CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); + CLASSERT(LMV_HASH_FLAG_DEAD == 0x40000000); /* Checks for struct obd_statfs */ LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", @@ -1582,53 +1639,53 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", - Q_QUOTACHECK); + Q_QUOTACHECK); LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", - Q_INITQUOTA); + Q_INITQUOTA); LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", - Q_GETOINFO); + Q_GETOINFO); LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", - Q_GETOQUOTA); + Q_GETOQUOTA); LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", - Q_FINVALIDATE); + Q_FINVALIDATE); /* Checks for struct niobuf_remote */ LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", (long long)(int)sizeof(struct niobuf_remote)); - LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, offset)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->offset)); - LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, len)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->len)); - LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, flags)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->flags)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_len)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len)); + LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, rnb_flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags)); LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", - OBD_BRW_READ); + OBD_BRW_READ); LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", - OBD_BRW_WRITE); + OBD_BRW_WRITE); LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", - OBD_BRW_SYNC); + OBD_BRW_SYNC); LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", - OBD_BRW_CHECK); + OBD_BRW_CHECK); LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", - OBD_BRW_FROM_GRANT); + OBD_BRW_FROM_GRANT); LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", - OBD_BRW_GRANTED); + OBD_BRW_GRANTED); LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", - OBD_BRW_NOCACHE); + OBD_BRW_NOCACHE); LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", - OBD_BRW_NOQUOTA); + OBD_BRW_NOQUOTA); LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", - OBD_BRW_SRVLOCK); + OBD_BRW_SRVLOCK); LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", - OBD_BRW_ASYNC); + OBD_BRW_ASYNC); LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", - OBD_BRW_MEMALLOC); + OBD_BRW_MEMALLOC); LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n", OBD_BRW_OVER_USRQUOTA); LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n", @@ -1663,203 +1720,203 @@ void lustre_assert_wire_constants(void) /* Checks for struct mdt_body */ LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", (long long)(int)sizeof(struct mdt_body)); - LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, fid1)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->fid1)); - LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, fid2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->fid2)); - LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, handle)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->handle)); - LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, valid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->valid)); - LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, size)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->size)); - LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mtime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mtime)); - LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, atime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->atime)); - LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, ctime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->ctime)); - LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, blocks)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->blocks)); - LASSERTF((int)offsetof(struct mdt_body, t_state) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, t_state)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->t_state) == 8, + LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); + LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime)); + LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); + LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_t_state)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->t_state)); - LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, fsuid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->fsuid)); - LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, fsgid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->fsgid)); - LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, capability)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->capability)); - LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mode)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mode)); - LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, uid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->uid)); - LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, gid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->gid)); - LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, flags)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->flags)); - LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, rdev)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->rdev)); - LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, nlink)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->nlink)); - LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, unused2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->unused2)); - LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, suppgid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->suppgid)); - LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, eadatasize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize)); - LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, aclsize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->aclsize)); - LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, max_mdsize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize)); - LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, max_cookiesize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize)); - LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, uid_h)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->uid_h)); - LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, gid_h)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->gid_h)); - LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_5)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_5)); - LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_6)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_6)); - LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_7)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_7)); - LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_8)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_8)); - LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_9)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_9)); - LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, padding_10)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->padding_10)); + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability)); + LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags)); + LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev)); + LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); + LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); + LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid)); + LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_max_cookiesize) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_max_cookiesize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_cookiesize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_cookiesize)); + LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_5) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_5)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_5)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_7)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_8)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); + LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", - MDS_FMODE_CLOSED); + MDS_FMODE_CLOSED); LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", - MDS_FMODE_EXEC); + MDS_FMODE_EXEC); LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n", - MDS_FMODE_EPOCH); + MDS_FMODE_EPOCH); LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n", - MDS_FMODE_TRUNC); + MDS_FMODE_TRUNC); LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n", - MDS_FMODE_SOM); + MDS_FMODE_SOM); LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", - MDS_OPEN_CREATED); + MDS_OPEN_CREATED); LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", - MDS_OPEN_CROSS); + MDS_OPEN_CROSS); LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", - MDS_OPEN_CREAT); + MDS_OPEN_CREAT); LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", - MDS_OPEN_EXCL); + MDS_OPEN_EXCL); LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", - MDS_OPEN_TRUNC); + MDS_OPEN_TRUNC); LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", - MDS_OPEN_APPEND); + MDS_OPEN_APPEND); LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", - MDS_OPEN_SYNC); + MDS_OPEN_SYNC); LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", - MDS_OPEN_DIRECTORY); + MDS_OPEN_DIRECTORY); LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", - MDS_OPEN_BY_FID); + MDS_OPEN_BY_FID); LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", - MDS_OPEN_DELAY_CREATE); + MDS_OPEN_DELAY_CREATE); LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", - MDS_OPEN_OWNEROVERRIDE); + MDS_OPEN_OWNEROVERRIDE); LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", - MDS_OPEN_JOIN_FILE); + MDS_OPEN_JOIN_FILE); LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", - MDS_OPEN_LOCK); + MDS_OPEN_LOCK); LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", - MDS_OPEN_HAS_EA); + MDS_OPEN_HAS_EA); LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", - MDS_OPEN_HAS_OBJS); + MDS_OPEN_HAS_OBJS); LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_NORESTORE); + (long long)MDS_OPEN_NORESTORE); LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_NEWSTRIPE); + (long long)MDS_OPEN_NEWSTRIPE); LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_VOLATILE); + (long long)MDS_OPEN_VOLATILE); LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n", - LUSTRE_SYNC_FL); + LUSTRE_SYNC_FL); LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n", - LUSTRE_IMMUTABLE_FL); + LUSTRE_IMMUTABLE_FL); LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n", - LUSTRE_APPEND_FL); + LUSTRE_APPEND_FL); LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n", - LUSTRE_NOATIME_FL); + LUSTRE_NOATIME_FL); LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n", - LUSTRE_DIRSYNC_FL); + LUSTRE_DIRSYNC_FL); LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", - MDS_INODELOCK_LOOKUP); + MDS_INODELOCK_LOOKUP); LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", - MDS_INODELOCK_UPDATE); + MDS_INODELOCK_UPDATE); LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", - MDS_INODELOCK_OPEN); + MDS_INODELOCK_OPEN); LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", - MDS_INODELOCK_LAYOUT); + MDS_INODELOCK_LAYOUT); /* Checks for struct mdt_ioepoch */ LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", @@ -2617,35 +2674,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); - /* Checks for struct lmv_stripe_md */ - LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n", - (long long)(int)sizeof(struct lmv_stripe_md)); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_magic)); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic)); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_count)); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count)); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_master)); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master)); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_padding)); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding)); - CLASSERT(LOV_MAXPOOLNAME == 16); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16])); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16])); - LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0])); - LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0])); - /* Checks for struct lov_desc */ LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", (long long)(int)sizeof(struct lov_desc)); @@ -3195,10 +3223,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); - LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n", - (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding)); - LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n", - (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid)); LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", @@ -3272,50 +3300,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); - /* Checks for struct changelog_ext_rec */ - LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n", - (long long)(int)sizeof(struct changelog_ext_rec)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_flags)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_type)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_index)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_prev)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_time)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid)); - LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n", - (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid)); - LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid)); - /* Checks for struct changelog_setinfo */ LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", (long long)(int)sizeof(struct changelog_setinfo)); @@ -3339,10 +3323,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct llog_changelog_rec, cr)); LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); - LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n", - (long long)(int)offsetof(struct llog_changelog_rec, cr_tail)); - LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n", - (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use)); /* Checks for struct llog_changelog_user_rec */ LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", @@ -3506,6 +3490,19 @@ void lustre_assert_wire_constants(void) CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); CLASSERT(LLOG_FIRST_OPC == 501); CLASSERT(LLOG_LAST_OPC == 510); + CLASSERT(LLOG_CONFIG_ORIG_CTXT == 0); + CLASSERT(LLOG_CONFIG_REPL_CTXT == 1); + CLASSERT(LLOG_MDS_OST_ORIG_CTXT == 2); + CLASSERT(LLOG_MDS_OST_REPL_CTXT == 3); + CLASSERT(LLOG_SIZE_ORIG_CTXT == 4); + CLASSERT(LLOG_SIZE_REPL_CTXT == 5); + CLASSERT(LLOG_TEST_ORIG_CTXT == 8); + CLASSERT(LLOG_TEST_REPL_CTXT == 9); + CLASSERT(LLOG_CHANGELOG_ORIG_CTXT == 12); + CLASSERT(LLOG_CHANGELOG_REPL_CTXT == 13); + CLASSERT(LLOG_CHANGELOG_USER_ORIG_CTXT == 14); + CLASSERT(LLOG_AGENT_ORIG_CTXT == 15); + CLASSERT(LLOG_MAX_CTXTS == 16); /* Checks for struct llogd_conn_body */ LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", @@ -3943,9 +3940,9 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", - HP_FLAG_COMPLETED); + HP_FLAG_COMPLETED); LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", - HP_FLAG_RETRY); + HP_FLAG_RETRY); LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", (long long)(int)offsetof(struct hsm_copy, hc_data_version)); @@ -4100,9 +4097,9 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned)HSM_FORCE_ACTION); + (unsigned)HSM_FORCE_ACTION); LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned)HSM_GHOST_COPY); + (unsigned)HSM_GHOST_COPY); /* Checks for struct hsm_user_request */ LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", |