summaryrefslogtreecommitdiff
path: root/include/linux/scif.h
blob: eeb250b73c4b950557fa83f65cffe2eccb6afa22 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Intel SCIF driver.
 *
 */
#ifndef __SCIF_H__
#define __SCIF_H__

#include <linux/types.h>
#include <linux/poll.h>
#include <linux/device.h>
#include <linux/scif_ioctl.h>

#define SCIF_ACCEPT_SYNC	1
#define SCIF_SEND_BLOCK		1
#define SCIF_RECV_BLOCK		1

enum {
	SCIF_PROT_READ = (1 << 0),
	SCIF_PROT_WRITE = (1 << 1)
};

enum {
	SCIF_MAP_FIXED = 0x10,
	SCIF_MAP_KERNEL	= 0x20,
};

enum {
	SCIF_FENCE_INIT_SELF = (1 << 0),
	SCIF_FENCE_INIT_PEER = (1 << 1),
	SCIF_SIGNAL_LOCAL = (1 << 4),
	SCIF_SIGNAL_REMOTE = (1 << 5)
};

enum {
	SCIF_RMA_USECPU = (1 << 0),
	SCIF_RMA_USECACHE = (1 << 1),
	SCIF_RMA_SYNC = (1 << 2),
	SCIF_RMA_ORDERED = (1 << 3)
};

/* End of SCIF Admin Reserved Ports */
#define SCIF_ADMIN_PORT_END	1024

/* End of SCIF Reserved Ports */
#define SCIF_PORT_RSVD		1088

typedef struct scif_endpt *scif_epd_t;
typedef struct scif_pinned_pages *scif_pinned_pages_t;

/**
 * struct scif_range - SCIF registered range used in kernel mode
 * @cookie: cookie used internally by SCIF
 * @nr_pages: number of pages of PAGE_SIZE
 * @prot_flags: R/W protection
 * @phys_addr: Array of bus addresses
 * @va: Array of kernel virtual addresses backed by the pages in the phys_addr
 *	array. The va is populated only when called on the host for a remote
 *	SCIF connection on MIC. This is required to support the use case of DMA
 *	between MIC and another device which is not a SCIF node e.g., an IB or
 *	ethernet NIC.
 */
struct scif_range {
	void *cookie;
	int nr_pages;
	int prot_flags;
	dma_addr_t *phys_addr;
	void __iomem **va;
};

/**
 * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll
 * @epd: SCIF endpoint
 * @events: requested events
 * @revents: returned events
 */
struct scif_pollepd {
	scif_epd_t epd;
	__poll_t events;
	__poll_t revents;
};

/**
 * scif_peer_dev - representation of a peer SCIF device
 *
 * Peer devices show up as PCIe devices for the mgmt node but not the cards.
 * The mgmt node discovers all the cards on the PCIe bus and informs the other
 * cards about their peers. Upon notification of a peer a node adds a peer
 * device to the peer bus to maintain symmetry in the way devices are
 * discovered across all nodes in the SCIF network.
 *
 * @dev: underlying device
 * @dnode - The destination node which this device will communicate with.
 */
struct scif_peer_dev {
	struct device dev;
	u8 dnode;
};

/**
 * scif_client - representation of a SCIF client
 * @name: client name
 * @probe - client method called when a peer device is registered
 * @remove - client method called when a peer device is unregistered
 * @si - subsys_interface used internally for implementing SCIF clients
 */
struct scif_client {
	const char *name;
	void (*probe)(struct scif_peer_dev *spdev);
	void (*remove)(struct scif_peer_dev *spdev);
	struct subsys_interface si;
};

#define SCIF_OPEN_FAILED ((scif_epd_t)-1)
#define SCIF_REGISTER_FAILED ((off_t)-1)
#define SCIF_MMAP_FAILED ((void *)-1)

/**
 * scif_open() - Create an endpoint
 *
 * Return:
 * Upon successful completion, scif_open() returns an endpoint descriptor to
 * be used in subsequent SCIF functions calls to refer to that endpoint;
 * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
 * returned and errno is set to indicate the error; in kernel mode a NULL
 * scif_epd_t is returned.
 *
 * Errors:
 * ENOMEM - Insufficient kernel memory was available
 */
scif_epd_t scif_open(void);

/**
 * scif_bind() - Bind an endpoint to a port
 * @epd:	endpoint descriptor
 * @pn:		port number
 *
 * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
 * local node. If pn is zero, a port number greater than or equal to
 * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
 * exactly one local port. Ports less than 1024 when requested can only be bound
 * by system (or root) processes or by processes executed by privileged users.
 *
 * Return:
 * Upon successful completion, scif_bind() returns the port number to which epd
 * is bound; otherwise in user mode -1 is returned and errno is set to
 * indicate the error; in kernel mode the negative of one of the following
 * errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * EINVAL - the endpoint or the port is already bound
 * EISCONN - The endpoint is already connected
 * ENOSPC - No port number available for assignment
 * EACCES - The port requested is protected and the user is not the superuser
 */
int scif_bind(scif_epd_t epd, u16 pn);

/**
 * scif_listen() - Listen for connections on an endpoint
 * @epd:	endpoint descriptor
 * @backlog:	maximum pending connection requests
 *
 * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
 * an endpoint that will be used to accept incoming connection requests. Once
 * so marked, the endpoint is said to be in the listening state and may not be
 * used as the endpoint of a connection.
 *
 * The endpoint, epd, must have been bound to a port.
 *
 * The backlog argument defines the maximum length to which the queue of
 * pending connections for epd may grow. If a connection request arrives when
 * the queue is full, the client may receive an error with an indication that
 * the connection was refused.
 *
 * Return:
 * Upon successful completion, scif_listen() returns 0; otherwise in user mode
 * -1 is returned and errno is set to indicate the error; in kernel mode the
 * negative of one of the following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * EINVAL - the endpoint is not bound to a port
 * EISCONN - The endpoint is already connected or listening
 */
int scif_listen(scif_epd_t epd, int backlog);

/**
 * scif_connect() - Initiate a connection on a port
 * @epd:	endpoint descriptor
 * @dst:	global id of port to which to connect
 *
 * The scif_connect() function requests the connection of endpoint epd to remote
 * port dst. If the connection is successful, a peer endpoint, bound to dst, is
 * created on node dst.node. On successful return, the connection is complete.
 *
 * If the endpoint epd has not already been bound to a port, scif_connect()
 * will bind it to an unused local port.
 *
 * A connection is terminated when an endpoint of the connection is closed,
 * either explicitly by scif_close(), or when a process that owns one of the
 * endpoints of the connection is terminated.
 *
 * In user space, scif_connect() supports an asynchronous connection mode
 * if the application has set the O_NONBLOCK flag on the endpoint via the
 * fcntl() system call. Setting this flag will result in the calling process
 * not to wait during scif_connect().
 *
 * Return:
 * Upon successful completion, scif_connect() returns the port ID to which the
 * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
 * set to indicate the error; in kernel mode the negative of one of the
 * following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNREFUSED - The destination was not listening for connections or refused
 * the connection request
 * EINVAL - dst.port is not a valid port ID
 * EISCONN - The endpoint is already connected
 * ENOMEM - No buffer space is available
 * ENODEV - The destination node does not exist, or the node is lost or existed,
 * but is not currently in the network since it may have crashed
 * ENOSPC - No port number available for assignment
 * EOPNOTSUPP - The endpoint is listening and cannot be connected
 */
int scif_connect(scif_epd_t epd, struct scif_port_id *dst);

/**
 * scif_accept() - Accept a connection on an endpoint
 * @epd:	endpoint descriptor
 * @peer:	global id of port to which connected
 * @newepd:	new connected endpoint descriptor
 * @flags:	flags
 *
 * The scif_accept() call extracts the first connection request from the queue
 * of pending connections for the port on which epd is listening. scif_accept()
 * creates a new endpoint, bound to the same port as epd, and allocates a new
 * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
 * endpoint is connected to the endpoint through which the connection was
 * requested. epd is unaffected by this call, and remains in the listening
 * state.
 *
 * On successful return, peer holds the global port identifier (node id and
 * local port number) of the port which requested the connection.
 *
 * A connection is terminated when an endpoint of the connection is closed,
 * either explicitly by scif_close(), or when a process that owns one of the
 * endpoints of the connection is terminated.
 *
 * The number of connections that can (subsequently) be accepted on epd is only
 * limited by system resources (memory).
 *
 * The flags argument is formed by OR'ing together zero or more of the
 * following values.
 * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
 *			SCIF_ACCEPT_SYNC is not in flags, and no pending
 *			connections are present on the queue, scif_accept()
 *			fails with an EAGAIN error
 *
 * In user mode, the select() and poll() functions can be used to determine
 * when there is a connection request. In kernel mode, the scif_poll()
 * function may be used for this purpose. A readable event will be delivered
 * when a connection is requested.
 *
 * Return:
 * Upon successful completion, scif_accept() returns 0; otherwise in user mode
 * -1 is returned and errno is set to indicate the error; in kernel mode the
 *	negative of one of the following errors is returned.
 *
 * Errors:
 * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
 * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete
 * its connection request
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * EINTR - Interrupted function
 * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
 * NULL, or newepd is NULL
 * ENODEV - The requesting node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOMEM - Not enough space
 * ENOENT - Secondary part of epd registration failed
 */
int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
		*newepd, int flags);

/**
 * scif_close() - Close an endpoint
 * @epd:	endpoint descriptor
 *
 * scif_close() closes an endpoint and performs necessary teardown of
 * facilities associated with that endpoint.
 *
 * If epd is a listening endpoint then it will no longer accept connection
 * requests on the port to which it is bound. Any pending connection requests
 * are rejected.
 *
 * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
 * which are in-process through epd or its peer endpoint will complete before
 * scif_close() returns. Registered windows of the local and peer endpoints are
 * released as if scif_unregister() was called against each window.
 *
 * Closing a SCIF endpoint does not affect local registered memory mapped by
 * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
 * SCIF endpoint explicitly removed by calling munmap(..) by the peer.
 *
 * If the peer endpoint's receive queue is not empty at the time that epd is
 * closed, then the peer endpoint can be passed as the endpoint parameter to
 * scif_recv() until the receive queue is empty.
 *
 * epd is freed and may no longer be accessed.
 *
 * Return:
 * Upon successful completion, scif_close() returns 0; otherwise in user mode
 * -1 is returned and errno is set to indicate the error; in kernel mode the
 * negative of one of the following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 */
int scif_close(scif_epd_t epd);

/**
 * scif_send() - Send a message
 * @epd:	endpoint descriptor
 * @msg:	message buffer address
 * @len:	message length
 * @flags:	blocking mode flags
 *
 * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
 * are copied from memory starting at address msg. On successful execution the
 * return value of scif_send() is the number of bytes that were sent, and is
 * zero if no bytes were sent because len was zero. scif_send() may be called
 * only when the endpoint is in a connected state.
 *
 * If a scif_send() call is non-blocking, then it sends only those bytes which
 * can be sent without waiting, up to a maximum of len bytes.
 *
 * If a scif_send() call is blocking, then it normally returns after sending
 * all len bytes. If a blocking call is interrupted or the connection is
 * reset, the call is considered successful if some bytes were sent or len is
 * zero, otherwise the call is considered unsuccessful.
 *
 * In user mode, the select() and poll() functions can be used to determine
 * when the send queue is not full. In kernel mode, the scif_poll() function
 * may be used for this purpose.
 *
 * It is recommended that scif_send()/scif_recv() only be used for short
 * control-type message communication between SCIF endpoints. The SCIF RMA
 * APIs are expected to provide better performance for transfer sizes of
 * 1024 bytes or longer for the current MIC hardware and software
 * implementation.
 *
 * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
 * is passed as the flags argument.
 *
 * Return:
 * Upon successful completion, scif_send() returns the number of bytes sent;
 * otherwise in user mode -1 is returned and errno is set to indicate the
 * error; in kernel mode the negative of one of the following errors is
 * returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - flags is invalid, or len is negative
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOMEM - Not enough space
 * ENOTCONN - The endpoint is not connected
 */
int scif_send(scif_epd_t epd, void *msg, int len, int flags);

/**
 * scif_recv() - Receive a message
 * @epd:	endpoint descriptor
 * @msg:	message buffer address
 * @len:	message buffer length
 * @flags:	blocking mode flags
 *
 * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
 * data are copied to memory starting at address msg. On successful execution
 * the return value of scif_recv() is the number of bytes that were received,
 * and is zero if no bytes were received because len was zero. scif_recv() may
 * be called only when the endpoint is in a connected state.
 *
 * If a scif_recv() call is non-blocking, then it receives only those bytes
 * which can be received without waiting, up to a maximum of len bytes.
 *
 * If a scif_recv() call is blocking, then it normally returns after receiving
 * all len bytes. If the blocking call was interrupted due to a disconnection,
 * subsequent calls to scif_recv() will copy all bytes received upto the point
 * of disconnection.
 *
 * In user mode, the select() and poll() functions can be used to determine
 * when data is available to be received. In kernel mode, the scif_poll()
 * function may be used for this purpose.
 *
 * It is recommended that scif_send()/scif_recv() only be used for short
 * control-type message communication between SCIF endpoints. The SCIF RMA
 * APIs are expected to provide better performance for transfer sizes of
 * 1024 bytes or longer for the current MIC hardware and software
 * implementation.
 *
 * scif_recv() will block until the entire message is received if
 * SCIF_RECV_BLOCK is passed as the flags argument.
 *
 * Return:
 * Upon successful completion, scif_recv() returns the number of bytes
 * received; otherwise in user mode -1 is returned and errno is set to
 * indicate the error; in kernel mode the negative of one of the following
 * errors is returned.
 *
 * Errors:
 * EAGAIN - The destination node is returning from a low power state
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - flags is invalid, or len is negative
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOMEM - Not enough space
 * ENOTCONN - The endpoint is not connected
 */
int scif_recv(scif_epd_t epd, void *msg, int len, int flags);

/**
 * scif_register() - Mark a memory region for remote access.
 * @epd:		endpoint descriptor
 * @addr:		starting virtual address
 * @len:		length of range
 * @offset:		offset of window
 * @prot_flags:		read/write protection flags
 * @map_flags:		mapping flags
 *
 * The scif_register() function opens a window, a range of whole pages of the
 * registered address space of the endpoint epd, starting at offset po and
 * continuing for len bytes. The value of po, further described below, is a
 * function of the parameters offset and len, and the value of map_flags. Each
 * page of the window represents the physical memory page which backs the
 * corresponding page of the range of virtual address pages starting at addr
 * and continuing for len bytes. addr and len are constrained to be multiples
 * of the page size. A successful scif_register() call returns po.
 *
 * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
 * exactly, and offset is constrained to be a multiple of the page size. The
 * mapping established by scif_register() will not replace any existing
 * registration; an error is returned if any page within the range [offset,
 * offset + len - 1] intersects an existing window.
 *
 * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
 * implementation-defined manner to arrive at po. The po value so chosen will
 * be an area of the registered address space that the implementation deems
 * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
 * granting the implementation complete freedom in selecting po, subject to
 * constraints described below. A non-zero value of offset is taken to be a
 * suggestion of an offset near which the mapping should be placed. When the
 * implementation selects a value for po, it does not replace any extant
 * window. In all cases, po will be a multiple of the page size.
 *
 * The physical pages which are so represented by a window are available for
 * access in calls to mmap(), scif_readfrom(), scif_writeto(),
 * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
 * physical pages represented by the window will not be reused by the memory
 * subsystem for any other purpose. Note that the same physical page may be
 * represented by multiple windows.
 *
 * Subsequent operations which change the memory pages to which virtual
 * addresses are mapped (such as mmap(), munmap()) have no effect on
 * existing window.
 *
 * If the process will fork(), it is recommended that the registered
 * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
 * problems due to copy-on-write semantics.
 *
 * The prot_flags argument is formed by OR'ing together one or more of the
 * following values.
 * SCIF_PROT_READ - allow read operations from the window
 * SCIF_PROT_WRITE - allow write operations to the window
 *
 * Return:
 * Upon successful completion, scif_register() returns the offset at which the
 * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
 * is (off_t *)-1) is returned and errno is set to indicate the error; in
 * kernel mode the negative of one of the following errors is returned.
 *
 * Errors:
 * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
 * [offset, offset + len -1] are already registered
 * EAGAIN - The mapping could not be performed due to lack of resources
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
 * set in flags, and offset is not a multiple of the page size, or addr is not a
 * multiple of the page size, or len is not a multiple of the page size, or is
 * 0, or offset is negative
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOMEM - Not enough space
 * ENOTCONN -The endpoint is not connected
 */
off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
		    int prot_flags, int map_flags);

/**
 * scif_unregister() - Mark a memory region for remote access.
 * @epd:	endpoint descriptor
 * @offset:	start of range to unregister
 * @len:	length of range to unregister
 *
 * The scif_unregister() function closes those previously registered windows
 * which are entirely within the range [offset, offset + len - 1]. It is an
 * error to specify a range which intersects only a subrange of a window.
 *
 * On a successful return, pages within the window may no longer be specified
 * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
 * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window,
 * however, continues to exist until all previous references against it are
 * removed. A window is referenced if there is a mapping to it created by
 * mmap(), or if scif_get_pages() was called against the window
 * (and the pages have not been returned via scif_put_pages()). A window is
 * also referenced while an RMA, in which some range of the window is a source
 * or destination, is in progress. Finally a window is referenced while some
 * offset in that window was specified to scif_fence_signal(), and the RMAs
 * marked by that call to scif_fence_signal() have not completed. While a
 * window is in this state, its registered address space pages are not
 * available for use in a new registered window.
 *
 * When all such references to the window have been removed, its references to
 * all the physical pages which it represents are removed. Similarly, the
 * registered address space pages of the window become available for
 * registration in a new window.
 *
 * Return:
 * Upon successful completion, scif_unregister() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned. In the event of an
 * error, no windows are unregistered.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a
 * window, or offset is negative
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
 * registered address space of epd
 */
int scif_unregister(scif_epd_t epd, off_t offset, size_t len);

/**
 * scif_readfrom() - Copy from a remote address space
 * @epd:	endpoint descriptor
 * @loffset:	offset in local registered address space to
 *		which to copy
 * @len:	length of range to copy
 * @roffset:	offset in remote registered address space
 *		from which to copy
 * @rma_flags:	transfer mode flags
 *
 * scif_readfrom() copies len bytes from the remote registered address space of
 * the peer of endpoint epd, starting at the offset roffset to the local
 * registered address space of epd, starting at the offset loffset.
 *
 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
 * roffset + len - 1] must be within some registered window or windows of the
 * local and remote nodes. A range may intersect multiple registered windows,
 * but only if those windows are contiguous in the registered address space.
 *
 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
 * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the
 * transfer is complete. Otherwise, the transfer may be performed asynchron-
 * ously. The order in which any two asynchronous RMA operations complete
 * is non-deterministic. The synchronization functions, scif_fence_mark()/
 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
 * the completion of asynchronous RMA operations on the same endpoint.
 *
 * The DMA transfer of individual bytes is not guaranteed to complete in
 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
 * cacheline or partial cacheline of the source range will become visible on
 * the destination node after all other transferred data in the source
 * range has become visible on the destination node.
 *
 * The optimal DMA performance will likely be realized if both
 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
 * performance will likely be realized if loffset and roffset are not
 * cacheline aligned but are separated by some multiple of 64. The lowest level
 * of performance is likely if loffset and roffset are not separated by a
 * multiple of 64.
 *
 * The rma_flags argument is formed by ORing together zero or more of the
 * following values.
 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
 *	engine.
 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
 *		transfer has completed. Passing this flag results in the
 *		current implementation busy waiting and consuming CPU cycles
 *		while the DMA transfer is in progress for best performance by
 *		avoiding the interrupt latency.
 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
 *		the source range becomes visible on the destination node
 *		after all other transferred data in the source range has
 *		become visible on the destination
 *
 * Return:
 * Upon successful completion, scif_readfrom() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EACCESS - Attempt to write to a read-only range
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - rma_flags is invalid
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
 * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
 * for the registered address space of the peer of epd, or loffset or roffset
 * is negative
 */
int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
		  roffset, int rma_flags);

/**
 * scif_writeto() - Copy to a remote address space
 * @epd:	endpoint descriptor
 * @loffset:	offset in local registered address space
 *		from which to copy
 * @len:	length of range to copy
 * @roffset:	offset in remote registered address space to
 *		which to copy
 * @rma_flags:	transfer mode flags
 *
 * scif_writeto() copies len bytes from the local registered address space of
 * epd, starting at the offset loffset to the remote registered address space
 * of the peer of endpoint epd, starting at the offset roffset.
 *
 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
 * roffset + len - 1] must be within some registered window or windows of the
 * local and remote nodes. A range may intersect multiple registered windows,
 * but only if those windows are contiguous in the registered address space.
 *
 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
 * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
 * transfer is complete. Otherwise, the transfer may be performed asynchron-
 * ously. The order in which any two asynchronous RMA operations complete
 * is non-deterministic. The synchronization functions, scif_fence_mark()/
 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
 * the completion of asynchronous RMA operations on the same endpoint.
 *
 * The DMA transfer of individual bytes is not guaranteed to complete in
 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
 * cacheline or partial cacheline of the source range will become visible on
 * the destination node after all other transferred data in the source
 * range has become visible on the destination node.
 *
 * The optimal DMA performance will likely be realized if both
 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
 * performance will likely be realized if loffset and roffset are not cacheline
 * aligned but are separated by some multiple of 64. The lowest level of
 * performance is likely if loffset and roffset are not separated by a multiple
 * of 64.
 *
 * The rma_flags argument is formed by ORing together zero or more of the
 * following values.
 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
 *			engine.
 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
 *		transfer has completed. Passing this flag results in the
 *		current implementation busy waiting and consuming CPU cycles
 *		while the DMA transfer is in progress for best performance by
 *		avoiding the interrupt latency.
 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
 *		the source range becomes visible on the destination node
 *		after all other transferred data in the source range has
 *		become visible on the destination
 *
 * Return:
 * Upon successful completion, scif_readfrom() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EACCESS - Attempt to write to a read-only range
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - rma_flags is invalid
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
 * address space of epd, or, The range [roffset , roffset + len -1] is invalid
 * for the registered address space of the peer of epd, or loffset or roffset
 * is negative
 */
int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
		 roffset, int rma_flags);

/**
 * scif_vreadfrom() - Copy from a remote address space
 * @epd:	endpoint descriptor
 * @addr:	address to which to copy
 * @len:	length of range to copy
 * @roffset:	offset in remote registered address space
 *		from which to copy
 * @rma_flags:	transfer mode flags
 *
 * scif_vreadfrom() copies len bytes from the remote registered address
 * space of the peer of endpoint epd, starting at the offset roffset, to local
 * memory, starting at addr.
 *
 * The specified range [roffset, roffset + len - 1] must be within some
 * registered window or windows of the remote nodes. The range may
 * intersect multiple registered windows, but only if those windows are
 * contiguous in the registered address space.
 *
 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
 * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
 * transfer is complete. Otherwise, the transfer may be performed asynchron-
 * ously. The order in which any two asynchronous RMA operations complete
 * is non-deterministic. The synchronization functions, scif_fence_mark()/
 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
 * the completion of asynchronous RMA operations on the same endpoint.
 *
 * The DMA transfer of individual bytes is not guaranteed to complete in
 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
 * cacheline or partial cacheline of the source range will become visible on
 * the destination node after all other transferred data in the source
 * range has become visible on the destination node.
 *
 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
 * the specified local memory range may be remain in a pinned state even after
 * the specified transfer completes. This may reduce overhead if some or all of
 * the same virtual address range is referenced in a subsequent call of
 * scif_vreadfrom() or scif_vwriteto().
 *
 * The optimal DMA performance will likely be realized if both
 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
 * performance will likely be realized if addr and roffset are not
 * cacheline aligned but are separated by some multiple of 64. The lowest level
 * of performance is likely if addr and roffset are not separated by a
 * multiple of 64.
 *
 * The rma_flags argument is formed by ORing together zero or more of the
 * following values.
 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
 *	engine.
 * SCIF_RMA_USECACHE - enable registration caching
 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
 *		transfer has completed. Passing this flag results in the
 *		current implementation busy waiting and consuming CPU cycles
 *		while the DMA transfer is in progress for best performance by
 *		avoiding the interrupt latency.
 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
 *	the source range becomes visible on the destination node
 *	after all other transferred data in the source range has
 *	become visible on the destination
 *
 * Return:
 * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EACCESS - Attempt to write to a read-only range
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - rma_flags is invalid
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
 * registered address space of epd
 */
int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
		   int rma_flags);

/**
 * scif_vwriteto() - Copy to a remote address space
 * @epd:	endpoint descriptor
 * @addr:	address from which to copy
 * @len:	length of range to copy
 * @roffset:	offset in remote registered address space to
 *		which to copy
 * @rma_flags:	transfer mode flags
 *
 * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
 * the remote registered address space of the peer of endpoint epd, starting at
 * the offset roffset.
 *
 * The specified range [roffset, roffset + len - 1] must be within some
 * registered window or windows of the remote nodes. The range may intersect
 * multiple registered windows, but only if those windows are contiguous in the
 * registered address space.
 *
 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
 * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
 * transfer is complete. Otherwise, the transfer may be performed asynchron-
 * ously. The order in which any two asynchronous RMA operations complete
 * is non-deterministic. The synchronization functions, scif_fence_mark()/
 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
 * the completion of asynchronous RMA operations on the same endpoint.
 *
 * The DMA transfer of individual bytes is not guaranteed to complete in
 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
 * cacheline or partial cacheline of the source range will become visible on
 * the destination node after all other transferred data in the source
 * range has become visible on the destination node.
 *
 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
 * the specified local memory range may be remain in a pinned state even after
 * the specified transfer completes. This may reduce overhead if some or all of
 * the same virtual address range is referenced in a subsequent call of
 * scif_vreadfrom() or scif_vwriteto().
 *
 * The optimal DMA performance will likely be realized if both
 * addr and offset are cacheline aligned (are a multiple of 64). Lower
 * performance will likely be realized if addr and offset are not cacheline
 * aligned but are separated by some multiple of 64. The lowest level of
 * performance is likely if addr and offset are not separated by a multiple of
 * 64.
 *
 * The rma_flags argument is formed by ORing together zero or more of the
 * following values.
 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
 *	engine.
 * SCIF_RMA_USECACHE - allow registration caching
 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
 *		transfer has completed. Passing this flag results in the
 *		current implementation busy waiting and consuming CPU cycles
 *		while the DMA transfer is in progress for best performance by
 *		avoiding the interrupt latency.
 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
 *		the source range becomes visible on the destination node
 *		after all other transferred data in the source range has
 *		become visible on the destination
 *
 * Return:
 * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EACCESS - Attempt to write to a read-only range
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - rma_flags is invalid
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
 * registered address space of epd
 */
int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
		  int rma_flags);

/**
 * scif_fence_mark() - Mark previously issued RMAs
 * @epd:	endpoint descriptor
 * @flags:	control flags
 * @mark:	marked value returned as output.
 *
 * scif_fence_mark() returns after marking the current set of all uncompleted
 * RMAs initiated through the endpoint epd or the current set of all
 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
 * marked with a value returned at mark. The application may subsequently call
 * scif_fence_wait(), passing the value returned at mark, to await completion
 * of all RMAs so marked.
 *
 * The flags argument has exactly one of the following values.
 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
 *	epd are marked
 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
 *	of endpoint epd are marked
 *
 * Return:
 * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - flags is invalid
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENOMEM - Insufficient kernel memory was available
 */
int scif_fence_mark(scif_epd_t epd, int flags, int *mark);

/**
 * scif_fence_wait() - Wait for completion of marked RMAs
 * @epd:	endpoint descriptor
 * @mark:	mark request
 *
 * scif_fence_wait() returns after all RMAs marked with mark have completed.
 * The value passed in mark must have been obtained in a previous call to
 * scif_fence_mark().
 *
 * Return:
 * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
 * mode -1 is returned and errno is set to indicate the error; in kernel mode
 * the negative of one of the following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENOMEM - Insufficient kernel memory was available
 */
int scif_fence_wait(scif_epd_t epd, int mark);

/**
 * scif_fence_signal() - Request a memory update on completion of RMAs
 * @epd:	endpoint descriptor
 * @loff:	local offset
 * @lval:	local value to write to loffset
 * @roff:	remote offset
 * @rval:	remote value to write to roffset
 * @flags:	flags
 *
 * scif_fence_signal() returns after marking the current set of all uncompleted
 * RMAs initiated through the endpoint epd or marking the current set of all
 * uncompleted RMAs initiated through the peer of endpoint epd.
 *
 * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
 * marked set, lval is written to memory at the address corresponding to offset
 * loff in the local registered address space of epd. loff must be within a
 * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
 * of the RMAs in the marked set, rval is written to memory at the address
 * corresponding to offset roff in the remote registered address space of epd.
 * roff must be within a remote registered window of the peer of epd. Note
 * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
 *
 * The flags argument is formed by OR'ing together the following.
 * Exactly one of the following values.
 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
 *	epd are marked
 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
 *	of endpoint epd are marked
 * One or more of the following values.
 * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
 *	memory at the address corresponding to offset loff in the local
 *	registered address space of epd.
 * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
 *	memory at the address corresponding to offset roff in the remote
 *	registered address space of epd.
 *
 * Return:
 * Upon successful completion, scif_fence_signal() returns 0; otherwise in
 * user mode -1 is returned and errno is set to indicate the error; in kernel
 * mode the negative of one of the following errors is returned.
 *
 * Errors:
 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
 * ECONNRESET - Connection reset by peer
 * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - loff is invalid for the registered address of epd, or roff is invalid
 * for the registered address space, of the peer of epd
 */
int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
		      u64 rval, int flags);

/**
 * scif_get_node_ids() - Return information about online nodes
 * @nodes:	array in which to return online node IDs
 * @len:	number of entries in the nodes array
 * @self:	address to place the node ID of the local node
 *
 * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
 * nodes in the SCIF network. If there is not enough space in nodes, as
 * indicated by the len parameter, only len node IDs are returned in nodes. The
 * return value of scif_get_node_ids() is the total number of nodes currently in
 * the SCIF network. By checking the return value against the len parameter,
 * the user may determine if enough space for nodes was allocated.
 *
 * The node ID of the local node is returned at self.
 *
 * Return:
 * Upon successful completion, scif_get_node_ids() returns the actual number of
 * online nodes in the SCIF network including 'self'; otherwise in user mode
 * -1 is returned and errno is set to indicate the error; in kernel mode no
 * errors are returned.
 */
int scif_get_node_ids(u16 *nodes, int len, u16 *self);

/**
 * scif_pin_pages() - Pin a set of pages
 * @addr:		Virtual address of range to pin
 * @len:		Length of range to pin
 * @prot_flags:		Page protection flags
 * @map_flags:		Page classification flags
 * @pinned_pages:	Handle to pinned pages
 *
 * scif_pin_pages() pins (locks in physical memory) the physical pages which
 * back the range of virtual address pages starting at addr and continuing for
 * len bytes. addr and len are constrained to be multiples of the page size. A
 * successful scif_pin_pages() call returns a handle to pinned_pages which may
 * be used in subsequent calls to scif_register_pinned_pages().
 *
 * The pages will remain pinned as long as there is a reference against the
 * scif_pinned_pages_t value returned by scif_pin_pages() and until
 * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A
 * reference is added to a scif_pinned_pages_t value each time a window is
 * created by calling scif_register_pinned_pages() and passing the
 * scif_pinned_pages_t value. A reference is removed from a
 * scif_pinned_pages_t value each time such a window is deleted.
 *
 * Subsequent operations which change the memory pages to which virtual
 * addresses are mapped (such as mmap(), munmap()) have no effect on the
 * scif_pinned_pages_t value or windows created against it.
 *
 * If the process will fork(), it is recommended that the registered
 * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
 * problems due to copy-on-write semantics.
 *
 * The prot_flags argument is formed by OR'ing together one or more of the
 * following values.
 * SCIF_PROT_READ - allow read operations against the pages
 * SCIF_PROT_WRITE - allow write operations against the pages
 * The map_flags argument can be set as SCIF_MAP_KERNEL to interpret addr as a
 * kernel space address. By default, addr is interpreted as a user space
 * address.
 *
 * Return:
 * Upon successful completion, scif_pin_pages() returns 0; otherwise the
 * negative of one of the following errors is returned.
 *
 * Errors:
 * EINVAL - prot_flags is invalid, map_flags is invalid, or offset is negative
 * ENOMEM - Not enough space
 */
int scif_pin_pages(void *addr, size_t len, int prot_flags, int map_flags,
		   scif_pinned_pages_t *pinned_pages);

/**
 * scif_unpin_pages() - Unpin a set of pages
 * @pinned_pages:	Handle to pinned pages to be unpinned
 *
 * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new
 * windows against pinned_pages. The physical pages represented by pinned_pages
 * will remain pinned until all windows previously registered against
 * pinned_pages are deleted (the window is scif_unregister()'d and all
 * references to the window are removed (see scif_unregister()).
 *
 * pinned_pages must have been obtain from a previous call to scif_pin_pages().
 * After calling scif_unpin_pages(), it is an error to pass pinned_pages to
 * scif_register_pinned_pages().
 *
 * Return:
 * Upon successful completion, scif_unpin_pages() returns 0; otherwise the
 * negative of one of the following errors is returned.
 *
 * Errors:
 * EINVAL - pinned_pages is not valid
 */
int scif_unpin_pages(scif_pinned_pages_t pinned_pages);

/**
 * scif_register_pinned_pages() - Mark a memory region for remote access.
 * @epd:		endpoint descriptor
 * @pinned_pages:	Handle to pinned pages
 * @offset:		Registered address space offset
 * @map_flags:		Flags which control where pages are mapped
 *
 * The scif_register_pinned_pages() function opens a window, a range of whole
 * pages of the registered address space of the endpoint epd, starting at
 * offset po. The value of po, further described below, is a function of the
 * parameters offset and pinned_pages, and the value of map_flags. Each page of
 * the window represents a corresponding physical memory page of the range
 * represented by pinned_pages; the length of the window is the same as the
 * length of range represented by pinned_pages. A successful
 * scif_register_pinned_pages() call returns po as the return value.
 *
 * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
 * exactly, and offset is constrained to be a multiple of the page size. The
 * mapping established by scif_register_pinned_pages() will not replace any
 * existing registration; an error is returned if any page of the new window
 * would intersect an existing window.
 *
 * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
 * implementation-defined manner to arrive at po. The po so chosen will be an
 * area of the registered address space that the implementation deems suitable
 * for a mapping of the required size. An offset value of 0 is interpreted as
 * granting the implementation complete freedom in selecting po, subject to
 * constraints described below. A non-zero value of offset is taken to be a
 * suggestion of an offset near which the mapping should be placed. When the
 * implementation selects a value for po, it does not replace any extant
 * window. In all cases, po will be a multiple of the page size.
 *
 * The physical pages which are so represented by a window are available for
 * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(),
 * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
 * physical pages represented by the window will not be reused by the memory
 * subsystem for any other purpose. Note that the same physical page may be
 * represented by multiple windows.
 *
 * Windows created by scif_register_pinned_pages() are unregistered by
 * scif_unregister().
 *
 * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a
 * fixed offset.
 *
 * Return:
 * Upon successful completion, scif_register_pinned_pages() returns the offset
 * at which the mapping was placed (po); otherwise the negative of one of the
 * following errors is returned.
 *
 * Errors:
 * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags and pages in the new window
 * would intersect an existing window
 * EAGAIN - The mapping could not be performed due to lack of resources
 * ECONNRESET - Connection reset by peer
 * EINVAL - map_flags is invalid, or SCIF_MAP_FIXED is set in map_flags, and
 * offset is not a multiple of the page size, or offset is negative
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOMEM - Not enough space
 * ENOTCONN - The endpoint is not connected
 */
off_t scif_register_pinned_pages(scif_epd_t epd,
				 scif_pinned_pages_t pinned_pages,
				 off_t offset, int map_flags);

/**
 * scif_get_pages() - Add references to remote registered pages
 * @epd:	endpoint descriptor
 * @offset:	remote registered offset
 * @len:	length of range of pages
 * @pages:	returned scif_range structure
 *
 * scif_get_pages() returns the addresses of the physical pages represented by
 * those pages of the registered address space of the peer of epd, starting at
 * offset and continuing for len bytes. offset and len are constrained to be
 * multiples of the page size.
 *
 * All of the pages in the specified range [offset, offset + len - 1] must be
 * within a single window of the registered address space of the peer of epd.
 *
 * The addresses are returned as a virtually contiguous array pointed to by the
 * phys_addr component of the scif_range structure whose address is returned in
 * pages. The nr_pages component of scif_range is the length of the array. The
 * prot_flags component of scif_range holds the protection flag value passed
 * when the pages were registered.
 *
 * Each physical page whose address is returned by scif_get_pages() remains
 * available and will not be released for reuse until the scif_range structure
 * is returned in a call to scif_put_pages(). The scif_range structure returned
 * by scif_get_pages() must be unmodified.
 *
 * It is an error to call scif_close() on an endpoint on which a scif_range
 * structure of that endpoint has not been returned to scif_put_pages().
 *
 * Return:
 * Upon successful completion, scif_get_pages() returns 0; otherwise the
 * negative of one of the following errors is returned.
 * Errors:
 * ECONNRESET - Connection reset by peer.
 * EINVAL - offset is not a multiple of the page size, or offset is negative, or
 * len is not a multiple of the page size
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid
 * for the registered address space of the peer epd
 */
int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
		   struct scif_range **pages);

/**
 * scif_put_pages() - Remove references from remote registered pages
 * @pages:	pages to be returned
 *
 * scif_put_pages() releases a scif_range structure previously obtained by
 * calling scif_get_pages(). The physical pages represented by pages may
 * be reused when the window which represented those pages is unregistered.
 * Therefore, those pages must not be accessed after calling scif_put_pages().
 *
 * Return:
 * Upon successful completion, scif_put_pages() returns 0; otherwise the
 * negative of one of the following errors is returned.
 * Errors:
 * EINVAL - pages does not point to a valid scif_range structure, or
 * the scif_range structure pointed to by pages was already returned
 * ENODEV - The remote node is lost or existed, but is not currently in the
 * network since it may have crashed
 * ENOTCONN - The endpoint is not connected
 */
int scif_put_pages(struct scif_range *pages);

/**
 * scif_poll() - Wait for some event on an endpoint
 * @epds:	Array of endpoint descriptors
 * @nepds:	Length of epds
 * @timeout:	Upper limit on time for which scif_poll() will block
 *
 * scif_poll() waits for one of a set of endpoints to become ready to perform
 * an I/O operation.
 *
 * The epds argument specifies the endpoint descriptors to be examined and the
 * events of interest for each endpoint descriptor. epds is a pointer to an
 * array with one member for each open endpoint descriptor of interest.
 *
 * The number of items in the epds array is specified in nepds. The epd field
 * of scif_pollepd is an endpoint descriptor of an open endpoint. The field
 * events is a bitmask specifying the events which the application is
 * interested in. The field revents is an output parameter, filled by the
 * kernel with the events that actually occurred. The bits returned in revents
 * can include any of those specified in events, or one of the values EPOLLERR,
 * EPOLLHUP, or EPOLLNVAL. (These three bits are meaningless in the events
 * field, and will be set in the revents field whenever the corresponding
 * condition is true.)
 *
 * If none of the events requested (and no error) has occurred for any of the
 * endpoint descriptors, then scif_poll() blocks until one of the events occurs.
 *
 * The timeout argument specifies an upper limit on the time for which
 * scif_poll() will block, in milliseconds. Specifying a negative value in
 * timeout means an infinite timeout.
 *
 * The following bits may be set in events and returned in revents.
 * EPOLLIN - Data may be received without blocking. For a connected
 * endpoint, this means that scif_recv() may be called without blocking. For a
 * listening endpoint, this means that scif_accept() may be called without
 * blocking.
 * EPOLLOUT - Data may be sent without blocking. For a connected endpoint, this
 * means that scif_send() may be called without blocking. EPOLLOUT may also be
 * used to block waiting for a non-blocking connect to complete. This bit value
 * has no meaning for a listening endpoint and is ignored if specified.
 *
 * The following bits are only returned in revents, and are ignored if set in
 * events.
 * EPOLLERR - An error occurred on the endpoint
 * EPOLLHUP - The connection to the peer endpoint was disconnected
 * EPOLLNVAL - The specified endpoint descriptor is invalid.
 *
 * Return:
 * Upon successful completion, scif_poll() returns a non-negative value. A
 * positive value indicates the total number of endpoint descriptors that have
 * been selected (that is, endpoint descriptors for which the revents member is
 * non-zero). A value of 0 indicates that the call timed out and no endpoint
 * descriptors have been selected. Otherwise in user mode -1 is returned and
 * errno is set to indicate the error; in kernel mode the negative of one of
 * the following errors is returned.
 *
 * Errors:
 * EINTR - A signal occurred before any requested event
 * EINVAL - The nepds argument is greater than {OPEN_MAX}
 * ENOMEM - There was no space to allocate file descriptor tables
 */
int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout);

/**
 * scif_client_register() - Register a SCIF client
 * @client:	client to be registered
 *
 * scif_client_register() registers a SCIF client. The probe() method
 * of the client is called when SCIF peer devices come online and the
 * remove() method is called when the peer devices disappear.
 *
 * Return:
 * Upon successful completion, scif_client_register() returns a non-negative
 * value. Otherwise the return value is the same as subsys_interface_register()
 * in the kernel.
 */
int scif_client_register(struct scif_client *client);

/**
 * scif_client_unregister() - Unregister a SCIF client
 * @client:	client to be unregistered
 *
 * scif_client_unregister() unregisters a SCIF client.
 *
 * Return:
 * None
 */
void scif_client_unregister(struct scif_client *client);

#endif /* __SCIF_H__ */