#ifndef _RDS_RDS_H
#define _RDS_RDS_H

#include <net/sock.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <rdma/rdma_cm.h>
#include <linux/mutex.h>
#include <linux/rds.h>

#include "info.h"
12
13
14
15
/*
 * RDS protocol version numbers.
 *
 * A version is packed as (major << 8) | minor; RDS_PROTOCOL_MAJOR() and
 * RDS_PROTOCOL_MINOR() extract the two halves again.
 */
#define RDS_PROTOCOL_3_0	0x0300
#define RDS_PROTOCOL_3_1	0x0301
#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
/*
 * Both arguments are parenthesized so that an expression argument with an
 * operator of lower precedence than '|' (e.g. a conditional) still yields
 * the intended packed value.
 */
#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | (min))
22
23
24
25
26
27
28
/* Port number constant used by RDS. */
#define RDS_PORT	18634

/*
 * Define KERNEL_HAS_ATOMIC64 whenever the kernel headers provide
 * ATOMIC64_INIT, so other code can test a single RDS-local symbol.
 */
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
34
/*
 * rdsdebug() - debug logging helper.
 *
 * With DEBUG defined this expands to pr_debug(), prefixing the message with
 * the name of the calling function.  Without DEBUG it is an empty inline
 * function that still carries the printf format attribute, so format
 * strings keep being checked against their arguments and the arguments
 * still count as "used" in non-debug builds.
 *
 * The format parameter is const-qualified: the function never writes
 * through it, and this lets callers pass const strings without a cast.
 */
#ifdef DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
static inline void __attribute__ ((format (printf, 1, 2)))
rdsdebug(const char *fmt, ...)
{
}
#endif
44
45
/*
 * ceil(x, y) - divide x by y, rounding the quotient up.
 *
 * Both arguments are converted to unsigned long and evaluated exactly once
 * (GNU statement expression).  y must be non-zero.
 */
#define ceil(x, y) \
	({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
48
/* Fragment size: 1 << RDS_FRAG_SHIFT bytes (4096), as an unsigned int. */
#define RDS_FRAG_SHIFT	12
#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))

/*
 * Congestion map geometry.  The map holds 65536 bits (8192 bytes); the
 * remaining macros express that size in longs, in pages, and give the
 * number of bits held by a single page.
 * NOTE(review): 65536 presumably corresponds to one bit per 16-bit port,
 * matching the __be16 port arguments of rds_cong_set_bit() - confirm.
 */
#define RDS_CONG_MAP_BYTES	(65536 / 8)
#define RDS_CONG_MAP_LONGS	(RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
56
57struct rds_cong_map {
58 struct rb_node m_rb_node;
59 __be32 m_addr;
60 wait_queue_head_t m_waitq;
61 struct list_head m_conn_list;
62 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
63};
64
65
66
67
68
69
70
71
/*
 * Connection states.  The current state is stored in
 * rds_connection.c_state (an atomic_t) and is changed with
 * rds_conn_transition().
 */
enum {
	RDS_CONN_DOWN = 0,
	RDS_CONN_CONNECTING,
	RDS_CONN_DISCONNECTING,
	RDS_CONN_UP,
	RDS_CONN_ERROR,
};
79
80
/* NOTE(review): presumably bit numbers within rds_connection.c_flags - confirm. */
#define RDS_LL_SEND_FULL	0
#define RDS_RECONNECT_PENDING	1
83
84struct rds_connection {
85 struct hlist_node c_hash_node;
86 __be32 c_laddr;
87 __be32 c_faddr;
88 unsigned int c_loopback:1;
89 struct rds_connection *c_passive;
90
91 struct rds_cong_map *c_lcong;
92 struct rds_cong_map *c_fcong;
93
94 struct mutex c_send_lock;
95 struct rds_message *c_xmit_rm;
96 unsigned long c_xmit_sg;
97 unsigned int c_xmit_hdr_off;
98 unsigned int c_xmit_data_off;
99 unsigned int c_xmit_rdma_sent;
100
101 spinlock_t c_lock;
102 u64 c_next_tx_seq;
103 struct list_head c_send_queue;
104 struct list_head c_retrans;
105
106 u64 c_next_rx_seq;
107
108 struct rds_transport *c_trans;
109 void *c_transport_data;
110
111 atomic_t c_state;
112 unsigned long c_flags;
113 unsigned long c_reconnect_jiffies;
114 struct delayed_work c_send_w;
115 struct delayed_work c_recv_w;
116 struct delayed_work c_conn_w;
117 struct work_struct c_down_w;
118 struct mutex c_cm_lock;
119
120 struct list_head c_map_item;
121 unsigned long c_map_queued;
122 unsigned long c_map_offset;
123 unsigned long c_map_bytes;
124
125 unsigned int c_unacked_packets;
126 unsigned int c_unacked_bytes;
127
128
129 unsigned int c_version;
130};
131
/* NOTE(review): presumably bit masks for rds_header.h_flags - confirm. */
#define RDS_FLAG_CONG_BITMAP	0x01
#define RDS_FLAG_ACK_REQUIRED	0x02
#define RDS_FLAG_RETRANSMITTED	0x04
/* 255 is the largest value that fits the u8 rds_header.h_credit field. */
#define RDS_MAX_ADV_CREDIT	255
136
137
138
139
140#define RDS_HEADER_EXT_SPACE 16
141
142struct rds_header {
143 __be64 h_sequence;
144 __be64 h_ack;
145 __be32 h_len;
146 __be16 h_sport;
147 __be16 h_dport;
148 u8 h_flags;
149 u8 h_credit;
150 u8 h_padding[4];
151 __sum16 h_csum;
152
153 u8 h_exthdr[RDS_HEADER_EXT_SPACE];
154};
155
156
157
158
/* Extension header type 0: no extension. */
#define RDS_EXTHDR_NONE		0
160
161
162
163
164
165
166
167
168
169
170
171#define RDS_EXTHDR_VERSION 1
172struct rds_ext_header_version {
173 __be32 h_version;
174};
175
176
177
178
179
180#define RDS_EXTHDR_RDMA 2
181struct rds_ext_header_rdma {
182 __be32 h_rdma_rkey;
183};
184
185
186
187
188
189
190#define RDS_EXTHDR_RDMA_DEST 3
191struct rds_ext_header_rdma_dest {
192 __be32 h_rdma_rkey;
193 __be32 h_rdma_offset;
194};
195
/* NOTE(review): presumably the exclusive upper bound for RDS_EXTHDR_* type values - confirm. */
#define __RDS_EXTHDR_MAX	16
197
198struct rds_incoming {
199 atomic_t i_refcount;
200 struct list_head i_item;
201 struct rds_connection *i_conn;
202 struct rds_header i_hdr;
203 unsigned long i_rx_jiffies;
204 __be32 i_saddr;
205
206 rds_rdma_cookie_t i_rdma_cookie;
207};
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/* NOTE(review): presumably bit numbers within rds_message.m_flags - confirm. */
#define RDS_MSG_ON_SOCK		1
#define RDS_MSG_ON_CONN		2
#define RDS_MSG_HAS_ACK_SEQ	3
#define RDS_MSG_ACK_REQUIRED	4
#define RDS_MSG_RETRANSMITTED	5
#define RDS_MSG_MAPPED		6
#define RDS_MSG_PAGEVEC		7
245
246struct rds_message {
247 atomic_t m_refcount;
248 struct list_head m_sock_item;
249 struct list_head m_conn_item;
250 struct rds_incoming m_inc;
251 u64 m_ack_seq;
252 __be32 m_daddr;
253 unsigned long m_flags;
254
255
256
257
258
259
260 spinlock_t m_rs_lock;
261 struct rds_sock *m_rs;
262 struct rds_rdma_op *m_rdma_op;
263 rds_rdma_cookie_t m_rdma_cookie;
264 struct rds_mr *m_rdma_mr;
265 unsigned int m_nents;
266 unsigned int m_count;
267 struct scatterlist m_sg[0];
268};
269
270
271
272
273
274
275
276
277struct rds_notifier {
278 struct list_head n_list;
279 uint64_t n_user_token;
280 int n_status;
281};
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/*
 * Transport type identifiers.  RDS_TRANS_COUNT is the number of types
 * defined.  NOTE(review): presumably the values stored in
 * rds_transport.t_type - confirm.
 */
#define RDS_TRANS_IB	0
#define RDS_TRANS_IWARP	1
#define RDS_TRANS_TCP	2
#define RDS_TRANS_COUNT	3
318
319struct rds_transport {
320 char t_name[TRANSNAMSIZ];
321 struct list_head t_item;
322 struct module *t_owner;
323 unsigned int t_prefer_loopback:1;
324 unsigned int t_type;
325
326 int (*laddr_check)(__be32 addr);
327 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
328 void (*conn_free)(void *data);
329 int (*conn_connect)(struct rds_connection *conn);
330 void (*conn_shutdown)(struct rds_connection *conn);
331 void (*xmit_prepare)(struct rds_connection *conn);
332 void (*xmit_complete)(struct rds_connection *conn);
333 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
334 unsigned int hdr_off, unsigned int sg, unsigned int off);
335 int (*xmit_cong_map)(struct rds_connection *conn,
336 struct rds_cong_map *map, unsigned long offset);
337 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
338 int (*recv)(struct rds_connection *conn);
339 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
340 size_t size);
341 void (*inc_purge)(struct rds_incoming *inc);
342 void (*inc_free)(struct rds_incoming *inc);
343
344 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
345 struct rdma_cm_event *event);
346 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
347 void (*cm_connect_complete)(struct rds_connection *conn,
348 struct rdma_cm_event *event);
349
350 unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
351 unsigned int avail);
352 void (*exit)(void);
353 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
354 struct rds_sock *rs, u32 *key_ret);
355 void (*sync_mr)(void *trans_private, int direction);
356 void (*free_mr)(void *trans_private, int invalidate);
357 void (*flush_mrs)(void);
358};
359
360struct rds_sock {
361 struct sock rs_sk;
362
363 u64 rs_user_addr;
364 u64 rs_user_bytes;
365
366
367
368
369
370 struct rb_node rs_bound_node;
371 __be32 rs_bound_addr;
372 __be32 rs_conn_addr;
373 __be16 rs_bound_port;
374 __be16 rs_conn_port;
375
376
377
378
379
380
381 struct rds_transport *rs_transport;
382
383
384
385
386
387 struct rds_connection *rs_conn;
388
389
390 int rs_congested;
391
392
393 spinlock_t rs_lock;
394 struct list_head rs_send_queue;
395 u32 rs_snd_bytes;
396 int rs_rcv_bytes;
397 struct list_head rs_notify_queue;
398
399
400
401
402
403
404 uint64_t rs_cong_mask;
405 uint64_t rs_cong_notify;
406 struct list_head rs_cong_list;
407 unsigned long rs_cong_track;
408
409
410
411
412
413 rwlock_t rs_recv_lock;
414 struct list_head rs_recv_queue;
415
416
417 struct list_head rs_item;
418
419
420 spinlock_t rs_rdma_lock;
421 struct rb_root rs_rdma_keys;
422
423
424 unsigned char rs_recverr,
425 rs_cong_monitor;
426};
427
428static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
429{
430 return container_of(sk, struct rds_sock, rs_sk);
431}
432static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
433{
434 return &rs->rs_sk;
435}
436
437
438
439
440
441
442static inline int rds_sk_sndbuf(struct rds_sock *rs)
443{
444 return rds_rs_to_sk(rs)->sk_sndbuf / 2;
445}
446static inline int rds_sk_rcvbuf(struct rds_sock *rs)
447{
448 return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
449}
450
/*
 * RDS event counters.  Every member is a 64-bit counter; a per-CPU
 * instance named rds_stats is declared in this header and updated through
 * rds_stats_inc() and rds_stats_add().
 */
struct rds_statistics {
	/* Connection and receive path. */
	uint64_t	s_conn_reset;
	uint64_t	s_recv_drop_bad_checksum;
	uint64_t	s_recv_drop_old_seq;
	uint64_t	s_recv_drop_no_sock;
	uint64_t	s_recv_drop_dead_sock;
	uint64_t	s_recv_deliver_raced;
	uint64_t	s_recv_delivered;
	uint64_t	s_recv_queued;
	uint64_t	s_recv_immediate_retry;
	uint64_t	s_recv_delayed_retry;
	uint64_t	s_recv_ack_required;
	uint64_t	s_recv_rdma_bytes;
	uint64_t	s_recv_ping;
	/* Send path. */
	uint64_t	s_send_queue_empty;
	uint64_t	s_send_queue_full;
	uint64_t	s_send_sem_contention;
	uint64_t	s_send_sem_queue_raced;
	uint64_t	s_send_immediate_retry;
	uint64_t	s_send_delayed_retry;
	uint64_t	s_send_drop_acked;
	uint64_t	s_send_ack_required;
	uint64_t	s_send_queued;
	uint64_t	s_send_rdma;
	uint64_t	s_send_rdma_bytes;
	uint64_t	s_send_pong;
	/* Page helpers and user copies. */
	uint64_t	s_page_remainder_hit;
	uint64_t	s_page_remainder_miss;
	uint64_t	s_copy_to_user;
	uint64_t	s_copy_from_user;
	/* Congestion updates. */
	uint64_t	s_cong_update_queued;
	uint64_t	s_cong_update_received;
	uint64_t	s_cong_send_error;
	uint64_t	s_cong_send_blocked;
};
486
487
/* Socket reference counting and wake-up helpers. */
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
491static inline void __rds_wake_sk_sleep(struct sock *sk)
492{
493 wait_queue_head_t *waitq = sk->sk_sleep;
494
495 if (!sock_flag(sk, SOCK_DEAD) && waitq)
496 wake_up(waitq);
497}
498extern wait_queue_head_t rds_poll_waitq;
499
500
501
502int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
503void rds_remove_bound(struct rds_sock *rs);
504struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
505
506
507int rds_cong_get_maps(struct rds_connection *conn);
508void rds_cong_add_conn(struct rds_connection *conn);
509void rds_cong_remove_conn(struct rds_connection *conn);
510void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
511void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
512int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
513void rds_cong_queue_updates(struct rds_cong_map *map);
514void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
515int rds_cong_updated_since(unsigned long *recent);
516void rds_cong_add_socket(struct rds_sock *);
517void rds_cong_remove_socket(struct rds_sock *);
518void rds_cong_exit(void);
519struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
520
521
522int __init rds_conn_init(void);
523void rds_conn_exit(void);
524struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
525 struct rds_transport *trans, gfp_t gfp);
526struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
527 struct rds_transport *trans, gfp_t gfp);
528void rds_conn_destroy(struct rds_connection *conn);
529void rds_conn_reset(struct rds_connection *conn);
530void rds_conn_drop(struct rds_connection *conn);
531void rds_for_each_conn_info(struct socket *sock, unsigned int len,
532 struct rds_info_iterator *iter,
533 struct rds_info_lengths *lens,
534 int (*visitor)(struct rds_connection *, void *),
535 size_t item_len);
/*
 * Report an error on a connection using a printf-style format.
 *
 * rds_conn_error() prepends KERN_WARNING "RDS: " to the format by string
 * literal concatenation, so the format argument must be a string literal.
 */
void __rds_conn_error(struct rds_connection *conn, const char *, ...)
				__attribute__ ((format (printf, 2, 3)));
#define rds_conn_error(conn, fmt...) \
	__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
540
541static inline int
542rds_conn_transition(struct rds_connection *conn, int old, int new)
543{
544 return atomic_cmpxchg(&conn->c_state, old, new) == old;
545}
546
547static inline int
548rds_conn_state(struct rds_connection *conn)
549{
550 return atomic_read(&conn->c_state);
551}
552
553static inline int
554rds_conn_up(struct rds_connection *conn)
555{
556 return atomic_read(&conn->c_state) == RDS_CONN_UP;
557}
558
559static inline int
560rds_conn_connecting(struct rds_connection *conn)
561{
562 return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
563}
564
565
566struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
567struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
568 size_t total_len);
569struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
570void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
571 __be16 dport, u64 seq);
572int rds_message_add_extension(struct rds_header *hdr,
573 unsigned int type, const void *data, unsigned int len);
574int rds_message_next_extension(struct rds_header *hdr,
575 unsigned int *pos, void *buf, unsigned int *buflen);
576int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
577int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
578int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
579int rds_message_inc_copy_to_user(struct rds_incoming *inc,
580 struct iovec *first_iov, size_t size);
581void rds_message_inc_purge(struct rds_incoming *inc);
582void rds_message_inc_free(struct rds_incoming *inc);
583void rds_message_addref(struct rds_message *rm);
584void rds_message_put(struct rds_message *rm);
585void rds_message_wait(struct rds_message *rm);
586void rds_message_unmapped(struct rds_message *rm);
587
588static inline void rds_message_make_checksum(struct rds_header *hdr)
589{
590 hdr->h_csum = 0;
591 hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
592}
593
594static inline int rds_message_verify_checksum(const struct rds_header *hdr)
595{
596 return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
597}
598
599
600
601int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
602 gfp_t gfp);
603int rds_page_copy_user(struct page *page, unsigned long offset,
604 void __user *ptr, unsigned long bytes,
605 int to_user);
606#define rds_page_copy_to_user(page, offset, ptr, bytes) \
607 rds_page_copy_user(page, offset, ptr, bytes, 1)
608#define rds_page_copy_from_user(page, offset, ptr, bytes) \
609 rds_page_copy_user(page, offset, ptr, bytes, 0)
610void rds_page_exit(void);
611
612
613void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
614 __be32 saddr);
615void rds_inc_addref(struct rds_incoming *inc);
616void rds_inc_put(struct rds_incoming *inc);
617void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
618 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
619int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
620 size_t size, int msg_flags);
621void rds_clear_recv_queue(struct rds_sock *rs);
622int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
623void rds_inc_info_copy(struct rds_incoming *inc,
624 struct rds_info_iterator *iter,
625 __be32 saddr, __be32 daddr, int flip);
626
627
628int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
629 size_t payload_len);
630void rds_send_reset(struct rds_connection *conn);
631int rds_send_xmit(struct rds_connection *conn);
632struct sockaddr_in;
633void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
634typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
635void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
636 is_acked_func is_acked);
637int rds_send_acked_before(struct rds_connection *conn, u64 seq);
638void rds_send_remove_from_sock(struct list_head *messages, int status);
639int rds_send_pong(struct rds_connection *conn, __be16 dport);
640struct rds_message *rds_send_get_message(struct rds_connection *,
641 struct rds_rdma_op *);
642
643
644void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
645
646
647DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
648#define rds_stats_inc_which(which, member) do { \
649 per_cpu(which, get_cpu()).member++; \
650 put_cpu(); \
651} while (0)
652#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
653#define rds_stats_add_which(which, member, count) do { \
654 per_cpu(which, get_cpu()).member += count; \
655 put_cpu(); \
656} while (0)
657#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
658int __init rds_stats_init(void);
659void rds_stats_exit(void);
660void rds_stats_info_copy(struct rds_info_iterator *iter,
661 uint64_t *values, const char *const *names,
662 size_t nr);
663
664
665int __init rds_sysctl_init(void);
666void rds_sysctl_exit(void);
667extern unsigned long rds_sysctl_sndbuf_min;
668extern unsigned long rds_sysctl_sndbuf_default;
669extern unsigned long rds_sysctl_sndbuf_max;
670extern unsigned long rds_sysctl_reconnect_min_jiffies;
671extern unsigned long rds_sysctl_reconnect_max_jiffies;
672extern unsigned int rds_sysctl_max_unacked_packets;
673extern unsigned int rds_sysctl_max_unacked_bytes;
674extern unsigned int rds_sysctl_ping_enable;
675extern unsigned long rds_sysctl_trace_flags;
676extern unsigned int rds_sysctl_trace_level;
677
678
679int __init rds_threads_init(void);
680void rds_threads_exit(void);
681extern struct workqueue_struct *rds_wq;
682void rds_connect_worker(struct work_struct *);
683void rds_shutdown_worker(struct work_struct *);
684void rds_send_worker(struct work_struct *);
685void rds_recv_worker(struct work_struct *);
686void rds_connect_complete(struct rds_connection *conn);
687
688
689int rds_trans_register(struct rds_transport *trans);
690void rds_trans_unregister(struct rds_transport *trans);
691struct rds_transport *rds_trans_get_preferred(__be32 addr);
692unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
693 unsigned int avail);
694int __init rds_trans_init(void);
695void rds_trans_exit(void);
696
697#endif
698